1/*
2 * Copyright 2010-2011, Ingo Weinhold, ingo_weinhold@gmx.de.
 * Copyright 2002-2010, Axel Dörfler, axeld@pinc-software.de.
4 * Distributed under the terms of the MIT License.
5 *
6 * Copyright 2001-2002, Travis Geiselbrecht. All rights reserved.
7 * Distributed under the terms of the NewOS License.
8 */
9
10
11#include <string.h>
12#include <stdlib.h>
13
14#include <algorithm>
15
16#include <KernelExport.h>
17#include <OS.h>
18
19#include <AutoDeleter.h>
20
21#include <arch/cpu.h>
22#include <arch/vm_translation_map.h>
23#include <block_cache.h>
24#include <boot/kernel_args.h>
25#include <condition_variable.h>
26#include <elf.h>
27#include <heap.h>
28#include <kernel.h>
29#include <low_resource_manager.h>
30#include <thread.h>
31#include <tracing.h>
32#include <util/AutoLock.h>
33#include <vfs.h>
34#include <vm/vm.h>
35#include <vm/vm_priv.h>
36#include <vm/vm_page.h>
37#include <vm/VMAddressSpace.h>
38#include <vm/VMArea.h>
39#include <vm/VMCache.h>
40
41#include "IORequest.h"
42#include "PageCacheLocker.h"
43#include "VMAnonymousCache.h"
44#include "VMPageQueue.h"
45
46
47//#define TRACE_VM_PAGE
48#ifdef TRACE_VM_PAGE
49#	define TRACE(x) dprintf x
50#else
51#	define TRACE(x) ;
52#endif
53
54//#define TRACE_VM_DAEMONS
55#ifdef TRACE_VM_DAEMONS
56#define TRACE_DAEMON(x...) dprintf(x)
57#else
58#define TRACE_DAEMON(x...) do {} while (false)
59#endif
60
61//#define TRACK_PAGE_USAGE_STATS	1
62
63#define PAGE_ASSERT(page, condition)	\
64	ASSERT_PRINT((condition), "page: %p", (page))
65
66#define SCRUB_SIZE 32
67	// this many pages will be cleared at once in the page scrubber thread
68
69#define MAX_PAGE_WRITER_IO_PRIORITY				B_URGENT_DISPLAY_PRIORITY
70	// maximum I/O priority of the page writer
71#define MAX_PAGE_WRITER_IO_PRIORITY_THRESHOLD	10000
72	// the maximum I/O priority shall be reached when this many pages need to
73	// be written
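// In other words, the page writer is expected to scale its I/O priority
// roughly linearly with its backlog, along the lines of this sketch (not the
// literal implementation):
//   priority = min(MAX_PAGE_WRITER_IO_PRIORITY,
//       MAX_PAGE_WRITER_IO_PRIORITY * pagesToWrite
//           / MAX_PAGE_WRITER_IO_PRIORITY_THRESHOLD);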
74
75
// The page reserve that an allocation of a given priority must not touch.
77static const size_t kPageReserveForPriority[] = {
78	VM_PAGE_RESERVE_USER,		// user
79	VM_PAGE_RESERVE_SYSTEM,		// system
80	0							// VIP
81};
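// The array is indexed by the VM priority passed to the reservation functions
// (VM_PRIORITY_USER, VM_PRIORITY_SYSTEM, VM_PRIORITY_VIP): a user reservation
// must leave VM_PAGE_RESERVE_USER pages untouched, a system reservation
// VM_PAGE_RESERVE_SYSTEM, and a VIP reservation may drain the reserve
// entirely.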
82
83// Minimum number of free pages the page daemon will try to achieve.
84static uint32 sFreePagesTarget;
85static uint32 sFreeOrCachedPagesTarget;
86static uint32 sInactivePagesTarget;
87
88// Wait interval between page daemon runs.
89static const bigtime_t kIdleScanWaitInterval = 1000000LL;	// 1 sec
90static const bigtime_t kBusyScanWaitInterval = 500000LL;	// 0.5 sec
91
92// Number of idle runs after which we want to have processed the full active
93// queue.
94static const uint32 kIdleRunsForFullQueue = 20;
95
96// Maximum limit for the vm_page::usage_count.
97static const int32 kPageUsageMax = 64;
// The amount added to vm_page::usage_count when a page was accessed in a scan.
99static const int32 kPageUsageAdvance = 3;
// The amount subtracted from vm_page::usage_count when a page was not
// accessed in a scan.
101static const int32 kPageUsageDecline = 1;
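// For example, a page found accessed on every scan gains kPageUsageAdvance (3)
// per scan until it saturates at kPageUsageMax (64), while a page that is
// never accessed loses kPageUsageDecline (1) per scan.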
102
103int32 gMappedPagesCount;
104
105static VMPageQueue sPageQueues[PAGE_STATE_COUNT];
106
107static VMPageQueue& sFreePageQueue = sPageQueues[PAGE_STATE_FREE];
108static VMPageQueue& sClearPageQueue = sPageQueues[PAGE_STATE_CLEAR];
109static VMPageQueue& sModifiedPageQueue = sPageQueues[PAGE_STATE_MODIFIED];
110static VMPageQueue& sInactivePageQueue = sPageQueues[PAGE_STATE_INACTIVE];
111static VMPageQueue& sActivePageQueue = sPageQueues[PAGE_STATE_ACTIVE];
112static VMPageQueue& sCachedPageQueue = sPageQueues[PAGE_STATE_CACHED];
113
114static vm_page *sPages;
115static page_num_t sPhysicalPageOffset;
116static page_num_t sNumPages;
117static page_num_t sNonExistingPages;
118	// pages in the sPages array that aren't backed by physical memory
119static uint64 sIgnoredPages;
120	// pages of physical memory ignored by the boot loader (and thus not
121	// available here)
122static int32 sUnreservedFreePages;
123static int32 sUnsatisfiedPageReservations;
124static int32 sModifiedTemporaryPages;
125
126static ConditionVariable sFreePageCondition;
127static mutex sPageDeficitLock = MUTEX_INITIALIZER("page deficit");
128
129// This lock must be used whenever the free or clear page queues are changed.
// If you need to work on both queues at the same time, you need to hold a
// write lock; otherwise a read lock suffices (each queue still has a spinlock
// to guard against concurrent changes).
133static rw_lock sFreePageQueuesLock
134	= RW_LOCK_INITIALIZER("free/clear page queues");
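// For instance, free_page() below pushes a page onto only one of the two
// queues and thus takes a read lock, while mark_page_range_in_use() may remove
// pages from either queue and therefore takes a write lock.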
135
136#ifdef TRACK_PAGE_USAGE_STATS
137static page_num_t sPageUsageArrays[512];
138static page_num_t* sPageUsage = sPageUsageArrays;
139static page_num_t sPageUsagePageCount;
140static page_num_t* sNextPageUsage = sPageUsageArrays + 256;
141static page_num_t sNextPageUsagePageCount;
142#endif
143
144
145#if VM_PAGE_ALLOCATION_TRACKING_AVAILABLE
146
147struct caller_info {
148	addr_t		caller;
149	size_t		count;
150};
151
152static const int32 kCallerInfoTableSize = 1024;
153static caller_info sCallerInfoTable[kCallerInfoTableSize];
154static int32 sCallerInfoCount = 0;
155
156static caller_info* get_caller_info(addr_t caller);
157
158
159RANGE_MARKER_FUNCTION_PROTOTYPES(vm_page)
160
161static const addr_t kVMPageCodeAddressRange[] = {
162	RANGE_MARKER_FUNCTION_ADDRESS_RANGE(vm_page)
163};
164
165#endif
166
167
168RANGE_MARKER_FUNCTION_BEGIN(vm_page)
169
170
171struct page_stats {
172	int32	totalFreePages;
173	int32	unsatisfiedReservations;
174	int32	cachedPages;
175};
176
177
178struct PageReservationWaiter
179		: public DoublyLinkedListLinkImpl<PageReservationWaiter> {
180	Thread*	thread;
181	uint32	dontTouch;		// reserve not to touch
182	uint32	missing;		// pages missing for the reservation
183	int32	threadPriority;
184
185	bool operator<(const PageReservationWaiter& other) const
186	{
187		// Implies an order by descending VM priority (ascending dontTouch)
188		// and (secondarily) descending thread priority.
189		if (dontTouch != other.dontTouch)
190			return dontTouch < other.dontTouch;
191		return threadPriority > other.threadPriority;
192	}
193};
194
195typedef DoublyLinkedList<PageReservationWaiter> PageReservationWaiterList;
196static PageReservationWaiterList sPageReservationWaiters;
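// The waiter list is meant to be kept sorted by the operator above, so that
// wake_up_page_reservation_waiters() can pop waiters off the head and serve
// the most privileged reservations (smallest dontTouch, then highest thread
// priority) first.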
197
198
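// A small latching condition used by the page daemon and the page writer:
// WakeUp() sets fActivated, so a wake-up that arrives before the next Wait()
// is not lost. A consumer loop typically looks roughly like this (sketch):
//
//   while (true) {
//       condition.Wait(kIdleScanWaitInterval, false);
//           // returns immediately if WakeUp() was called in the meantime
//       // ... perform one pass of work ...
//   }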
199struct DaemonCondition {
200	void Init(const char* name)
201	{
202		mutex_init(&fLock, "daemon condition");
203		fCondition.Init(this, name);
204		fActivated = false;
205	}
206
207	bool Lock()
208	{
209		return mutex_lock(&fLock) == B_OK;
210	}
211
212	void Unlock()
213	{
214		mutex_unlock(&fLock);
215	}
216
217	bool Wait(bigtime_t timeout, bool clearActivated)
218	{
219		MutexLocker locker(fLock);
220		if (clearActivated)
221			fActivated = false;
222		else if (fActivated)
223			return true;
224
225		ConditionVariableEntry entry;
226		fCondition.Add(&entry);
227
228		locker.Unlock();
229
230		return entry.Wait(B_RELATIVE_TIMEOUT, timeout) == B_OK;
231	}
232
233	void WakeUp()
234	{
235		if (fActivated)
236			return;
237
238		MutexLocker locker(fLock);
239		fActivated = true;
240		fCondition.NotifyOne();
241	}
242
243	void ClearActivated()
244	{
245		MutexLocker locker(fLock);
246		fActivated = false;
247	}
248
249private:
250	mutex				fLock;
251	ConditionVariable	fCondition;
252	bool				fActivated;
253};
254
255
256static DaemonCondition sPageWriterCondition;
257static DaemonCondition sPageDaemonCondition;
258
259
260#if PAGE_ALLOCATION_TRACING
261
262namespace PageAllocationTracing {
263
264class ReservePages : public AbstractTraceEntry {
265public:
266	ReservePages(uint32 count)
267		:
268		fCount(count)
269	{
270		Initialized();
271	}
272
273	virtual void AddDump(TraceOutput& out)
274	{
275		out.Print("page reserve:   %" B_PRIu32, fCount);
276	}
277
278private:
279	uint32		fCount;
280};
281
282
283class UnreservePages : public AbstractTraceEntry {
284public:
285	UnreservePages(uint32 count)
286		:
287		fCount(count)
288	{
289		Initialized();
290	}
291
292	virtual void AddDump(TraceOutput& out)
293	{
		out.Print("page unreserve: %" B_PRIu32, fCount);
295	}
296
297private:
298	uint32		fCount;
299};
300
301
302class AllocatePage
303	: public TRACE_ENTRY_SELECTOR(PAGE_ALLOCATION_TRACING_STACK_TRACE) {
304public:
305	AllocatePage(page_num_t pageNumber)
306		:
307		TraceEntryBase(PAGE_ALLOCATION_TRACING_STACK_TRACE, 0, true),
308		fPageNumber(pageNumber)
309	{
310		Initialized();
311	}
312
313	virtual void AddDump(TraceOutput& out)
314	{
315		out.Print("page alloc: %#" B_PRIxPHYSADDR, fPageNumber);
316	}
317
318private:
319	page_num_t	fPageNumber;
320};
321
322
323class AllocatePageRun
324	: public TRACE_ENTRY_SELECTOR(PAGE_ALLOCATION_TRACING_STACK_TRACE) {
325public:
326	AllocatePageRun(page_num_t startPage, uint32 length)
327		:
328		TraceEntryBase(PAGE_ALLOCATION_TRACING_STACK_TRACE, 0, true),
329		fStartPage(startPage),
330		fLength(length)
331	{
332		Initialized();
333	}
334
335	virtual void AddDump(TraceOutput& out)
336	{
337		out.Print("page alloc run: start %#" B_PRIxPHYSADDR " length: %"
338			B_PRIu32, fStartPage, fLength);
339	}
340
341private:
342	page_num_t	fStartPage;
343	uint32		fLength;
344};
345
346
347class FreePage
348	: public TRACE_ENTRY_SELECTOR(PAGE_ALLOCATION_TRACING_STACK_TRACE) {
349public:
350	FreePage(page_num_t pageNumber)
351		:
352		TraceEntryBase(PAGE_ALLOCATION_TRACING_STACK_TRACE, 0, true),
353		fPageNumber(pageNumber)
354	{
355		Initialized();
356	}
357
358	virtual void AddDump(TraceOutput& out)
359	{
360		out.Print("page free: %#" B_PRIxPHYSADDR, fPageNumber);
361	}
362
363private:
364	page_num_t	fPageNumber;
365};
366
367
368class ScrubbingPages : public AbstractTraceEntry {
369public:
370	ScrubbingPages(uint32 count)
371		:
372		fCount(count)
373	{
374		Initialized();
375	}
376
377	virtual void AddDump(TraceOutput& out)
378	{
379		out.Print("page scrubbing: %" B_PRId32, fCount);
380	}
381
382private:
383	uint32		fCount;
384};
385
386
387class ScrubbedPages : public AbstractTraceEntry {
388public:
389	ScrubbedPages(uint32 count)
390		:
391		fCount(count)
392	{
393		Initialized();
394	}
395
396	virtual void AddDump(TraceOutput& out)
397	{
398		out.Print("page scrubbed:  %" B_PRId32, fCount);
399	}
400
401private:
402	uint32		fCount;
403};
404
405
406class StolenPage : public AbstractTraceEntry {
407public:
408	StolenPage()
409	{
410		Initialized();
411	}
412
413	virtual void AddDump(TraceOutput& out)
414	{
415		out.Print("page stolen");
416	}
417};
418
419}	// namespace PageAllocationTracing
420
421#	define TA(x)	new(std::nothrow) PageAllocationTracing::x
422
423#else
424#	define TA(x)
425#endif	// PAGE_ALLOCATION_TRACING
426
427
428#if PAGE_DAEMON_TRACING
429
430namespace PageDaemonTracing {
431
432class ActivatePage : public AbstractTraceEntry {
433	public:
434		ActivatePage(vm_page* page)
435			:
436			fCache(page->cache),
437			fPage(page)
438		{
439			Initialized();
440		}
441
442		virtual void AddDump(TraceOutput& out)
443		{
444			out.Print("page activated:   %p, cache: %p", fPage, fCache);
445		}
446
447	private:
448		VMCache*	fCache;
449		vm_page*	fPage;
450};
451
452
453class DeactivatePage : public AbstractTraceEntry {
454	public:
455		DeactivatePage(vm_page* page)
456			:
457			fCache(page->cache),
458			fPage(page)
459		{
460			Initialized();
461		}
462
463		virtual void AddDump(TraceOutput& out)
464		{
465			out.Print("page deactivated: %p, cache: %p", fPage, fCache);
466		}
467
468	private:
469		VMCache*	fCache;
470		vm_page*	fPage;
471};
472
473
474class FreedPageSwap : public AbstractTraceEntry {
475	public:
476		FreedPageSwap(vm_page* page)
477			:
478			fCache(page->cache),
479			fPage(page)
480		{
481			Initialized();
482		}
483
484		virtual void AddDump(TraceOutput& out)
485		{
486			out.Print("page swap freed:  %p, cache: %p", fPage, fCache);
487		}
488
489	private:
490		VMCache*	fCache;
491		vm_page*	fPage;
492};
493
494}	// namespace PageDaemonTracing
495
496#	define TD(x)	new(std::nothrow) PageDaemonTracing::x
497
498#else
499#	define TD(x)
500#endif	// PAGE_DAEMON_TRACING
501
502
503#if PAGE_WRITER_TRACING
504
505namespace PageWriterTracing {
506
507class WritePage : public AbstractTraceEntry {
508	public:
509		WritePage(vm_page* page)
510			:
511			fCache(page->Cache()),
512			fPage(page)
513		{
514			Initialized();
515		}
516
517		virtual void AddDump(TraceOutput& out)
518		{
519			out.Print("page write: %p, cache: %p", fPage, fCache);
520		}
521
522	private:
523		VMCache*	fCache;
524		vm_page*	fPage;
525};
526
527}	// namespace PageWriterTracing
528
529#	define TPW(x)	new(std::nothrow) PageWriterTracing::x
530
531#else
532#	define TPW(x)
533#endif	// PAGE_WRITER_TRACING
534
535
536#if PAGE_STATE_TRACING
537
538namespace PageStateTracing {
539
540class SetPageState : public AbstractTraceEntry {
541	public:
542		SetPageState(vm_page* page, uint8 newState)
543			:
544			fPage(page),
545			fOldState(page->State()),
546			fNewState(newState),
547			fBusy(page->busy),
548			fWired(page->WiredCount() > 0),
549			fMapped(!page->mappings.IsEmpty()),
550			fAccessed(page->accessed),
551			fModified(page->modified)
552		{
553#if PAGE_STATE_TRACING_STACK_TRACE
554			fStackTrace = capture_tracing_stack_trace(
555				PAGE_STATE_TRACING_STACK_TRACE, 0, true);
556				// Don't capture userland stack trace to avoid potential
557				// deadlocks.
558#endif
559			Initialized();
560		}
561
562#if PAGE_STATE_TRACING_STACK_TRACE
563		virtual void DumpStackTrace(TraceOutput& out)
564		{
565			out.PrintStackTrace(fStackTrace);
566		}
567#endif
568
569		virtual void AddDump(TraceOutput& out)
570		{
571			out.Print("page set state: %p (%c%c%c%c%c): %s -> %s", fPage,
572				fBusy ? 'b' : '-',
573				fWired ? 'w' : '-',
574				fMapped ? 'm' : '-',
575				fAccessed ? 'a' : '-',
576				fModified ? 'm' : '-',
577				page_state_to_string(fOldState),
578				page_state_to_string(fNewState));
579		}
580
581	private:
582		vm_page*	fPage;
583#if PAGE_STATE_TRACING_STACK_TRACE
584		tracing_stack_trace* fStackTrace;
585#endif
586		uint8		fOldState;
587		uint8		fNewState;
588		bool		fBusy : 1;
589		bool		fWired : 1;
590		bool		fMapped : 1;
591		bool		fAccessed : 1;
592		bool		fModified : 1;
593};
594
595}	// namespace PageStateTracing
596
597#	define TPS(x)	new(std::nothrow) PageStateTracing::x
598
599#else
600#	define TPS(x)
601#endif	// PAGE_STATE_TRACING
602
603
604#if VM_PAGE_ALLOCATION_TRACKING_AVAILABLE
605
606namespace BKernel {
607
608class AllocationTrackingCallback {
609public:
610	virtual						~AllocationTrackingCallback();
611
612	virtual	bool				ProcessTrackingInfo(
613									AllocationTrackingInfo* info,
614									page_num_t pageNumber) = 0;
615};
616
617}
618
619using BKernel::AllocationTrackingCallback;
620
621
622class AllocationCollectorCallback : public AllocationTrackingCallback {
623public:
624	AllocationCollectorCallback(bool resetInfos)
625		:
626		fResetInfos(resetInfos)
627	{
628	}
629
630	virtual bool ProcessTrackingInfo(AllocationTrackingInfo* info,
631		page_num_t pageNumber)
632	{
633		if (!info->IsInitialized())
634			return true;
635
636		addr_t caller = 0;
637		AbstractTraceEntryWithStackTrace* traceEntry = info->TraceEntry();
638
639		if (traceEntry != NULL && info->IsTraceEntryValid()) {
640			caller = tracing_find_caller_in_stack_trace(
641				traceEntry->StackTrace(), kVMPageCodeAddressRange, 1);
642		}
643
644		caller_info* callerInfo = get_caller_info(caller);
645		if (callerInfo == NULL) {
646			kprintf("out of space for caller infos\n");
647			return false;
648		}
649
650		callerInfo->count++;
651
652		if (fResetInfos)
653			info->Clear();
654
655		return true;
656	}
657
658private:
659	bool	fResetInfos;
660};
661
662
663class AllocationInfoPrinterCallback : public AllocationTrackingCallback {
664public:
665	AllocationInfoPrinterCallback(bool printStackTrace, page_num_t pageFilter,
666		team_id teamFilter, thread_id threadFilter)
667		:
668		fPrintStackTrace(printStackTrace),
669		fPageFilter(pageFilter),
670		fTeamFilter(teamFilter),
671		fThreadFilter(threadFilter)
672	{
673	}
674
675	virtual bool ProcessTrackingInfo(AllocationTrackingInfo* info,
676		page_num_t pageNumber)
677	{
678		if (!info->IsInitialized())
679			return true;
680
681		if (fPageFilter != 0 && pageNumber != fPageFilter)
682			return true;
683
684		AbstractTraceEntryWithStackTrace* traceEntry = info->TraceEntry();
685		if (traceEntry != NULL && !info->IsTraceEntryValid())
686			traceEntry = NULL;
687
688		if (traceEntry != NULL) {
689			if (fTeamFilter != -1 && traceEntry->TeamID() != fTeamFilter)
690				return true;
691			if (fThreadFilter != -1 && traceEntry->ThreadID() != fThreadFilter)
692				return true;
693		} else {
694			// we need the info if we have filters set
695			if (fTeamFilter != -1 || fThreadFilter != -1)
696				return true;
697		}
698
699		kprintf("page number %#" B_PRIxPHYSADDR, pageNumber);
700
701		if (traceEntry != NULL) {
702			kprintf(", team: %" B_PRId32 ", thread %" B_PRId32
703				", time %" B_PRId64 "\n", traceEntry->TeamID(),
704				traceEntry->ThreadID(), traceEntry->Time());
705
706			if (fPrintStackTrace)
707				tracing_print_stack_trace(traceEntry->StackTrace());
708		} else
709			kprintf("\n");
710
711		return true;
712	}
713
714private:
715	bool		fPrintStackTrace;
716	page_num_t	fPageFilter;
717	team_id		fTeamFilter;
718	thread_id	fThreadFilter;
719};
720
721
722class AllocationDetailPrinterCallback : public AllocationTrackingCallback {
723public:
724	AllocationDetailPrinterCallback(addr_t caller)
725		:
726		fCaller(caller)
727	{
728	}
729
730	virtual bool ProcessTrackingInfo(AllocationTrackingInfo* info,
731		page_num_t pageNumber)
732	{
733		if (!info->IsInitialized())
734			return true;
735
736		addr_t caller = 0;
737		AbstractTraceEntryWithStackTrace* traceEntry = info->TraceEntry();
738		if (traceEntry != NULL && !info->IsTraceEntryValid())
739			traceEntry = NULL;
740
741		if (traceEntry != NULL) {
742			caller = tracing_find_caller_in_stack_trace(
743				traceEntry->StackTrace(), kVMPageCodeAddressRange, 1);
744		}
745
746		if (caller != fCaller)
747			return true;
748
749		kprintf("page %#" B_PRIxPHYSADDR "\n", pageNumber);
750		if (traceEntry != NULL)
751			tracing_print_stack_trace(traceEntry->StackTrace());
752
753		return true;
754	}
755
756private:
757	addr_t	fCaller;
758};
759
760#endif	// VM_PAGE_ALLOCATION_TRACKING_AVAILABLE
761
762
763static void
764list_page(vm_page* page)
765{
766	kprintf("0x%08" B_PRIxADDR " ",
767		(addr_t)(page->physical_page_number * B_PAGE_SIZE));
768	switch (page->State()) {
769		case PAGE_STATE_ACTIVE:   kprintf("A"); break;
770		case PAGE_STATE_INACTIVE: kprintf("I"); break;
771		case PAGE_STATE_MODIFIED: kprintf("M"); break;
772		case PAGE_STATE_CACHED:   kprintf("C"); break;
773		case PAGE_STATE_FREE:     kprintf("F"); break;
774		case PAGE_STATE_CLEAR:    kprintf("L"); break;
775		case PAGE_STATE_WIRED:    kprintf("W"); break;
776		case PAGE_STATE_UNUSED:   kprintf("-"); break;
777	}
778	kprintf(" ");
779	if (page->busy)         kprintf("B"); else kprintf("-");
780	if (page->busy_writing) kprintf("W"); else kprintf("-");
781	if (page->accessed)     kprintf("A"); else kprintf("-");
782	if (page->modified)     kprintf("M"); else kprintf("-");
783	if (page->unused)       kprintf("U"); else kprintf("-");
784
785	kprintf(" usage:%3u", page->usage_count);
786	kprintf(" wired:%5u", page->WiredCount());
787
788	bool first = true;
789	vm_page_mappings::Iterator iterator = page->mappings.GetIterator();
790	vm_page_mapping* mapping;
791	while ((mapping = iterator.Next()) != NULL) {
792		if (first) {
793			kprintf(": ");
794			first = false;
795		} else
796			kprintf(", ");
797
798		kprintf("%" B_PRId32 " (%s)", mapping->area->id, mapping->area->name);
799		mapping = mapping->page_link.next;
800	}
801}
802
803
804static int
805dump_page_list(int argc, char **argv)
806{
807	kprintf("page table:\n");
808	for (page_num_t i = 0; i < sNumPages; i++) {
809		if (sPages[i].State() != PAGE_STATE_UNUSED) {
810			list_page(&sPages[i]);
811			kprintf("\n");
812		}
813	}
814	kprintf("end of page table\n");
815
816	return 0;
817}
818
819
820static int
821find_page(int argc, char **argv)
822{
823	struct vm_page *page;
824	addr_t address;
825	int32 index = 1;
826	int i;
827
828	struct {
829		const char*	name;
830		VMPageQueue*	queue;
831	} pageQueueInfos[] = {
832		{ "free",		&sFreePageQueue },
833		{ "clear",		&sClearPageQueue },
834		{ "modified",	&sModifiedPageQueue },
835		{ "active",		&sActivePageQueue },
836		{ "inactive",	&sInactivePageQueue },
837		{ "cached",		&sCachedPageQueue },
838		{ NULL, NULL }
839	};
840
841	if (argc < 2
842		|| strlen(argv[index]) <= 2
843		|| argv[index][0] != '0'
844		|| argv[index][1] != 'x') {
845		kprintf("usage: find_page <address>\n");
846		return 0;
847	}
848
849	address = strtoul(argv[index], NULL, 0);
850	page = (vm_page*)address;
851
852	for (i = 0; pageQueueInfos[i].name; i++) {
853		VMPageQueue::Iterator it = pageQueueInfos[i].queue->GetIterator();
854		while (vm_page* p = it.Next()) {
855			if (p == page) {
856				kprintf("found page %p in queue %p (%s)\n", page,
857					pageQueueInfos[i].queue, pageQueueInfos[i].name);
858				return 0;
859			}
860		}
861	}
862
863	kprintf("page %p isn't in any queue\n", page);
864
865	return 0;
866}
867
868
869const char *
870page_state_to_string(int state)
871{
872	switch(state) {
873		case PAGE_STATE_ACTIVE:
874			return "active";
875		case PAGE_STATE_INACTIVE:
876			return "inactive";
877		case PAGE_STATE_MODIFIED:
878			return "modified";
879		case PAGE_STATE_CACHED:
880			return "cached";
881		case PAGE_STATE_FREE:
882			return "free";
883		case PAGE_STATE_CLEAR:
884			return "clear";
885		case PAGE_STATE_WIRED:
886			return "wired";
887		case PAGE_STATE_UNUSED:
888			return "unused";
889		default:
890			return "unknown";
891	}
892}
893
894
895static int
896dump_page_long(int argc, char **argv)
897{
898	bool addressIsPointer = true;
899	bool physical = false;
900	bool searchMappings = false;
901	int32 index = 1;
902
903	while (index < argc) {
904		if (argv[index][0] != '-')
905			break;
906
907		if (!strcmp(argv[index], "-p")) {
908			addressIsPointer = false;
909			physical = true;
910		} else if (!strcmp(argv[index], "-v")) {
911			addressIsPointer = false;
912		} else if (!strcmp(argv[index], "-m")) {
913			searchMappings = true;
914		} else {
915			print_debugger_command_usage(argv[0]);
916			return 0;
917		}
918
919		index++;
920	}
921
922	if (index + 1 != argc) {
923		print_debugger_command_usage(argv[0]);
924		return 0;
925	}
926
927	uint64 value;
928	if (!evaluate_debug_expression(argv[index], &value, false))
929		return 0;
930
931	uint64 pageAddress = value;
932	struct vm_page* page;
933
934	if (addressIsPointer) {
935		page = (struct vm_page *)(addr_t)pageAddress;
936	} else {
937		if (!physical) {
938			VMAddressSpace *addressSpace = VMAddressSpace::Kernel();
939
940			if (debug_get_debugged_thread()->team->address_space != NULL)
941				addressSpace = debug_get_debugged_thread()->team->address_space;
942
943			uint32 flags = 0;
944			phys_addr_t physicalAddress;
945			if (addressSpace->TranslationMap()->QueryInterrupt(pageAddress,
946					&physicalAddress, &flags) != B_OK
947				|| (flags & PAGE_PRESENT) == 0) {
948				kprintf("Virtual address not mapped to a physical page in this "
949					"address space.\n");
950				return 0;
951			}
952			pageAddress = physicalAddress;
953		}
954
955		page = vm_lookup_page(pageAddress / B_PAGE_SIZE);
956	}
957
958	kprintf("PAGE: %p\n", page);
959	kprintf("queue_next,prev: %p, %p\n", page->queue_link.next,
960		page->queue_link.previous);
961	kprintf("physical_number: %#" B_PRIxPHYSADDR "\n",
962		page->physical_page_number);
963	kprintf("cache:           %p\n", page->Cache());
964	kprintf("cache_offset:    %" B_PRIuPHYSADDR "\n", page->cache_offset);
965	kprintf("cache_next:      %p\n", page->cache_next);
966	kprintf("state:           %s\n", page_state_to_string(page->State()));
967	kprintf("wired_count:     %d\n", page->WiredCount());
968	kprintf("usage_count:     %d\n", page->usage_count);
969	kprintf("busy:            %d\n", page->busy);
970	kprintf("busy_writing:    %d\n", page->busy_writing);
971	kprintf("accessed:        %d\n", page->accessed);
972	kprintf("modified:        %d\n", page->modified);
973	#if DEBUG_PAGE_QUEUE
974		kprintf("queue:           %p\n", page->queue);
975	#endif
976	#if DEBUG_PAGE_ACCESS
977		kprintf("accessor:        %" B_PRId32 "\n", page->accessing_thread);
978	#endif
979	kprintf("area mappings:\n");
980
981	vm_page_mappings::Iterator iterator = page->mappings.GetIterator();
982	vm_page_mapping *mapping;
983	while ((mapping = iterator.Next()) != NULL) {
984		kprintf("  %p (%" B_PRId32 ")\n", mapping->area, mapping->area->id);
985		mapping = mapping->page_link.next;
986	}
987
988	if (searchMappings) {
989		kprintf("all mappings:\n");
990		VMAddressSpace* addressSpace = VMAddressSpace::DebugFirst();
991		while (addressSpace != NULL) {
992			size_t pageCount = addressSpace->Size() / B_PAGE_SIZE;
993			for (addr_t address = addressSpace->Base(); pageCount != 0;
994					address += B_PAGE_SIZE, pageCount--) {
995				phys_addr_t physicalAddress;
996				uint32 flags = 0;
997				if (addressSpace->TranslationMap()->QueryInterrupt(address,
998						&physicalAddress, &flags) == B_OK
999					&& (flags & PAGE_PRESENT) != 0
1000					&& physicalAddress / B_PAGE_SIZE
1001						== page->physical_page_number) {
1002					VMArea* area = addressSpace->LookupArea(address);
1003					kprintf("  aspace %" B_PRId32 ", area %" B_PRId32 ": %#"
1004						B_PRIxADDR " (%c%c%s%s)\n", addressSpace->ID(),
1005						area != NULL ? area->id : -1, address,
1006						(flags & B_KERNEL_READ_AREA) != 0 ? 'r' : '-',
1007						(flags & B_KERNEL_WRITE_AREA) != 0 ? 'w' : '-',
1008						(flags & PAGE_MODIFIED) != 0 ? " modified" : "",
1009						(flags & PAGE_ACCESSED) != 0 ? " accessed" : "");
1010				}
1011			}
1012			addressSpace = VMAddressSpace::DebugNext(addressSpace);
1013		}
1014	}
1015
1016	set_debug_variable("_cache", (addr_t)page->Cache());
1017	#if DEBUG_PAGE_ACCESS
1018		set_debug_variable("_accessor", page->accessing_thread);
1019	#endif
1020
1021	return 0;
1022}
1023
1024
1025static int
1026dump_page_queue(int argc, char **argv)
1027{
1028	struct VMPageQueue *queue;
1029
1030	if (argc < 2) {
1031		kprintf("usage: page_queue <address/name> [list]\n");
1032		return 0;
1033	}
1034
1035	if (strlen(argv[1]) >= 2 && argv[1][0] == '0' && argv[1][1] == 'x')
1036		queue = (VMPageQueue*)strtoul(argv[1], NULL, 16);
1037	else if (!strcmp(argv[1], "free"))
1038		queue = &sFreePageQueue;
1039	else if (!strcmp(argv[1], "clear"))
1040		queue = &sClearPageQueue;
1041	else if (!strcmp(argv[1], "modified"))
1042		queue = &sModifiedPageQueue;
1043	else if (!strcmp(argv[1], "active"))
1044		queue = &sActivePageQueue;
1045	else if (!strcmp(argv[1], "inactive"))
1046		queue = &sInactivePageQueue;
1047	else if (!strcmp(argv[1], "cached"))
1048		queue = &sCachedPageQueue;
1049	else {
1050		kprintf("page_queue: unknown queue \"%s\".\n", argv[1]);
1051		return 0;
1052	}
1053
1054	kprintf("queue = %p, queue->head = %p, queue->tail = %p, queue->count = %"
1055		B_PRIuPHYSADDR "\n", queue, queue->Head(), queue->Tail(),
1056		queue->Count());
1057
1058	if (argc == 3) {
1059		struct vm_page *page = queue->Head();
1060
1061		kprintf("page        cache       type       state  wired  usage\n");
1062		for (page_num_t i = 0; page; i++, page = queue->Next(page)) {
1063			kprintf("%p  %p  %-7s %8s  %5d  %5d\n", page, page->Cache(),
1064				vm_cache_type_to_string(page->Cache()->type),
1065				page_state_to_string(page->State()),
1066				page->WiredCount(), page->usage_count);
1067		}
1068	}
1069	return 0;
1070}
1071
1072
1073static int
1074dump_page_stats(int argc, char **argv)
1075{
1076	page_num_t swappableModified = 0;
1077	page_num_t swappableModifiedInactive = 0;
1078
1079	size_t counter[8];
1080	size_t busyCounter[8];
1081	memset(counter, 0, sizeof(counter));
1082	memset(busyCounter, 0, sizeof(busyCounter));
1083
1084	struct page_run {
1085		page_num_t	start;
1086		page_num_t	end;
1087
1088		page_num_t Length() const	{ return end - start; }
1089	};
1090
1091	page_run currentFreeRun = { 0, 0 };
1092	page_run currentCachedRun = { 0, 0 };
1093	page_run longestFreeRun = { 0, 0 };
1094	page_run longestCachedRun = { 0, 0 };
1095
1096	for (page_num_t i = 0; i < sNumPages; i++) {
1097		if (sPages[i].State() > 7) {
1098			panic("page %" B_PRIuPHYSADDR " at %p has invalid state!\n", i,
1099				&sPages[i]);
1100		}
1101
1102		uint32 pageState = sPages[i].State();
1103
1104		counter[pageState]++;
1105		if (sPages[i].busy)
1106			busyCounter[pageState]++;
1107
1108		if (pageState == PAGE_STATE_MODIFIED
1109			&& sPages[i].Cache() != NULL
1110			&& sPages[i].Cache()->temporary && sPages[i].WiredCount() == 0) {
1111			swappableModified++;
1112			if (sPages[i].usage_count == 0)
1113				swappableModifiedInactive++;
1114		}
1115
1116		// track free and cached pages runs
1117		if (pageState == PAGE_STATE_FREE || pageState == PAGE_STATE_CLEAR) {
1118			currentFreeRun.end = i + 1;
1119			currentCachedRun.end = i + 1;
1120		} else {
1121			if (currentFreeRun.Length() > longestFreeRun.Length())
1122				longestFreeRun = currentFreeRun;
1123			currentFreeRun.start = currentFreeRun.end = i + 1;
1124
1125			if (pageState == PAGE_STATE_CACHED) {
1126				currentCachedRun.end = i + 1;
1127			} else {
1128				if (currentCachedRun.Length() > longestCachedRun.Length())
1129					longestCachedRun = currentCachedRun;
1130				currentCachedRun.start = currentCachedRun.end = i + 1;
1131			}
1132		}
1133	}
1134
1135	kprintf("page stats:\n");
1136	kprintf("total: %" B_PRIuPHYSADDR "\n", sNumPages);
1137
1138	kprintf("active: %" B_PRIuSIZE " (busy: %" B_PRIuSIZE ")\n",
1139		counter[PAGE_STATE_ACTIVE], busyCounter[PAGE_STATE_ACTIVE]);
1140	kprintf("inactive: %" B_PRIuSIZE " (busy: %" B_PRIuSIZE ")\n",
1141		counter[PAGE_STATE_INACTIVE], busyCounter[PAGE_STATE_INACTIVE]);
1142	kprintf("cached: %" B_PRIuSIZE " (busy: %" B_PRIuSIZE ")\n",
1143		counter[PAGE_STATE_CACHED], busyCounter[PAGE_STATE_CACHED]);
1144	kprintf("unused: %" B_PRIuSIZE " (busy: %" B_PRIuSIZE ")\n",
1145		counter[PAGE_STATE_UNUSED], busyCounter[PAGE_STATE_UNUSED]);
1146	kprintf("wired: %" B_PRIuSIZE " (busy: %" B_PRIuSIZE ")\n",
1147		counter[PAGE_STATE_WIRED], busyCounter[PAGE_STATE_WIRED]);
1148	kprintf("modified: %" B_PRIuSIZE " (busy: %" B_PRIuSIZE ")\n",
1149		counter[PAGE_STATE_MODIFIED], busyCounter[PAGE_STATE_MODIFIED]);
1150	kprintf("free: %" B_PRIuSIZE "\n", counter[PAGE_STATE_FREE]);
1151	kprintf("clear: %" B_PRIuSIZE "\n", counter[PAGE_STATE_CLEAR]);
1152
1153	kprintf("unreserved free pages: %" B_PRId32 "\n", sUnreservedFreePages);
1154	kprintf("unsatisfied page reservations: %" B_PRId32 "\n",
1155		sUnsatisfiedPageReservations);
1156	kprintf("mapped pages: %" B_PRId32 "\n", gMappedPagesCount);
1157	kprintf("longest free pages run: %" B_PRIuPHYSADDR " pages (at %"
1158		B_PRIuPHYSADDR ")\n", longestFreeRun.Length(),
1159		sPages[longestFreeRun.start].physical_page_number);
1160	kprintf("longest free/cached pages run: %" B_PRIuPHYSADDR " pages (at %"
1161		B_PRIuPHYSADDR ")\n", longestCachedRun.Length(),
1162		sPages[longestCachedRun.start].physical_page_number);
1163
1164	kprintf("waiting threads:\n");
1165	for (PageReservationWaiterList::Iterator it
1166			= sPageReservationWaiters.GetIterator();
1167		PageReservationWaiter* waiter = it.Next();) {
1168		kprintf("  %6" B_PRId32 ": missing: %6" B_PRIu32
1169			", don't touch: %6" B_PRIu32 "\n", waiter->thread->id,
1170			waiter->missing, waiter->dontTouch);
1171	}
1172
1173	kprintf("\nfree queue: %p, count = %" B_PRIuPHYSADDR "\n", &sFreePageQueue,
1174		sFreePageQueue.Count());
1175	kprintf("clear queue: %p, count = %" B_PRIuPHYSADDR "\n", &sClearPageQueue,
1176		sClearPageQueue.Count());
1177	kprintf("modified queue: %p, count = %" B_PRIuPHYSADDR " (%" B_PRId32
		" temporary, %" B_PRIuPHYSADDR " swappable, inactive: %"
1179		B_PRIuPHYSADDR ")\n", &sModifiedPageQueue, sModifiedPageQueue.Count(),
1180		sModifiedTemporaryPages, swappableModified, swappableModifiedInactive);
1181	kprintf("active queue: %p, count = %" B_PRIuPHYSADDR "\n",
1182		&sActivePageQueue, sActivePageQueue.Count());
1183	kprintf("inactive queue: %p, count = %" B_PRIuPHYSADDR "\n",
1184		&sInactivePageQueue, sInactivePageQueue.Count());
1185	kprintf("cached queue: %p, count = %" B_PRIuPHYSADDR "\n",
1186		&sCachedPageQueue, sCachedPageQueue.Count());
1187	return 0;
1188}
1189
1190
1191#if VM_PAGE_ALLOCATION_TRACKING_AVAILABLE
1192
1193static caller_info*
1194get_caller_info(addr_t caller)
1195{
1196	// find the caller info
1197	for (int32 i = 0; i < sCallerInfoCount; i++) {
1198		if (caller == sCallerInfoTable[i].caller)
1199			return &sCallerInfoTable[i];
1200	}
1201
1202	// not found, add a new entry, if there are free slots
1203	if (sCallerInfoCount >= kCallerInfoTableSize)
1204		return NULL;
1205
1206	caller_info* info = &sCallerInfoTable[sCallerInfoCount++];
1207	info->caller = caller;
1208	info->count = 0;
1209
1210	return info;
1211}
1212
1213
1214static int
1215caller_info_compare_count(const void* _a, const void* _b)
1216{
1217	const caller_info* a = (const caller_info*)_a;
1218	const caller_info* b = (const caller_info*)_b;
1219	return (int)(b->count - a->count);
1220}
1221
1222
1223static int
1224dump_page_allocations_per_caller(int argc, char** argv)
1225{
1226	bool resetAllocationInfos = false;
1227	bool printDetails = false;
1228	addr_t caller = 0;
1229
1230	for (int32 i = 1; i < argc; i++) {
1231		if (strcmp(argv[i], "-d") == 0) {
1232			uint64 callerAddress;
1233			if (++i >= argc
1234				|| !evaluate_debug_expression(argv[i], &callerAddress, true)) {
1235				print_debugger_command_usage(argv[0]);
1236				return 0;
1237			}
1238
1239			caller = callerAddress;
1240			printDetails = true;
1241		} else if (strcmp(argv[i], "-r") == 0) {
1242			resetAllocationInfos = true;
1243		} else {
1244			print_debugger_command_usage(argv[0]);
1245			return 0;
1246		}
1247	}
1248
1249	sCallerInfoCount = 0;
1250
1251	AllocationCollectorCallback collectorCallback(resetAllocationInfos);
1252	AllocationDetailPrinterCallback detailsCallback(caller);
1253	AllocationTrackingCallback& callback = printDetails
1254		? (AllocationTrackingCallback&)detailsCallback
1255		: (AllocationTrackingCallback&)collectorCallback;
1256
1257	for (page_num_t i = 0; i < sNumPages; i++)
1258		callback.ProcessTrackingInfo(&sPages[i].allocation_tracking_info, i);
1259
1260	if (printDetails)
1261		return 0;
1262
1263	// sort the array
1264	qsort(sCallerInfoTable, sCallerInfoCount, sizeof(caller_info),
1265		&caller_info_compare_count);
1266
1267	kprintf("%" B_PRId32 " different callers\n\n", sCallerInfoCount);
1268
1269	size_t totalAllocationCount = 0;
1270
1271	kprintf("     count      caller\n");
1272	kprintf("----------------------------------\n");
1273	for (int32 i = 0; i < sCallerInfoCount; i++) {
1274		caller_info& info = sCallerInfoTable[i];
1275		kprintf("%10" B_PRIuSIZE "  %p", info.count, (void*)info.caller);
1276
1277		const char* symbol;
1278		const char* imageName;
1279		bool exactMatch;
1280		addr_t baseAddress;
1281
1282		if (elf_debug_lookup_symbol_address(info.caller, &baseAddress, &symbol,
1283				&imageName, &exactMatch) == B_OK) {
1284			kprintf("  %s + %#" B_PRIxADDR " (%s)%s\n", symbol,
1285				info.caller - baseAddress, imageName,
1286				exactMatch ? "" : " (nearest)");
1287		} else
1288			kprintf("\n");
1289
1290		totalAllocationCount += info.count;
1291	}
1292
1293	kprintf("\ntotal page allocations: %" B_PRIuSIZE "\n",
1294		totalAllocationCount);
1295
1296	return 0;
1297}
1298
1299
1300static int
1301dump_page_allocation_infos(int argc, char** argv)
1302{
1303	page_num_t pageFilter = 0;
1304	team_id teamFilter = -1;
1305	thread_id threadFilter = -1;
1306	bool printStackTraces = false;
1307
1308	for (int32 i = 1; i < argc; i++) {
1309		if (strcmp(argv[i], "--stacktrace") == 0)
1310			printStackTraces = true;
1311		else if (strcmp(argv[i], "-p") == 0) {
1312			uint64 pageNumber;
1313			if (++i >= argc
1314				|| !evaluate_debug_expression(argv[i], &pageNumber, true)) {
1315				print_debugger_command_usage(argv[0]);
1316				return 0;
1317			}
1318
1319			pageFilter = pageNumber;
1320		} else if (strcmp(argv[i], "--team") == 0) {
1321			uint64 team;
1322			if (++i >= argc
1323				|| !evaluate_debug_expression(argv[i], &team, true)) {
1324				print_debugger_command_usage(argv[0]);
1325				return 0;
1326			}
1327
1328			teamFilter = team;
1329		} else if (strcmp(argv[i], "--thread") == 0) {
1330			uint64 thread;
1331			if (++i >= argc
1332				|| !evaluate_debug_expression(argv[i], &thread, true)) {
1333				print_debugger_command_usage(argv[0]);
1334				return 0;
1335			}
1336
1337			threadFilter = thread;
1338		} else {
1339			print_debugger_command_usage(argv[0]);
1340			return 0;
1341		}
1342	}
1343
1344	AllocationInfoPrinterCallback callback(printStackTraces, pageFilter,
1345		teamFilter, threadFilter);
1346
1347	for (page_num_t i = 0; i < sNumPages; i++)
1348		callback.ProcessTrackingInfo(&sPages[i].allocation_tracking_info, i);
1349
1350	return 0;
1351}
1352
1353#endif	// VM_PAGE_ALLOCATION_TRACKING_AVAILABLE
1354
1355
1356#ifdef TRACK_PAGE_USAGE_STATS
1357
1358static void
1359track_page_usage(vm_page* page)
1360{
1361	if (page->WiredCount() == 0) {
1362		sNextPageUsage[(int32)page->usage_count + 128]++;
1363		sNextPageUsagePageCount++;
1364	}
1365}
1366
1367
1368static void
1369update_page_usage_stats()
1370{
1371	std::swap(sPageUsage, sNextPageUsage);
1372	sPageUsagePageCount = sNextPageUsagePageCount;
1373
1374	memset(sNextPageUsage, 0, sizeof(page_num_t) * 256);
1375	sNextPageUsagePageCount = 0;
1376
1377	// compute average
1378	if (sPageUsagePageCount > 0) {
1379		int64 sum = 0;
1380		for (int32 i = 0; i < 256; i++)
1381			sum += (int64)sPageUsage[i] * (i - 128);
1382
1383		TRACE_DAEMON("average page usage: %f (%lu pages)\n",
1384			(float)sum / sPageUsagePageCount, sPageUsagePageCount);
1385	}
1386}
1387
1388
1389static int
1390dump_page_usage_stats(int argc, char** argv)
1391{
1392	kprintf("distribution of page usage counts (%lu pages):",
1393		sPageUsagePageCount);
1394
1395	int64 sum = 0;
1396	for (int32 i = 0; i < 256; i++) {
1397		if (i % 8 == 0)
1398			kprintf("\n%4ld:", i - 128);
1399
1400		int64 count = sPageUsage[i];
1401		sum += count * (i - 128);
1402
1403		kprintf("  %9llu", count);
1404	}
1405
1406	kprintf("\n\n");
1407
1408	kprintf("average usage count: %f\n",
1409		sPageUsagePageCount > 0 ? (float)sum / sPageUsagePageCount : 0);
1410
1411	return 0;
1412}
1413
1414#endif	// TRACK_PAGE_USAGE_STATS
1415
1416
1417// #pragma mark - vm_page
1418
1419
1420inline void
1421vm_page::InitState(uint8 newState)
1422{
1423	state = newState;
1424}
1425
1426
1427inline void
1428vm_page::SetState(uint8 newState)
1429{
1430	TPS(SetPageState(this, newState));
1431
1432	state = newState;
1433}
1434
1435
1436// #pragma mark -
1437
1438
1439static void
1440get_page_stats(page_stats& _pageStats)
1441{
1442	_pageStats.totalFreePages = sUnreservedFreePages;
1443	_pageStats.cachedPages = sCachedPageQueue.Count();
1444	_pageStats.unsatisfiedReservations = sUnsatisfiedPageReservations;
1445	// TODO: We don't get an actual snapshot here!
1446}
1447
1448
1449static bool
1450do_active_paging(const page_stats& pageStats)
1451{
1452	return pageStats.totalFreePages + pageStats.cachedPages
1453		< pageStats.unsatisfiedReservations
1454			+ (int32)sFreeOrCachedPagesTarget;
1455}
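// In other words, paging is considered "active" as long as the free and cached
// pages together cannot cover the outstanding reservations plus the
// free-or-cached pages target.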
1456
1457
1458/*!	Reserves as many pages as possible from \c sUnreservedFreePages up to
1459	\a count. Doesn't touch the last \a dontTouch pages of
1460	\c sUnreservedFreePages, though.
1461	\return The number of actually reserved pages.
1462*/
1463static uint32
1464reserve_some_pages(uint32 count, uint32 dontTouch)
1465{
1466	while (true) {
1467		int32 freePages = atomic_get(&sUnreservedFreePages);
1468		if (freePages <= (int32)dontTouch)
1469			return 0;
1470
1471		int32 toReserve = std::min(count, freePages - dontTouch);
1472		if (atomic_test_and_set(&sUnreservedFreePages,
1473					freePages - toReserve, freePages)
1474				== freePages) {
1475			return toReserve;
1476		}
1477
1478		// the count changed in the meantime -- retry
1479	}
1480}
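// Example use: the page scrubber below reserves up to SCRUB_SIZE pages while
// leaving the user reserve untouched:
//   reserve_some_pages(SCRUB_SIZE, kPageReserveForPriority[VM_PRIORITY_USER]);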
1481
1482
1483static void
1484wake_up_page_reservation_waiters()
1485{
1486	MutexLocker pageDeficitLocker(sPageDeficitLock);
1487
1488	// TODO: If this is a low priority thread, we might want to disable
1489	// interrupts or otherwise ensure that we aren't unscheduled. Otherwise
	// high priority threads might be kept waiting while a medium priority thread
1491	// prevents us from running.
1492
1493	while (PageReservationWaiter* waiter = sPageReservationWaiters.Head()) {
1494		int32 reserved = reserve_some_pages(waiter->missing,
1495			waiter->dontTouch);
1496		if (reserved == 0)
1497			return;
1498
1499		atomic_add(&sUnsatisfiedPageReservations, -reserved);
1500		waiter->missing -= reserved;
1501
1502		if (waiter->missing > 0)
1503			return;
1504
1505		sPageReservationWaiters.Remove(waiter);
1506
1507		thread_unblock(waiter->thread, B_OK);
1508	}
1509}
1510
1511
1512static inline void
1513unreserve_pages(uint32 count)
1514{
1515	atomic_add(&sUnreservedFreePages, count);
1516	if (atomic_get(&sUnsatisfiedPageReservations) != 0)
1517		wake_up_page_reservation_waiters();
1518}
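// Returning pages to sUnreservedFreePages may allow pending reservations to be
// satisfied, hence the check for unsatisfied reservations here.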
1519
1520
1521static void
1522free_page(vm_page* page, bool clear)
1523{
1524	DEBUG_PAGE_ACCESS_CHECK(page);
1525
1526	PAGE_ASSERT(page, !page->IsMapped());
1527
1528	VMPageQueue* fromQueue;
1529
1530	switch (page->State()) {
1531		case PAGE_STATE_ACTIVE:
1532			fromQueue = &sActivePageQueue;
1533			break;
1534		case PAGE_STATE_INACTIVE:
1535			fromQueue = &sInactivePageQueue;
1536			break;
1537		case PAGE_STATE_MODIFIED:
1538			fromQueue = &sModifiedPageQueue;
1539			break;
1540		case PAGE_STATE_CACHED:
1541			fromQueue = &sCachedPageQueue;
1542			break;
1543		case PAGE_STATE_FREE:
1544		case PAGE_STATE_CLEAR:
1545			panic("free_page(): page %p already free", page);
1546			return;
1547		case PAGE_STATE_WIRED:
1548		case PAGE_STATE_UNUSED:
1549			fromQueue = NULL;
1550			break;
1551		default:
1552			panic("free_page(): page %p in invalid state %d",
1553				page, page->State());
1554			return;
1555	}
1556
1557	if (page->CacheRef() != NULL)
1558		panic("to be freed page %p has cache", page);
1559	if (page->IsMapped())
1560		panic("to be freed page %p has mappings", page);
1561
1562	if (fromQueue != NULL)
1563		fromQueue->RemoveUnlocked(page);
1564
1565	TA(FreePage(page->physical_page_number));
1566
1567#if VM_PAGE_ALLOCATION_TRACKING_AVAILABLE
1568	page->allocation_tracking_info.Clear();
1569#endif
1570
1571	ReadLocker locker(sFreePageQueuesLock);
1572
1573	DEBUG_PAGE_ACCESS_END(page);
1574
1575	if (clear) {
1576		page->SetState(PAGE_STATE_CLEAR);
1577		sClearPageQueue.PrependUnlocked(page);
1578	} else {
1579		page->SetState(PAGE_STATE_FREE);
1580		sFreePageQueue.PrependUnlocked(page);
1581		sFreePageCondition.NotifyAll();
1582	}
1583
1584	locker.Unlock();
1585}
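// Note: freeing (as opposed to clearing) notifies sFreePageCondition, which
// the page scrubber below waits on to pick up newly freed pages and move them
// to the clear queue.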
1586
1587
1588/*!	The caller must make sure that no-one else tries to change the page's state
1589	while the function is called. If the page has a cache, this can be done by
1590	locking the cache.
1591*/
1592static void
1593set_page_state(vm_page *page, int pageState)
1594{
1595	DEBUG_PAGE_ACCESS_CHECK(page);
1596
1597	if (pageState == page->State())
1598		return;
1599
1600	VMPageQueue* fromQueue;
1601
1602	switch (page->State()) {
1603		case PAGE_STATE_ACTIVE:
1604			fromQueue = &sActivePageQueue;
1605			break;
1606		case PAGE_STATE_INACTIVE:
1607			fromQueue = &sInactivePageQueue;
1608			break;
1609		case PAGE_STATE_MODIFIED:
1610			fromQueue = &sModifiedPageQueue;
1611			break;
1612		case PAGE_STATE_CACHED:
1613			fromQueue = &sCachedPageQueue;
1614			break;
1615		case PAGE_STATE_FREE:
1616		case PAGE_STATE_CLEAR:
1617			panic("set_page_state(): page %p is free/clear", page);
1618			return;
1619		case PAGE_STATE_WIRED:
1620		case PAGE_STATE_UNUSED:
1621			fromQueue = NULL;
1622			break;
1623		default:
1624			panic("set_page_state(): page %p in invalid state %d",
1625				page, page->State());
1626			return;
1627	}
1628
1629	VMPageQueue* toQueue;
1630
1631	switch (pageState) {
1632		case PAGE_STATE_ACTIVE:
1633			toQueue = &sActivePageQueue;
1634			break;
1635		case PAGE_STATE_INACTIVE:
1636			toQueue = &sInactivePageQueue;
1637			break;
1638		case PAGE_STATE_MODIFIED:
1639			toQueue = &sModifiedPageQueue;
1640			break;
1641		case PAGE_STATE_CACHED:
1642			PAGE_ASSERT(page, !page->IsMapped());
1643			PAGE_ASSERT(page, !page->modified);
1644			toQueue = &sCachedPageQueue;
1645			break;
1646		case PAGE_STATE_FREE:
1647		case PAGE_STATE_CLEAR:
1648			panic("set_page_state(): target state is free/clear");
1649			return;
1650		case PAGE_STATE_WIRED:
1651		case PAGE_STATE_UNUSED:
1652			toQueue = NULL;
1653			break;
1654		default:
1655			panic("set_page_state(): invalid target state %d", pageState);
1656			return;
1657	}
1658
1659	VMCache* cache = page->Cache();
1660	if (cache != NULL && cache->temporary) {
1661		if (pageState == PAGE_STATE_MODIFIED)
1662			atomic_add(&sModifiedTemporaryPages, 1);
1663		else if (page->State() == PAGE_STATE_MODIFIED)
1664			atomic_add(&sModifiedTemporaryPages, -1);
1665	}
1666
1667	// move the page
1668	if (toQueue == fromQueue) {
1669		// Note: Theoretically we are required to lock when changing the page
1670		// state, even if we don't change the queue. We actually don't have to
1671		// do this, though, since only for the active queue there are different
1672		// page states and active pages have a cache that must be locked at
1673		// this point. So we rely on the fact that everyone must lock the cache
1674		// before trying to change/interpret the page state.
1675		PAGE_ASSERT(page, cache != NULL);
1676		cache->AssertLocked();
1677		page->SetState(pageState);
1678	} else {
1679		if (fromQueue != NULL)
1680			fromQueue->RemoveUnlocked(page);
1681
1682		page->SetState(pageState);
1683
1684		if (toQueue != NULL)
1685			toQueue->AppendUnlocked(page);
1686	}
1687}
1688
1689
1690/*! Moves a previously modified page into a now appropriate queue.
1691	The page queues must not be locked.
1692*/
1693static void
1694move_page_to_appropriate_queue(vm_page *page)
1695{
1696	DEBUG_PAGE_ACCESS_CHECK(page);
1697
1698	// Note, this logic must be in sync with what the page daemon does.
1699	int32 state;
1700	if (page->IsMapped())
1701		state = PAGE_STATE_ACTIVE;
1702	else if (page->modified)
1703		state = PAGE_STATE_MODIFIED;
1704	else
1705		state = PAGE_STATE_CACHED;
1706
1707// TODO: If free + cached pages are low, we might directly want to free the
1708// page.
1709	set_page_state(page, state);
1710}
1711
1712
1713static void
1714clear_page(struct vm_page *page)
1715{
1716	vm_memset_physical(page->physical_page_number << PAGE_SHIFT, 0,
1717		B_PAGE_SIZE);
1718}
1719
1720
1721static status_t
1722mark_page_range_in_use(page_num_t startPage, page_num_t length, bool wired)
1723{
1724	TRACE(("mark_page_range_in_use: start %#" B_PRIxPHYSADDR ", len %#"
1725		B_PRIxPHYSADDR "\n", startPage, length));
1726
1727	if (sPhysicalPageOffset > startPage) {
1728		dprintf("mark_page_range_in_use(%#" B_PRIxPHYSADDR ", %#" B_PRIxPHYSADDR
1729			"): start page is before free list\n", startPage, length);
1730		if (sPhysicalPageOffset - startPage >= length)
1731			return B_OK;
1732		length -= sPhysicalPageOffset - startPage;
1733		startPage = sPhysicalPageOffset;
1734	}
1735
1736	startPage -= sPhysicalPageOffset;
1737
1738	if (startPage + length > sNumPages) {
1739		dprintf("mark_page_range_in_use(%#" B_PRIxPHYSADDR ", %#" B_PRIxPHYSADDR
1740			"): range would extend past free list\n", startPage, length);
1741		if (startPage >= sNumPages)
1742			return B_OK;
1743		length = sNumPages - startPage;
1744	}
1745
1746	WriteLocker locker(sFreePageQueuesLock);
1747
1748	for (page_num_t i = 0; i < length; i++) {
1749		vm_page *page = &sPages[startPage + i];
1750		switch (page->State()) {
1751			case PAGE_STATE_FREE:
1752			case PAGE_STATE_CLEAR:
1753			{
1754// TODO: This violates the page reservation policy, since we remove pages from
1755// the free/clear queues without having reserved them before. This should happen
1756// in the early boot process only, though.
1757				DEBUG_PAGE_ACCESS_START(page);
1758				VMPageQueue& queue = page->State() == PAGE_STATE_FREE
1759					? sFreePageQueue : sClearPageQueue;
1760				queue.Remove(page);
1761				page->SetState(wired ? PAGE_STATE_WIRED : PAGE_STATE_UNUSED);
1762				page->busy = false;
1763				atomic_add(&sUnreservedFreePages, -1);
1764				DEBUG_PAGE_ACCESS_END(page);
1765				break;
1766			}
1767			case PAGE_STATE_WIRED:
1768			case PAGE_STATE_UNUSED:
1769				break;
1770			case PAGE_STATE_ACTIVE:
1771			case PAGE_STATE_INACTIVE:
1772			case PAGE_STATE_MODIFIED:
1773			case PAGE_STATE_CACHED:
1774			default:
1775				// uh
1776				dprintf("mark_page_range_in_use: page %#" B_PRIxPHYSADDR
1777					" in non-free state %d!\n", startPage + i, page->State());
1778				break;
1779		}
1780	}
1781
1782	return B_OK;
1783}
1784
1785
1786/*!
1787	This is a background thread that wakes up when its condition is notified
1788	and moves some pages from the free queue over to the clear queue.
1789	Given enough time, it will clear out all pages from the free queue - we
1790	could probably slow it down after having reached a certain threshold.
1791*/
1792static int32
1793page_scrubber(void *unused)
1794{
1795	(void)(unused);
1796
1797	TRACE(("page_scrubber starting...\n"));
1798
1799	ConditionVariableEntry entry;
1800	for (;;) {
1801		while (sFreePageQueue.Count() == 0
1802				|| atomic_get(&sUnreservedFreePages)
1803					< (int32)sFreePagesTarget) {
1804			sFreePageCondition.Add(&entry);
1805			entry.Wait();
1806		}
1807
1808		// Since we temporarily remove pages from the free pages reserve,
1809		// we must make sure we don't cause a violation of the page
1810		// reservation warranty. The following is usually stricter than
1811		// necessary, because we don't have information on how many of the
1812		// reserved pages have already been allocated.
1813		int32 reserved = reserve_some_pages(SCRUB_SIZE,
1814			kPageReserveForPriority[VM_PRIORITY_USER]);
1815		if (reserved == 0)
1816			continue;
1817
1818		// get some pages from the free queue, mostly sorted
1819		ReadLocker locker(sFreePageQueuesLock);
1820
1821		vm_page *page[SCRUB_SIZE];
1822		int32 scrubCount = 0;
1823		for (int32 i = 0; i < reserved; i++) {
1824			page[i] = sFreePageQueue.RemoveHeadUnlocked();
1825			if (page[i] == NULL)
1826				break;
1827
1828			DEBUG_PAGE_ACCESS_START(page[i]);
1829
1830			page[i]->SetState(PAGE_STATE_ACTIVE);
1831			page[i]->busy = true;
1832			scrubCount++;
1833		}
1834
1835		locker.Unlock();
1836
1837		if (scrubCount == 0) {
1838			unreserve_pages(reserved);
1839			continue;
1840		}
1841
1842		TA(ScrubbingPages(scrubCount));
1843
1844		// clear them
1845		for (int32 i = 0; i < scrubCount; i++)
1846			clear_page(page[i]);
1847
1848		locker.Lock();
1849
1850		// and put them into the clear queue
1851		// process the array reversed when prepending to preserve sequential order
1852		for (int32 i = scrubCount - 1; i >= 0; i--) {
1853			page[i]->SetState(PAGE_STATE_CLEAR);
1854			page[i]->busy = false;
1855			DEBUG_PAGE_ACCESS_END(page[i]);
1856			sClearPageQueue.PrependUnlocked(page[i]);
1857		}
1858
1859		locker.Unlock();
1860
1861		unreserve_pages(reserved);
1862
1863		TA(ScrubbedPages(scrubCount));
1864
1865		// wait at least 100ms between runs
1866		snooze(100 * 1000);
1867	}
1868
1869	return 0;
1870}
1871
1872
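// Page markers are dummy vm_page entries (busy, without a cache) that the
// daemon threads insert into a queue so they can remember their position while
// temporarily unlocking the queue; iterating code recognizes and skips them.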
1873static void
1874init_page_marker(vm_page &marker)
1875{
1876	marker.SetCacheRef(NULL);
1877	marker.InitState(PAGE_STATE_UNUSED);
1878	marker.busy = true;
1879#if DEBUG_PAGE_QUEUE
1880	marker.queue = NULL;
1881#endif
1882#if DEBUG_PAGE_ACCESS
1883	marker.accessing_thread = thread_get_current_thread_id();
1884#endif
1885}
1886
1887
1888static void
1889remove_page_marker(struct vm_page &marker)
1890{
1891	DEBUG_PAGE_ACCESS_CHECK(&marker);
1892
1893	if (marker.State() < PAGE_STATE_FIRST_UNQUEUED)
1894		sPageQueues[marker.State()].RemoveUnlocked(&marker);
1895
1896	marker.SetState(PAGE_STATE_UNUSED);
1897}
1898
1899
1900static vm_page*
1901next_modified_page(page_num_t& maxPagesToSee)
1902{
1903	InterruptsSpinLocker locker(sModifiedPageQueue.GetLock());
1904
1905	while (maxPagesToSee > 0) {
1906		vm_page* page = sModifiedPageQueue.Head();
1907		if (page == NULL)
1908			return NULL;
1909
1910		sModifiedPageQueue.Requeue(page, true);
1911
1912		maxPagesToSee--;
1913
1914		if (!page->busy)
1915			return page;
1916	}
1917
1918	return NULL;
1919}
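// Note: Requeue(page, true) moves the inspected head to the tail of the queue,
// so successive calls walk the modified queue round-robin, with maxPagesToSee
// bounding one full pass.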
1920
1921
1922// #pragma mark -
1923
1924
1925class PageWriteTransfer;
1926class PageWriteWrapper;
1927
1928
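// A PageWriterRun collects modified pages via AddPage(), groups them into
// PageWriteTransfer batches (one per cache and contiguous range), starts the
// I/O with Go(), and is notified through PageWritten() as each transfer
// completes.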
1929class PageWriterRun {
1930public:
1931	status_t Init(uint32 maxPages);
1932
1933	void PrepareNextRun();
1934	void AddPage(vm_page* page);
1935	uint32 Go();
1936
1937	void PageWritten(PageWriteTransfer* transfer, status_t status,
1938		bool partialTransfer, size_t bytesTransferred);
1939
1940private:
1941	uint32				fMaxPages;
1942	uint32				fWrapperCount;
1943	uint32				fTransferCount;
1944	int32				fPendingTransfers;
1945	PageWriteWrapper*	fWrappers;
1946	PageWriteTransfer*	fTransfers;
1947	ConditionVariable	fAllFinishedCondition;
1948};
1949
1950
1951class PageWriteTransfer : public AsyncIOCallback {
1952public:
1953	void SetTo(PageWriterRun* run, vm_page* page, int32 maxPages);
1954	bool AddPage(vm_page* page);
1955
1956	status_t Schedule(uint32 flags);
1957
1958	void SetStatus(status_t status, size_t transferred);
1959
1960	status_t Status() const	{ return fStatus; }
1961	struct VMCache* Cache() const { return fCache; }
1962	uint32 PageCount() const { return fPageCount; }
1963
1964	virtual void IOFinished(status_t status, bool partialTransfer,
1965		generic_size_t bytesTransferred);
1966private:
1967	PageWriterRun*		fRun;
1968	struct VMCache*		fCache;
1969	off_t				fOffset;
1970	uint32				fPageCount;
1971	int32				fMaxPages;
1972	status_t			fStatus;
1973	uint32				fVecCount;
1974	generic_io_vec		fVecs[32]; // TODO: make dynamic/configurable
1975};
1976
1977
1978class PageWriteWrapper {
1979public:
1980	PageWriteWrapper();
1981	~PageWriteWrapper();
1982	void SetTo(vm_page* page);
1983	bool Done(status_t result);
1984
1985private:
1986	vm_page*			fPage;
1987	struct VMCache*		fCache;
1988	bool				fIsActive;
1989};
1990
1991
1992PageWriteWrapper::PageWriteWrapper()
1993	:
1994	fIsActive(false)
1995{
1996}
1997
1998
1999PageWriteWrapper::~PageWriteWrapper()
2000{
2001	if (fIsActive)
2002		panic("page write wrapper going out of scope but isn't completed");
2003}
2004
2005
2006/*!	The page's cache must be locked.
2007*/
2008void
2009PageWriteWrapper::SetTo(vm_page* page)
2010{
2011	DEBUG_PAGE_ACCESS_CHECK(page);
2012
2013	if (page->busy)
2014		panic("setting page write wrapper to busy page");
2015
2016	if (fIsActive)
2017		panic("re-setting page write wrapper that isn't completed");
2018
2019	fPage = page;
2020	fCache = page->Cache();
2021	fIsActive = true;
2022
2023	fPage->busy = true;
2024	fPage->busy_writing = true;
2025
2026	// We have a modified page -- however, while we're writing it back,
2027	// the page might still be mapped. In order not to lose any changes to the
2028	// page, we mark it clean before actually writing it back; if
2029	// writing the page fails for some reason, we'll just keep it in the
2030	// modified page list, but that should happen only rarely.
2031
2032	// If the page is changed after we cleared the dirty flag, but before we
2033	// had the chance to write it back, then we'll write it again later -- that
2034	// will probably not happen that often, though.
2035
2036	vm_clear_map_flags(fPage, PAGE_MODIFIED);
2037}
2038
2039
2040/*!	The page's cache must be locked.
2041	The page queues must not be locked.
	\return \c true if the page was written successfully or could otherwise be
		handled, \c false otherwise.
2044*/
2045bool
2046PageWriteWrapper::Done(status_t result)
2047{
2048	if (!fIsActive)
2049		panic("completing page write wrapper that is not active");
2050
2051	DEBUG_PAGE_ACCESS_START(fPage);
2052
2053	fPage->busy = false;
2054		// Set unbusy and notify later by hand, since we might free the page.
2055
2056	bool success = true;
2057
2058	if (result == B_OK) {
2059		// put it into the active/inactive queue
2060		move_page_to_appropriate_queue(fPage);
2061		fPage->busy_writing = false;
2062		DEBUG_PAGE_ACCESS_END(fPage);
2063	} else {
2064		// Writing the page failed. One reason would be that the cache has been
		// shrunk and the page no longer belongs to the file. Otherwise the
2066		// actual I/O failed, in which case we'll simply keep the page modified.
2067
2068		if (!fPage->busy_writing) {
2069			// The busy_writing flag was cleared. That means the cache has been
2070			// shrunk while we were trying to write the page and we have to free
2071			// it now.
2072			vm_remove_all_page_mappings(fPage);
2073// TODO: Unmapping should already happen when resizing the cache!
2074			fCache->RemovePage(fPage);
2075			free_page(fPage, false);
2076			unreserve_pages(1);
2077		} else {
2078			// Writing the page failed -- mark the page modified and move it to
2079			// an appropriate queue other than the modified queue, so we don't
2080			// keep trying to write it over and over again. We keep
2081			// non-temporary pages in the modified queue, though, so they don't
2082			// get lost in the inactive queue.
2083			dprintf("PageWriteWrapper: Failed to write page %p: %s\n", fPage,
2084				strerror(result));
2085
2086			fPage->modified = true;
2087			if (!fCache->temporary)
2088				set_page_state(fPage, PAGE_STATE_MODIFIED);
2089			else if (fPage->IsMapped())
2090				set_page_state(fPage, PAGE_STATE_ACTIVE);
2091			else
2092				set_page_state(fPage, PAGE_STATE_INACTIVE);
2093
2094			fPage->busy_writing = false;
2095			DEBUG_PAGE_ACCESS_END(fPage);
2096
2097			success = false;
2098		}
2099	}
2100
2101	fCache->NotifyPageEvents(fPage, PAGE_EVENT_NOT_BUSY);
2102	fIsActive = false;
2103
2104	return success;
2105}
2106
2107
2108/*!	The page's cache must be locked.
2109*/
2110void
2111PageWriteTransfer::SetTo(PageWriterRun* run, vm_page* page, int32 maxPages)
2112{
2113	fRun = run;
2114	fCache = page->Cache();
2115	fOffset = page->cache_offset;
2116	fPageCount = 1;
2117	fMaxPages = maxPages;
2118	fStatus = B_OK;
2119
2120	fVecs[0].base = (phys_addr_t)page->physical_page_number << PAGE_SHIFT;
2121	fVecs[0].length = B_PAGE_SIZE;
2122	fVecCount = 1;
2123}
2124
2125
2126/*!	The page's cache must be locked.
2127*/
2128bool
2129PageWriteTransfer::AddPage(vm_page* page)
2130{
2131	if (page->Cache() != fCache
2132		|| (fMaxPages >= 0 && fPageCount >= (uint32)fMaxPages))
2133		return false;
2134
2135	phys_addr_t nextBase = fVecs[fVecCount - 1].base
2136		+ fVecs[fVecCount - 1].length;
2137
2138	if ((phys_addr_t)page->physical_page_number << PAGE_SHIFT == nextBase
2139		&& (off_t)page->cache_offset == fOffset + fPageCount) {
2140		// append to last iovec
2141		fVecs[fVecCount - 1].length += B_PAGE_SIZE;
2142		fPageCount++;
2143		return true;
2144	}
2145
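	// Otherwise check whether the page immediately precedes the first iovec
	// physically and the transfer's first page logically; then we can prepend
	// it.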
2146	nextBase = fVecs[0].base - B_PAGE_SIZE;
2147	if ((phys_addr_t)page->physical_page_number << PAGE_SHIFT == nextBase
2148		&& (off_t)page->cache_offset == fOffset - 1) {
2149		// prepend to first iovec and adjust offset
2150		fVecs[0].base = nextBase;
2151		fVecs[0].length += B_PAGE_SIZE;
2152		fOffset = page->cache_offset;
2153		fPageCount++;
2154		return true;
2155	}
2156
2157	if (((off_t)page->cache_offset == fOffset + fPageCount
2158			|| (off_t)page->cache_offset == fOffset - 1)
2159		&& fVecCount < sizeof(fVecs) / sizeof(fVecs[0])) {
2160		// not physically contiguous or not in the right order
2161		uint32 vectorIndex;
2162		if ((off_t)page->cache_offset < fOffset) {
2163			// we are pre-pending another vector, move the other vecs
2164			for (uint32 i = fVecCount; i > 0; i--)
2165				fVecs[i] = fVecs[i - 1];
2166
2167			fOffset = page->cache_offset;
2168			vectorIndex = 0;
2169		} else
2170			vectorIndex = fVecCount;
2171
2172		fVecs[vectorIndex].base
2173			= (phys_addr_t)page->physical_page_number << PAGE_SHIFT;
2174		fVecs[vectorIndex].length = B_PAGE_SIZE;
2175
2176		fVecCount++;
2177		fPageCount++;
2178		return true;
2179	}
2180
2181	return false;
2182}
2183
2184
2185status_t
2186PageWriteTransfer::Schedule(uint32 flags)
2187{
2188	off_t writeOffset = (off_t)fOffset << PAGE_SHIFT;
2189	generic_size_t writeLength = (phys_size_t)fPageCount << PAGE_SHIFT;
2190
2191	if (fRun != NULL) {
2192		return fCache->WriteAsync(writeOffset, fVecs, fVecCount, writeLength,
2193			flags | B_PHYSICAL_IO_REQUEST, this);
2194	}
2195
2196	status_t status = fCache->Write(writeOffset, fVecs, fVecCount,
2197		flags | B_PHYSICAL_IO_REQUEST, &writeLength);
2198
2199	SetStatus(status, writeLength);
2200	return fStatus;
2201}
2202
2203
2204void
2205PageWriteTransfer::SetStatus(status_t status, size_t transferred)
2206{
2207	// only succeed if all pages up to the last one have been written fully
2208	// and the last page has at least been written partially
2209	if (status == B_OK && transferred <= (fPageCount - 1) * B_PAGE_SIZE)
2210		status = B_ERROR;
2211
2212	fStatus = status;
2213}
2214
2215
2216void
2217PageWriteTransfer::IOFinished(status_t status, bool partialTransfer,
2218	generic_size_t bytesTransferred)
2219{
2220	SetStatus(status, bytesTransferred);
2221	fRun->PageWritten(this, fStatus, partialTransfer, bytesTransferred);
2222}
2223
2224
2225status_t
2226PageWriterRun::Init(uint32 maxPages)
2227{
2228	fMaxPages = maxPages;
2229	fWrapperCount = 0;
2230	fTransferCount = 0;
2231	fPendingTransfers = 0;
2232
2233	fWrappers = new(std::nothrow) PageWriteWrapper[maxPages];
2234	fTransfers = new(std::nothrow) PageWriteTransfer[maxPages];
2235	if (fWrappers == NULL || fTransfers == NULL)
2236		return B_NO_MEMORY;
2237
2238	return B_OK;
2239}
2240
2241
2242void
2243PageWriterRun::PrepareNextRun()
2244{
2245	fWrapperCount = 0;
2246	fTransferCount = 0;
2247	fPendingTransfers = 0;
2248}
2249
2250
2251/*!	The page's cache must be locked.
2252*/
2253void
2254PageWriterRun::AddPage(vm_page* page)
2255{
2256	fWrappers[fWrapperCount++].SetTo(page);
2257
2258	if (fTransferCount == 0 || !fTransfers[fTransferCount - 1].AddPage(page)) {
2259		fTransfers[fTransferCount++].SetTo(this, page,
2260			page->Cache()->MaxPagesPerAsyncWrite());
2261	}
2262}
2263
2264
2265/*!	Writes all pages previously added.
2266	\return The number of pages that could not be written or otherwise handled.
2267*/
2268uint32
2269PageWriterRun::Go()
2270{
2271	atomic_set(&fPendingTransfers, fTransferCount);
2272
2273	fAllFinishedCondition.Init(this, "page writer wait for I/O");
2274	ConditionVariableEntry waitEntry;
2275	fAllFinishedCondition.Add(&waitEntry);
2276
2277	// schedule writes
2278	for (uint32 i = 0; i < fTransferCount; i++)
2279		fTransfers[i].Schedule(B_VIP_IO_REQUEST);
2280
2281	// wait until all pages have been written
2282	waitEntry.Wait();
2283
2284	// mark pages depending on whether they could be written or not
2285
2286	uint32 failedPages = 0;
2287	uint32 wrapperIndex = 0;
2288	for (uint32 i = 0; i < fTransferCount; i++) {
2289		PageWriteTransfer& transfer = fTransfers[i];
2290		transfer.Cache()->Lock();
2291
2292		for (uint32 j = 0; j < transfer.PageCount(); j++) {
2293			if (!fWrappers[wrapperIndex++].Done(transfer.Status()))
2294				failedPages++;
2295		}
2296
2297		transfer.Cache()->Unlock();
2298	}
2299
2300	ASSERT(wrapperIndex == fWrapperCount);
2301
2302	for (uint32 i = 0; i < fTransferCount; i++) {
2303		PageWriteTransfer& transfer = fTransfers[i];
2304		struct VMCache* cache = transfer.Cache();
2305
		// We've acquired a cache and a store reference for each page
2307		for (uint32 j = 0; j < transfer.PageCount(); j++) {
2308			// We release the cache references after all pages were made
2309			// unbusy again - otherwise releasing a vnode could deadlock.
2310			cache->ReleaseStoreRef();
2311			cache->ReleaseRef();
2312		}
2313	}
2314
2315	return failedPages;
2316}
2317
2318
2319void
2320PageWriterRun::PageWritten(PageWriteTransfer* transfer, status_t status,
2321	bool partialTransfer, size_t bytesTransferred)
2322{
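	// atomic_add() returns the previous value, so only the last transfer to
	// complete (previous value 1) wakes up Go().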
2323	if (atomic_add(&fPendingTransfers, -1) == 1)
2324		fAllFinishedCondition.NotifyAll();
2325}
2326
2327
2328/*!	The page writer continuously takes some pages from the modified
2329	queue, writes them back, and moves them back to the active queue.
2330	It runs in its own thread, and is only there to keep the number
	of modified pages low, so that more pages can be reused at a lower
	cost.
2333*/
2334status_t
2335page_writer(void* /*unused*/)
2336{
2337	const uint32 kNumPages = 256;
2338#ifdef TRACE_VM_PAGE
2339	uint32 writtenPages = 0;
2340	bigtime_t lastWrittenTime = 0;
2341	bigtime_t pageCollectionTime = 0;
2342	bigtime_t pageWritingTime = 0;
2343#endif
2344
2345	PageWriterRun run;
2346	if (run.Init(kNumPages) != B_OK) {
2347		panic("page writer: Failed to init PageWriterRun!");
2348		return B_ERROR;
2349	}
2350
2351	page_num_t pagesSinceLastSuccessfulWrite = 0;
2352
2353	while (true) {
2354// TODO: Maybe wait shorter when memory is low!
2355		if (sModifiedPageQueue.Count() < kNumPages) {
2356			sPageWriterCondition.Wait(3000000, true);
				// every 3 seconds when no one triggers us
2358		}
2359
2360		page_num_t modifiedPages = sModifiedPageQueue.Count();
2361		if (modifiedPages == 0)
2362			continue;
2363
2364		if (modifiedPages <= pagesSinceLastSuccessfulWrite) {
2365			// We ran through the whole queue without being able to write a
2366			// single page. Take a break.
2367			snooze(500000);
2368			pagesSinceLastSuccessfulWrite = 0;
2369		}
2370
2371#if ENABLE_SWAP_SUPPORT
2372		page_stats pageStats;
2373		get_page_stats(pageStats);
2374		bool activePaging = do_active_paging(pageStats);
2375#endif
2376
2377		// depending on how urgent it becomes to get pages to disk, we adjust
2378		// our I/O priority
2379		uint32 lowPagesState = low_resource_state(B_KERNEL_RESOURCE_PAGES);
2380		int32 ioPriority = B_IDLE_PRIORITY;
2381		if (lowPagesState >= B_LOW_RESOURCE_CRITICAL
2382			|| modifiedPages > MAX_PAGE_WRITER_IO_PRIORITY_THRESHOLD) {
2383			ioPriority = MAX_PAGE_WRITER_IO_PRIORITY;
2384		} else {
2385			ioPriority = (uint64)MAX_PAGE_WRITER_IO_PRIORITY * modifiedPages
2386				/ MAX_PAGE_WRITER_IO_PRIORITY_THRESHOLD;
2387		}
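		// For example, with 2500 modified pages and a threshold of 10000 the
		// page writer runs at a quarter of the maximum I/O priority.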
2388
2389		thread_set_io_priority(ioPriority);
2390
2391		uint32 numPages = 0;
2392		run.PrepareNextRun();
2393
2394		// TODO: make this laptop friendly, too (ie. only start doing
2395		// something if someone else did something or there is really
2396		// enough to do).
2397
2398		// collect pages to be written
2399#ifdef TRACE_VM_PAGE
2400		pageCollectionTime -= system_time();
2401#endif
2402
2403		page_num_t maxPagesToSee = modifiedPages;
2404
2405		while (numPages < kNumPages && maxPagesToSee > 0) {
2406			vm_page *page = next_modified_page(maxPagesToSee);
2407			if (page == NULL)
2408				break;
2409
2410			PageCacheLocker cacheLocker(page, false);
2411			if (!cacheLocker.IsLocked())
2412				continue;
2413
2414			VMCache *cache = page->Cache();
2415
2416			// If the page is busy or its state has changed while we were
2417			// locking the cache, just ignore it.
2418			if (page->busy || page->State() != PAGE_STATE_MODIFIED)
2419				continue;
2420
2421			DEBUG_PAGE_ACCESS_START(page);
2422
2423			// Don't write back wired (locked) pages.
2424			if (page->WiredCount() > 0) {
2425				set_page_state(page, PAGE_STATE_ACTIVE);
2426				DEBUG_PAGE_ACCESS_END(page);
2427				continue;
2428			}
2429
2430			// Write back temporary pages only when we're actively paging.
2431			if (cache->temporary
2432#if ENABLE_SWAP_SUPPORT
2433				&& (!activePaging
2434					|| !cache->CanWritePage(
2435							(off_t)page->cache_offset << PAGE_SHIFT))
2436#endif
2437				) {
2438				// We can't/don't want to do anything with this page, so move it
2439				// to one of the other queues.
2440				if (page->mappings.IsEmpty())
2441					set_page_state(page, PAGE_STATE_INACTIVE);
2442				else
2443					set_page_state(page, PAGE_STATE_ACTIVE);
2444
2445				DEBUG_PAGE_ACCESS_END(page);
2446				continue;
2447			}
2448
			// We need our own reference to the store, as it might currently be
			// in the process of being destroyed.
2451			if (cache->AcquireUnreferencedStoreRef() != B_OK) {
2452				DEBUG_PAGE_ACCESS_END(page);
2453				cacheLocker.Unlock();
2454				thread_yield();
2455				continue;
2456			}
2457
2458			run.AddPage(page);
2459				// TODO: We're possibly adding pages of different caches and
2460				// thus maybe of different underlying file systems here. This
2461				// is a potential problem for loop file systems/devices, since
2462				// we could mark a page busy that would need to be accessed
2463				// when writing back another page, thus causing a deadlock.
2464
2465			DEBUG_PAGE_ACCESS_END(page);
2466
2467			//dprintf("write page %p, cache %p (%ld)\n", page, page->cache, page->cache->ref_count);
2468			TPW(WritePage(page));
2469
2470			cache->AcquireRefLocked();
2471			numPages++;
2472		}
2473
2474#ifdef TRACE_VM_PAGE
2475		pageCollectionTime += system_time();
2476#endif
2477		if (numPages == 0)
2478			continue;
2479
2480		// write pages to disk and do all the cleanup
2481#ifdef TRACE_VM_PAGE
2482		pageWritingTime -= system_time();
2483#endif
2484		uint32 failedPages = run.Go();
2485#ifdef TRACE_VM_PAGE
2486		pageWritingTime += system_time();
2487
2488		// debug output only...
2489		writtenPages += numPages;
2490		if (writtenPages >= 1024) {
2491			bigtime_t now = system_time();
2492			TRACE(("page writer: wrote 1024 pages (total: %" B_PRIu64 " ms, "
2493				"collect: %" B_PRIu64 " ms, write: %" B_PRIu64 " ms)\n",
2494				(now - lastWrittenTime) / 1000,
2495				pageCollectionTime / 1000, pageWritingTime / 1000));
2496			lastWrittenTime = now;
2497
2498			writtenPages -= 1024;
2499			pageCollectionTime = 0;
2500			pageWritingTime = 0;
2501		}
2502#endif
2503
2504		if (failedPages == numPages)
2505			pagesSinceLastSuccessfulWrite += modifiedPages - maxPagesToSee;
2506		else
2507			pagesSinceLastSuccessfulWrite = 0;
2508	}
2509
2510	return B_OK;
2511}
2512
2513
2514// #pragma mark -
2515
2516
2517// TODO: This should be done in the page daemon!
2518#if 0
2519#if ENABLE_SWAP_SUPPORT
2520static bool
2521free_page_swap_space(int32 index)
2522{
2523	vm_page *page = vm_page_at_index(index);
2524	PageCacheLocker locker(page);
2525	if (!locker.IsLocked())
2526		return false;
2527
2528	DEBUG_PAGE_ACCESS_START(page);
2529
2530	VMCache* cache = page->Cache();
2531	if (cache->temporary && page->WiredCount() == 0
2532			&& cache->HasPage(page->cache_offset << PAGE_SHIFT)
2533			&& page->usage_count > 0) {
2534		// TODO: how to judge a page is highly active?
2535		if (swap_free_page_swap_space(page)) {
2536			// We need to mark the page modified, since otherwise it could be
2537			// stolen and we'd lose its data.
2538			vm_page_set_state(page, PAGE_STATE_MODIFIED);
2539			TD(FreedPageSwap(page));
2540			DEBUG_PAGE_ACCESS_END(page);
2541			return true;
2542		}
2543	}
2544	DEBUG_PAGE_ACCESS_END(page);
2545	return false;
2546}
2547#endif
2548#endif	// 0
2549
2550
2551static vm_page *
2552find_cached_page_candidate(struct vm_page &marker)
2553{
2554	DEBUG_PAGE_ACCESS_CHECK(&marker);
2555
2556	InterruptsSpinLocker locker(sCachedPageQueue.GetLock());
2557	vm_page *page;
2558
2559	if (marker.State() == PAGE_STATE_UNUSED) {
		// Get the first page of the cached queue
2561		page = sCachedPageQueue.Head();
2562	} else {
2563		// Get the next page of the current queue
2564		if (marker.State() != PAGE_STATE_CACHED) {
2565			panic("invalid marker %p state", &marker);
2566			return NULL;
2567		}
2568
2569		page = sCachedPageQueue.Next(&marker);
2570		sCachedPageQueue.Remove(&marker);
2571		marker.SetState(PAGE_STATE_UNUSED);
2572	}
2573
2574	while (page != NULL) {
2575		if (!page->busy) {
2576			// we found a candidate, insert marker
2577			marker.SetState(PAGE_STATE_CACHED);
2578			sCachedPageQueue.InsertAfter(page, &marker);
2579			return page;
2580		}
2581
2582		page = sCachedPageQueue.Next(page);
2583	}
2584
2585	return NULL;
2586}
2587
2588
2589static bool
2590free_cached_page(vm_page *page, bool dontWait)
2591{
2592	// try to lock the page's cache
2593	if (vm_cache_acquire_locked_page_cache(page, dontWait) == NULL)
2594		return false;
2595	VMCache* cache = page->Cache();
2596
2597	AutoLocker<VMCache> cacheLocker(cache, true);
2598	MethodDeleter<VMCache, void, &VMCache::ReleaseRefLocked> _2(cache);
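	// On return the deleter drops the reference acquired by
	// vm_cache_acquire_locked_page_cache() while the cache is still locked;
	// the AutoLocker above then unlocks the cache.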
2599
2600	// check again if that page is still a candidate
2601	if (page->busy || page->State() != PAGE_STATE_CACHED)
2602		return false;
2603
2604	DEBUG_PAGE_ACCESS_START(page);
2605
2606	PAGE_ASSERT(page, !page->IsMapped());
2607	PAGE_ASSERT(page, !page->modified);
2608
2609	// we can now steal this page
2610
2611	cache->RemovePage(page);
		// Now the page doesn't have a cache anymore, so no one else (e.g.
		// vm_page_allocate_page_run()) can pick it up, since they would be
		// required to lock the cache first, which would fail.
2615
2616	sCachedPageQueue.RemoveUnlocked(page);
2617	return true;
2618}
2619
2620
2621static uint32
2622free_cached_pages(uint32 pagesToFree, bool dontWait)
2623{
2624	vm_page marker;
2625	init_page_marker(marker);
2626
2627	uint32 pagesFreed = 0;
2628
2629	while (pagesFreed < pagesToFree) {
2630		vm_page *page = find_cached_page_candidate(marker);
2631		if (page == NULL)
2632			break;
2633
2634		if (free_cached_page(page, dontWait)) {
2635			ReadLocker locker(sFreePageQueuesLock);
2636			page->SetState(PAGE_STATE_FREE);
2637			DEBUG_PAGE_ACCESS_END(page);
2638			sFreePageQueue.PrependUnlocked(page);
2639			locker.Unlock();
2640
2641			TA(StolenPage());
2642
2643			pagesFreed++;
2644		}
2645	}
2646
2647	remove_page_marker(marker);
2648
2649	sFreePageCondition.NotifyAll();
2650
2651	return pagesFreed;
2652}
2653
2654
2655static void
2656idle_scan_active_pages(page_stats& pageStats)
2657{
2658	VMPageQueue& queue = sActivePageQueue;
2659
2660	// We want to scan the whole queue in roughly kIdleRunsForFullQueue runs.
2661	uint32 maxToScan = queue.Count() / kIdleRunsForFullQueue + 1;
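	// For example, a 10000 page active queue is scanned in chunks of roughly
	// 500 pages per idle run (kIdleRunsForFullQueue == 20).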
2662
2663	while (maxToScan > 0) {
2664		maxToScan--;
2665
2666		// Get the next page. Note that we don't bother to lock here. We go with
2667		// the assumption that on all architectures reading/writing pointers is
2668		// atomic. Beyond that it doesn't really matter. We have to unlock the
2669		// queue anyway to lock the page's cache, and we'll recheck afterwards.
2670		vm_page* page = queue.Head();
2671		if (page == NULL)
2672			break;
2673
2674		// lock the page's cache
2675		VMCache* cache = vm_cache_acquire_locked_page_cache(page, true);
2676		if (cache == NULL)
2677			continue;
2678
2679		if (page->State() != PAGE_STATE_ACTIVE) {
2680			// page is no longer in the cache or in this queue
2681			cache->ReleaseRefAndUnlock();
2682			continue;
2683		}
2684
2685		if (page->busy) {
2686			// page is busy -- requeue at the end
2687			vm_page_requeue(page, true);
2688			cache->ReleaseRefAndUnlock();
2689			continue;
2690		}
2691
2692		DEBUG_PAGE_ACCESS_START(page);
2693
2694		// Get the page active/modified flags and update the page's usage count.
		// We completely unmap inactive temporary pages. This saves us from
		// iterating through the inactive list as well, since we'll be notified
2697		// via page fault whenever such an inactive page is used again.
2698		// We don't remove the mappings of non-temporary pages, since we
2699		// wouldn't notice when those would become unused and could thus be
2700		// moved to the cached list.
2701		int32 usageCount;
2702		if (page->WiredCount() > 0 || page->usage_count > 0
2703			|| !cache->temporary) {
2704			usageCount = vm_clear_page_mapping_accessed_flags(page);
2705		} else
2706			usageCount = vm_remove_all_page_mappings_if_unaccessed(page);
2707
2708		if (usageCount > 0) {
2709			usageCount += page->usage_count + kPageUsageAdvance;
2710			if (usageCount > kPageUsageMax)
2711				usageCount = kPageUsageMax;
2712// TODO: This would probably also be the place to reclaim swap space.
2713		} else {
2714			usageCount += page->usage_count - (int32)kPageUsageDecline;
2715			if (usageCount < 0) {
2716				usageCount = 0;
2717				set_page_state(page, PAGE_STATE_INACTIVE);
2718			}
2719		}
2720
2721		page->usage_count = usageCount;
2722
2723		DEBUG_PAGE_ACCESS_END(page);
2724
2725		cache->ReleaseRefAndUnlock();
2726	}
2727}
2728
2729
2730static void
2731full_scan_inactive_pages(page_stats& pageStats, int32 despairLevel)
2732{
2733	int32 pagesToFree = pageStats.unsatisfiedReservations
2734		+ sFreeOrCachedPagesTarget
2735		- (pageStats.totalFreePages + pageStats.cachedPages);
2736	if (pagesToFree <= 0)
2737		return;
2738
2739	bigtime_t time = system_time();
2740	uint32 pagesScanned = 0;
2741	uint32 pagesToCached = 0;
2742	uint32 pagesToModified = 0;
2743	uint32 pagesToActive = 0;
2744
2745	// Determine how many pages at maximum to send to the modified queue. Since
2746	// it is relatively expensive to page out pages, we do that on a grander
2747	// scale only when things get desperate.
2748	uint32 maxToFlush = despairLevel <= 1 ? 32 : 10000;
2749
2750	vm_page marker;
2751	init_page_marker(marker);
2752
2753	VMPageQueue& queue = sInactivePageQueue;
2754	InterruptsSpinLocker queueLocker(queue.GetLock());
2755	uint32 maxToScan = queue.Count();
2756
2757	vm_page* nextPage = queue.Head();
2758
2759	while (pagesToFree > 0 && maxToScan > 0) {
2760		maxToScan--;
2761
2762		// get the next page
2763		vm_page* page = nextPage;
2764		if (page == NULL)
2765			break;
2766		nextPage = queue.Next(page);
2767
2768		if (page->busy)
2769			continue;
2770
2771		// mark the position
2772		queue.InsertAfter(page, &marker);
2773		queueLocker.Unlock();
2774
2775		// lock the page's cache
2776		VMCache* cache = vm_cache_acquire_locked_page_cache(page, true);
2777		if (cache == NULL || page->busy
2778				|| page->State() != PAGE_STATE_INACTIVE) {
2779			if (cache != NULL)
2780				cache->ReleaseRefAndUnlock();
2781			queueLocker.Lock();
2782			nextPage = queue.Next(&marker);
2783			queue.Remove(&marker);
2784			continue;
2785		}
2786
2787		pagesScanned++;
2788
2789		DEBUG_PAGE_ACCESS_START(page);
2790
2791		// Get the accessed count, clear the accessed/modified flags and
2792		// unmap the page, if it hasn't been accessed.
2793		int32 usageCount;
2794		if (page->WiredCount() > 0)
2795			usageCount = vm_clear_page_mapping_accessed_flags(page);
2796		else
2797			usageCount = vm_remove_all_page_mappings_if_unaccessed(page);
2798
2799		// update usage count
2800		if (usageCount > 0) {
2801			usageCount += page->usage_count + kPageUsageAdvance;
2802			if (usageCount > kPageUsageMax)
2803				usageCount = kPageUsageMax;
2804		} else {
2805			usageCount += page->usage_count - (int32)kPageUsageDecline;
2806			if (usageCount < 0)
2807				usageCount = 0;
2808		}
2809
2810		page->usage_count = usageCount;
2811
2812		// Move to fitting queue or requeue:
2813		// * Active mapped pages go to the active queue.
2814		// * Inactive mapped (i.e. wired) pages are requeued.
2815		// * The remaining pages are cachable. Thus, if unmodified they go to
2816		//   the cached queue, otherwise to the modified queue (up to a limit).
		//   Note that, unlike in the idle scan, we don't exempt pages of
		//   temporary caches here. Apparently we really need memory, so we'd
		//   better page out memory as well.
2820		bool isMapped = page->IsMapped();
2821		if (usageCount > 0) {
2822			if (isMapped) {
2823				set_page_state(page, PAGE_STATE_ACTIVE);
2824				pagesToActive++;
2825			} else
2826				vm_page_requeue(page, true);
2827		} else if (isMapped) {
2828			vm_page_requeue(page, true);
2829		} else if (!page->modified) {
2830			set_page_state(page, PAGE_STATE_CACHED);
2831			pagesToFree--;
2832			pagesToCached++;
2833		} else if (maxToFlush > 0) {
2834			set_page_state(page, PAGE_STATE_MODIFIED);
2835			maxToFlush--;
2836			pagesToModified++;
2837		} else
2838			vm_page_requeue(page, true);
2839
2840		DEBUG_PAGE_ACCESS_END(page);
2841
2842		cache->ReleaseRefAndUnlock();
2843
2844		// remove the marker
2845		queueLocker.Lock();
2846		nextPage = queue.Next(&marker);
2847		queue.Remove(&marker);
2848	}
2849
2850	queueLocker.Unlock();
2851
2852	time = system_time() - time;
2853	TRACE_DAEMON("  -> inactive scan (%7" B_PRId64 " us): scanned: %7" B_PRIu32
2854		", moved: %" B_PRIu32 " -> cached, %" B_PRIu32 " -> modified, %"
2855		B_PRIu32 " -> active\n", time, pagesScanned, pagesToCached,
2856		pagesToModified, pagesToActive);
2857
2858	// wake up the page writer, if we tossed it some pages
2859	if (pagesToModified > 0)
2860		sPageWriterCondition.WakeUp();
2861}
2862
2863
2864static void
2865full_scan_active_pages(page_stats& pageStats, int32 despairLevel)
2866{
2867	vm_page marker;
2868	init_page_marker(marker);
2869
2870	VMPageQueue& queue = sActivePageQueue;
2871	InterruptsSpinLocker queueLocker(queue.GetLock());
2872	uint32 maxToScan = queue.Count();
2873
2874	int32 pagesToDeactivate = pageStats.unsatisfiedReservations
2875		+ sFreeOrCachedPagesTarget
2876		- (pageStats.totalFreePages + pageStats.cachedPages)
2877		+ std::max((int32)sInactivePagesTarget - (int32)maxToScan, (int32)0);
2878	if (pagesToDeactivate <= 0)
2879		return;
2880
2881	bigtime_t time = system_time();
2882	uint32 pagesAccessed = 0;
2883	uint32 pagesToInactive = 0;
2884	uint32 pagesScanned = 0;
2885
2886	vm_page* nextPage = queue.Head();
2887
2888	while (pagesToDeactivate > 0 && maxToScan > 0) {
2889		maxToScan--;
2890
2891		// get the next page
2892		vm_page* page = nextPage;
2893		if (page == NULL)
2894			break;
2895		nextPage = queue.Next(page);
2896
2897		if (page->busy)
2898			continue;
2899
2900		// mark the position
2901		queue.InsertAfter(page, &marker);
2902		queueLocker.Unlock();
2903
2904		// lock the page's cache
2905		VMCache* cache = vm_cache_acquire_locked_page_cache(page, true);
2906		if (cache == NULL || page->busy || page->State() != PAGE_STATE_ACTIVE) {
2907			if (cache != NULL)
2908				cache->ReleaseRefAndUnlock();
2909			queueLocker.Lock();
2910			nextPage = queue.Next(&marker);
2911			queue.Remove(&marker);
2912			continue;
2913		}
2914
2915		pagesScanned++;
2916
2917		DEBUG_PAGE_ACCESS_START(page);
2918
2919		// Get the page active/modified flags and update the page's usage count.
2920		int32 usageCount = vm_clear_page_mapping_accessed_flags(page);
2921
2922		if (usageCount > 0) {
2923			usageCount += page->usage_count + kPageUsageAdvance;
2924			if (usageCount > kPageUsageMax)
2925				usageCount = kPageUsageMax;
2926			pagesAccessed++;
2927// TODO: This would probably also be the place to reclaim swap space.
2928		} else {
2929			usageCount += page->usage_count - (int32)kPageUsageDecline;
2930			if (usageCount <= 0) {
2931				usageCount = 0;
2932				set_page_state(page, PAGE_STATE_INACTIVE);
2933				pagesToInactive++;
2934			}
2935		}
2936
2937		page->usage_count = usageCount;
2938
2939		DEBUG_PAGE_ACCESS_END(page);
2940
2941		cache->ReleaseRefAndUnlock();
2942
2943		// remove the marker
2944		queueLocker.Lock();
2945		nextPage = queue.Next(&marker);
2946		queue.Remove(&marker);
2947	}
2948
2949	time = system_time() - time;
2950	TRACE_DAEMON("  ->   active scan (%7" B_PRId64 " us): scanned: %7" B_PRIu32
2951		", moved: %" B_PRIu32 " -> inactive, encountered %" B_PRIu32 " accessed"
2952		" ones\n", time, pagesScanned, pagesToInactive, pagesAccessed);
2953}
2954
2955
2956static void
2957page_daemon_idle_scan(page_stats& pageStats)
2958{
2959	TRACE_DAEMON("page daemon: idle run\n");
2960
2961	if (pageStats.totalFreePages < (int32)sFreePagesTarget) {
2962		// We want more actually free pages, so free some from the cached
2963		// ones.
2964		uint32 freed = free_cached_pages(
2965			sFreePagesTarget - pageStats.totalFreePages, false);
2966		if (freed > 0)
2967			unreserve_pages(freed);
2968		get_page_stats(pageStats);
2969	}
2970
2971	// Walk the active list and move pages to the inactive queue.
2972	get_page_stats(pageStats);
2973	idle_scan_active_pages(pageStats);
2974}
2975
2976
2977static void
2978page_daemon_full_scan(page_stats& pageStats, int32 despairLevel)
2979{
2980	TRACE_DAEMON("page daemon: full run: free: %" B_PRIu32 ", cached: %"
2981		B_PRIu32 ", to free: %" B_PRIu32 "\n", pageStats.totalFreePages,
2982		pageStats.cachedPages, pageStats.unsatisfiedReservations
2983			+ sFreeOrCachedPagesTarget
2984			- (pageStats.totalFreePages + pageStats.cachedPages));
2985
2986	// Walk the inactive list and transfer pages to the cached and modified
2987	// queues.
2988	full_scan_inactive_pages(pageStats, despairLevel);
2989
2990	// Free cached pages. Also wake up reservation waiters.
2991	get_page_stats(pageStats);
2992	int32 pagesToFree = pageStats.unsatisfiedReservations + sFreePagesTarget
2993		- (pageStats.totalFreePages);
2994	if (pagesToFree > 0) {
2995		uint32 freed = free_cached_pages(pagesToFree, true);
2996		if (freed > 0)
2997			unreserve_pages(freed);
2998	}
2999
3000	// Walk the active list and move pages to the inactive queue.
3001	get_page_stats(pageStats);
3002	full_scan_active_pages(pageStats, despairLevel);
3003}
3004
3005
3006static status_t
3007page_daemon(void* /*unused*/)
3008{
3009	int32 despairLevel = 0;
3010
3011	while (true) {
3012		sPageDaemonCondition.ClearActivated();
3013
3014		// evaluate the free pages situation
3015		page_stats pageStats;
3016		get_page_stats(pageStats);
3017
3018		if (!do_active_paging(pageStats)) {
3019			// Things look good -- just maintain statistics and keep the pool
3020			// of actually free pages full enough.
3021			despairLevel = 0;
3022			page_daemon_idle_scan(pageStats);
3023			sPageDaemonCondition.Wait(kIdleScanWaitInterval, false);
3024		} else {
3025			// Not enough free pages. We need to do some real work.
			despairLevel = std::min(despairLevel + 1, (int32)3);
3027			page_daemon_full_scan(pageStats, despairLevel);
3028
3029			// Don't wait after the first full scan, but rather immediately
3030			// check whether we were successful in freeing enough pages and
3031			// re-run with increased despair level. The first scan is
3032			// conservative with respect to moving inactive modified pages to
3033			// the modified list to avoid thrashing. The second scan, however,
3034			// will not hold back.
3035			if (despairLevel > 1)
3036				snooze(kBusyScanWaitInterval);
3037		}
3038	}
3039
3040	return B_OK;
3041}
3042
3043
3044/*!	Returns how many pages could *not* be reserved.
3045*/
3046static uint32
3047reserve_pages(uint32 count, int priority, bool dontWait)
3048{
3049	int32 dontTouch = kPageReserveForPriority[priority];
3050
3051	while (true) {
3052		count -= reserve_some_pages(count, dontTouch);
3053		if (count == 0)
3054			return 0;
3055
3056		if (sUnsatisfiedPageReservations == 0) {
3057			count -= free_cached_pages(count, dontWait);
3058			if (count == 0)
3059				return count;
3060		}
3061
3062		if (dontWait)
3063			return count;
3064
3065		// we need to wait for pages to become available
3066
3067		MutexLocker pageDeficitLocker(sPageDeficitLock);
3068
3069		bool notifyDaemon = sUnsatisfiedPageReservations == 0;
3070		sUnsatisfiedPageReservations += count;
3071
3072		if (atomic_get(&sUnreservedFreePages) > dontTouch) {
3073			// the situation changed
3074			sUnsatisfiedPageReservations -= count;
3075			continue;
3076		}
3077
3078		PageReservationWaiter waiter;
3079		waiter.dontTouch = dontTouch;
3080		waiter.missing = count;
3081		waiter.thread = thread_get_current_thread();
3082		waiter.threadPriority = waiter.thread->priority;
3083
3084		// insert ordered (i.e. after all waiters with higher or equal priority)
3085		PageReservationWaiter* otherWaiter = NULL;
3086		for (PageReservationWaiterList::Iterator it
3087				= sPageReservationWaiters.GetIterator();
3088			(otherWaiter = it.Next()) != NULL;) {
3089			if (waiter < *otherWaiter)
3090				break;
3091		}
3092
3093		sPageReservationWaiters.InsertBefore(otherWaiter, &waiter);
3094
3095		thread_prepare_to_block(waiter.thread, 0, THREAD_BLOCK_TYPE_OTHER,
3096			"waiting for pages");
3097
3098		if (notifyDaemon)
3099			sPageDaemonCondition.WakeUp();
3100
3101		pageDeficitLocker.Unlock();
3102
3103		low_resource(B_KERNEL_RESOURCE_PAGES, count, B_RELATIVE_TIMEOUT, 0);
3104		thread_block();
3105
3106		pageDeficitLocker.Lock();
3107
3108		return 0;
3109	}
3110}
3111
3112
3113//	#pragma mark - private kernel API
3114
3115
3116/*!	Writes a range of modified pages of a cache to disk.
3117	You need to hold the VMCache lock when calling this function.
3118	Note that the cache lock is released in this function.
3119	\param cache The cache.
3120	\param firstPage Offset (in page size units) of the first page in the range.
3121	\param endPage End offset (in page size units) of the page range. The page
3122		at this offset is not included.
3123*/
3124status_t
3125vm_page_write_modified_page_range(struct VMCache* cache, uint32 firstPage,
3126	uint32 endPage)
3127{
3128	static const int32 kMaxPages = 256;
3129	int32 maxPages = cache->MaxPagesPerWrite();
3130	if (maxPages < 0 || maxPages > kMaxPages)
3131		maxPages = kMaxPages;
3132
3133	const uint32 allocationFlags = HEAP_DONT_WAIT_FOR_MEMORY
3134		| HEAP_DONT_LOCK_KERNEL_SPACE;
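	// Allocate the bookkeeping structures below without waiting for memory
	// and without locking the kernel address space; if that fails, we fall
	// back to the small on-stack pool.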
3135
3136	PageWriteWrapper stackWrappersPool[2];
3137	PageWriteWrapper* stackWrappers[1];
3138	PageWriteWrapper* wrapperPool
3139		= new(malloc_flags(allocationFlags)) PageWriteWrapper[maxPages + 1];
3140	PageWriteWrapper** wrappers
3141		= new(malloc_flags(allocationFlags)) PageWriteWrapper*[maxPages];
3142	if (wrapperPool == NULL || wrappers == NULL) {
3143		// don't fail, just limit our capabilities
3144		delete[] wrapperPool;
3145		delete[] wrappers;
3146		wrapperPool = stackWrappersPool;
3147		wrappers = stackWrappers;
3148		maxPages = 1;
3149	}
3150
3151	int32 nextWrapper = 0;
3152	int32 usedWrappers = 0;
3153
3154	PageWriteTransfer transfer;
3155	bool transferEmpty = true;
3156
3157	VMCachePagesTree::Iterator it
3158		= cache->pages.GetIterator(firstPage, true, true);
3159
3160	while (true) {
3161		vm_page* page = it.Next();
3162		if (page == NULL || page->cache_offset >= endPage) {
3163			if (transferEmpty)
3164				break;
3165
3166			page = NULL;
3167		}
3168
3169		if (page != NULL) {
3170			if (page->busy
3171				|| (page->State() != PAGE_STATE_MODIFIED
3172					&& !vm_test_map_modification(page))) {
3173				page = NULL;
3174			}
3175		}
3176
3177		PageWriteWrapper* wrapper = NULL;
3178		if (page != NULL) {
3179			wrapper = &wrapperPool[nextWrapper++];
3180			if (nextWrapper > maxPages)
3181				nextWrapper = 0;
3182
3183			DEBUG_PAGE_ACCESS_START(page);
3184
3185			wrapper->SetTo(page);
3186
3187			if (transferEmpty || transfer.AddPage(page)) {
3188				if (transferEmpty) {
3189					transfer.SetTo(NULL, page, maxPages);
3190					transferEmpty = false;
3191				}
3192
3193				DEBUG_PAGE_ACCESS_END(page);
3194
3195				wrappers[usedWrappers++] = wrapper;
3196				continue;
3197			}
3198
3199			DEBUG_PAGE_ACCESS_END(page);
3200		}
3201
3202		if (transferEmpty)
3203			continue;
3204
3205		cache->Unlock();
3206		status_t status = transfer.Schedule(0);
3207		cache->Lock();
3208
3209		for (int32 i = 0; i < usedWrappers; i++)
3210			wrappers[i]->Done(status);
3211
3212		usedWrappers = 0;
3213
3214		if (page != NULL) {
3215			transfer.SetTo(NULL, page, maxPages);
3216			wrappers[usedWrappers++] = wrapper;
3217		} else
3218			transferEmpty = true;
3219	}
3220
3221	if (wrapperPool != stackWrappersPool) {
3222		delete[] wrapperPool;
3223		delete[] wrappers;
3224	}
3225
3226	return B_OK;
3227}
3228
3229
3230/*!	You need to hold the VMCache lock when calling this function.
3231	Note that the cache lock is released in this function.
3232*/
3233status_t
3234vm_page_write_modified_pages(VMCache *cache)
3235{
3236	return vm_page_write_modified_page_range(cache, 0,
3237		(cache->virtual_end + B_PAGE_SIZE - 1) >> PAGE_SHIFT);
3238}
3239
3240
3241/*!	Schedules the page writer to write back the specified \a page.
3242	Note, however, that it might not do this immediately, and it can well
3243	take several seconds until the page is actually written out.
3244*/
3245void
3246vm_page_schedule_write_page(vm_page *page)
3247{
3248	PAGE_ASSERT(page, page->State() == PAGE_STATE_MODIFIED);
3249
3250	vm_page_requeue(page, false);
3251
3252	sPageWriterCondition.WakeUp();
3253}
3254
3255
3256/*!	Cache must be locked.
3257*/
3258void
3259vm_page_schedule_write_page_range(struct VMCache *cache, uint32 firstPage,
3260	uint32 endPage)
3261{
3262	uint32 modified = 0;
3263	for (VMCachePagesTree::Iterator it
3264				= cache->pages.GetIterator(firstPage, true, true);
3265			vm_page *page = it.Next();) {
3266		if (page->cache_offset >= endPage)
3267			break;
3268
3269		if (!page->busy && page->State() == PAGE_STATE_MODIFIED) {
3270			DEBUG_PAGE_ACCESS_START(page);
3271			vm_page_requeue(page, false);
3272			modified++;
3273			DEBUG_PAGE_ACCESS_END(page);
3274		}
3275	}
3276
3277	if (modified > 0)
3278		sPageWriterCondition.WakeUp();
3279}
3280
3281
3282void
3283vm_page_init_num_pages(kernel_args *args)
3284{
3285	// calculate the size of memory by looking at the physical_memory_range array
3286	sPhysicalPageOffset = args->physical_memory_range[0].start / B_PAGE_SIZE;
3287	page_num_t physicalPagesEnd = sPhysicalPageOffset
3288		+ args->physical_memory_range[0].size / B_PAGE_SIZE;
3289
3290	sNonExistingPages = 0;
3291	sIgnoredPages = args->ignored_physical_memory / B_PAGE_SIZE;
3292
3293	for (uint32 i = 1; i < args->num_physical_memory_ranges; i++) {
3294		page_num_t start = args->physical_memory_range[i].start / B_PAGE_SIZE;
3295		if (start > physicalPagesEnd)
3296			sNonExistingPages += start - physicalPagesEnd;
3297		physicalPagesEnd = start
3298			+ args->physical_memory_range[i].size / B_PAGE_SIZE;
3299
3300#ifdef LIMIT_AVAILABLE_MEMORY
3301		page_num_t available
3302			= physicalPagesEnd - sPhysicalPageOffset - sNonExistingPages;
3303		if (available > LIMIT_AVAILABLE_MEMORY * (1024 * 1024 / B_PAGE_SIZE)) {
3304			physicalPagesEnd = sPhysicalPageOffset + sNonExistingPages
3305				+ LIMIT_AVAILABLE_MEMORY * (1024 * 1024 / B_PAGE_SIZE);
3306			break;
3307		}
3308#endif
3309	}
3310
3311	TRACE(("first phys page = %#" B_PRIxPHYSADDR ", end %#" B_PRIxPHYSADDR "\n",
3312		sPhysicalPageOffset, physicalPagesEnd));
3313
3314	sNumPages = physicalPagesEnd - sPhysicalPageOffset;
3315}
3316
3317
3318status_t
3319vm_page_init(kernel_args *args)
3320{
3321	TRACE(("vm_page_init: entry\n"));
3322
3323	// init page queues
3324	sModifiedPageQueue.Init("modified pages queue");
3325	sInactivePageQueue.Init("inactive pages queue");
3326	sActivePageQueue.Init("active pages queue");
3327	sCachedPageQueue.Init("cached pages queue");
3328	sFreePageQueue.Init("free pages queue");
3329	sClearPageQueue.Init("clear pages queue");
3330
3331	new (&sPageReservationWaiters) PageReservationWaiterList;
3332
3333	// map in the new free page table
3334	sPages = (vm_page *)vm_allocate_early(args, sNumPages * sizeof(vm_page),
3335		~0L, B_KERNEL_READ_AREA | B_KERNEL_WRITE_AREA, 0);
3336
3337	TRACE(("vm_init: putting free_page_table @ %p, # ents %" B_PRIuPHYSADDR
3338		" (size %#" B_PRIxPHYSADDR ")\n", sPages, sNumPages,
3339		(phys_addr_t)(sNumPages * sizeof(vm_page))));
3340
3341	// initialize the free page table
3342	for (uint32 i = 0; i < sNumPages; i++) {
3343		sPages[i].Init(sPhysicalPageOffset + i);
3344		sFreePageQueue.Append(&sPages[i]);
3345
3346#if VM_PAGE_ALLOCATION_TRACKING_AVAILABLE
3347		sPages[i].allocation_tracking_info.Clear();
3348#endif
3349	}
3350
3351	sUnreservedFreePages = sNumPages;
3352
3353	TRACE(("initialized table\n"));
3354
	// mark the ranges between usable physical memory ranges as unused
3356	phys_addr_t previousEnd = 0;
3357	for (uint32 i = 0; i < args->num_physical_memory_ranges; i++) {
3358		phys_addr_t base = args->physical_memory_range[i].start;
3359		phys_size_t size = args->physical_memory_range[i].size;
3360		if (base > previousEnd) {
3361			mark_page_range_in_use(previousEnd / B_PAGE_SIZE,
3362				(base - previousEnd) / B_PAGE_SIZE, false);
3363		}
3364		previousEnd = base + size;
3365	}
3366
3367	// mark the allocated physical page ranges wired
3368	for (uint32 i = 0; i < args->num_physical_allocated_ranges; i++) {
3369		mark_page_range_in_use(
3370			args->physical_allocated_range[i].start / B_PAGE_SIZE,
3371			args->physical_allocated_range[i].size / B_PAGE_SIZE, true);
3372	}
3373
3374	// The target of actually free pages. This must be at least the system
3375	// reserve, but should be a few more pages, so we don't have to extract
3376	// a cached page with each allocation.
3377	sFreePagesTarget = VM_PAGE_RESERVE_USER
3378		+ std::max((page_num_t)32, (sNumPages - sNonExistingPages) / 1024);
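		// For example, with 1 GiB of RAM and 4 KiB pages (262144 pages,
		// assuming no non-existing pages), this adds 256 pages on top of
		// VM_PAGE_RESERVE_USER.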
3379
3380	// The target of free + cached and inactive pages. On low-memory machines
3381	// keep things tight. free + cached is the pool of immediately allocatable
3382	// pages. We want a few inactive pages, so when we're actually paging, we
3383	// have a reasonably large set of pages to work with.
3384	if (sUnreservedFreePages < 16 * 1024) {
3385		sFreeOrCachedPagesTarget = sFreePagesTarget + 128;
3386		sInactivePagesTarget = sFreePagesTarget / 3;
3387	} else {
3388		sFreeOrCachedPagesTarget = 2 * sFreePagesTarget;
3389		sInactivePagesTarget = sFreePagesTarget / 2;
3390	}
3391
3392	TRACE(("vm_page_init: exit\n"));
3393
3394	return B_OK;
3395}
3396
3397
3398status_t
3399vm_page_init_post_area(kernel_args *args)
3400{
3401	void *dummy;
3402
3403	dummy = sPages;
3404	create_area("page structures", &dummy, B_EXACT_ADDRESS,
3405		PAGE_ALIGN(sNumPages * sizeof(vm_page)), B_ALREADY_WIRED,
3406		B_KERNEL_READ_AREA | B_KERNEL_WRITE_AREA);
3407
3408	add_debugger_command("list_pages", &dump_page_list,
3409		"List physical pages");
3410	add_debugger_command("page_stats", &dump_page_stats,
3411		"Dump statistics about page usage");
3412	add_debugger_command_etc("page", &dump_page_long,
3413		"Dump page info",
3414		"[ \"-p\" | \"-v\" ] [ \"-m\" ] <address>\n"
3415		"Prints information for the physical page. If neither \"-p\" nor\n"
3416		"\"-v\" are given, the provided address is interpreted as address of\n"
3417		"the vm_page data structure for the page in question. If \"-p\" is\n"
3418		"given, the address is the physical address of the page. If \"-v\" is\n"
3419		"given, the address is interpreted as virtual address in the current\n"
3420		"thread's address space and for the page it is mapped to (if any)\n"
3421		"information are printed. If \"-m\" is specified, the command will\n"
3422		"search all known address spaces for mappings to that page and print\n"
3423		"them.\n", 0);
3424	add_debugger_command("page_queue", &dump_page_queue, "Dump page queue");
3425	add_debugger_command("find_page", &find_page,
3426		"Find out which queue a page is actually in");
3427
3428#ifdef TRACK_PAGE_USAGE_STATS
3429	add_debugger_command_etc("page_usage", &dump_page_usage_stats,
3430		"Dumps statistics about page usage counts",
3431		"\n"
3432		"Dumps statistics about page usage counts.\n",
3433		B_KDEBUG_DONT_PARSE_ARGUMENTS);
3434#endif
3435
3436#if VM_PAGE_ALLOCATION_TRACKING_AVAILABLE
3437	add_debugger_command_etc("page_allocations_per_caller",
3438		&dump_page_allocations_per_caller,
3439		"Dump current page allocations summed up per caller",
3440		"[ -d <caller> ] [ -r ]\n"
3441		"The current allocations will by summed up by caller (their count)\n"
3442		"printed in decreasing order by count.\n"
3443		"If \"-d\" is given, each allocation for caller <caller> is printed\n"
3444		"including the respective stack trace.\n"
3445		"If \"-r\" is given, the allocation infos are reset after gathering\n"
3446		"the information, so the next command invocation will only show the\n"
3447		"allocations made after the reset.\n", 0);
3448	add_debugger_command_etc("page_allocation_infos",
3449		&dump_page_allocation_infos,
3450		"Dump current page allocations",
3451		"[ --stacktrace ] [ -p <page number> ] [ --team <team ID> ] "
3452		"[ --thread <thread ID> ]\n"
3453		"The current allocations filtered by optional values will be printed.\n"
3454		"The optional \"-p\" page number filters for a specific page,\n"
3455		"with \"--team\" and \"--thread\" allocations by specific teams\n"
3456		"and/or threads can be filtered (these only work if a corresponding\n"
3457		"tracing entry is still available).\n"
3458		"If \"--stacktrace\" is given, then stack traces of the allocation\n"
3459		"callers are printed, where available\n", 0);
3460#endif
3461
3462	return B_OK;
3463}
3464
3465
3466status_t
3467vm_page_init_post_thread(kernel_args *args)
3468{
3469	new (&sFreePageCondition) ConditionVariable;
3470
3471	// create a kernel thread to clear out pages
3472
3473	thread_id thread = spawn_kernel_thread(&page_scrubber, "page scrubber",
3474		B_LOWEST_ACTIVE_PRIORITY, NULL);
3475	resume_thread(thread);
3476
3477	// start page writer
3478
3479	sPageWriterCondition.Init("page writer");
3480
3481	thread = spawn_kernel_thread(&page_writer, "page writer",
3482		B_NORMAL_PRIORITY + 1, NULL);
3483	resume_thread(thread);
3484
3485	// start page daemon
3486
3487	sPageDaemonCondition.Init("page daemon");
3488
3489	thread = spawn_kernel_thread(&page_daemon, "page daemon",
3490		B_NORMAL_PRIORITY, NULL);
3491	resume_thread(thread);
3492
3493	return B_OK;
3494}
3495
3496
3497status_t
3498vm_mark_page_inuse(page_num_t page)
3499{
3500	return vm_mark_page_range_inuse(page, 1);
3501}
3502
3503
3504status_t
3505vm_mark_page_range_inuse(page_num_t startPage, page_num_t length)
3506{
3507	return mark_page_range_in_use(startPage, length, false);
3508}
3509
3510
3511/*!	Unreserve pages previously reserved with vm_page_reserve_pages().
3512*/
3513void
3514vm_page_unreserve_pages(vm_page_reservation* reservation)
3515{
3516	uint32 count = reservation->count;
3517	reservation->count = 0;
3518
3519	if (count == 0)
3520		return;
3521
3522	TA(UnreservePages(count));
3523
3524	unreserve_pages(count);
3525}
3526
3527
3528/*!	With this call, you can reserve a number of free pages in the system.
3529	They will only be handed out to someone who has actually reserved them.
3530	This call returns as soon as the number of requested pages has been
3531	reached.
3532	The caller must not hold any cache lock or the function might deadlock.
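	A minimal usage sketch (hypothetical caller; count, priority, and page
	state are arbitrary examples):
	\code
	vm_page_reservation reservation;
	vm_page_reserve_pages(&reservation, 2, VM_PRIORITY_USER);
	vm_page* page = vm_page_allocate_page(&reservation,
		PAGE_STATE_ACTIVE | VM_PAGE_ALLOC_CLEAR);
	// ... map and use the page ...
	vm_page_unreserve_pages(&reservation);
		// releases the one reserved page that was not allocated
	\endcode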
3533*/
3534void
3535vm_page_reserve_pages(vm_page_reservation* reservation, uint32 count,
3536	int priority)
3537{
3538	reservation->count = count;
3539
3540	if (count == 0)
3541		return;
3542
3543	TA(ReservePages(count));
3544
3545	reserve_pages(count, priority, false);
3546}
3547
3548
3549bool
3550vm_page_try_reserve_pages(vm_page_reservation* reservation, uint32 count,
3551	int priority)
3552{
3553	if (count == 0) {
3554		reservation->count = count;
3555		return true;
3556	}
3557
3558	uint32 remaining = reserve_pages(count, priority, true);
3559	if (remaining == 0) {
3560		TA(ReservePages(count));
3561		reservation->count = count;
3562		return true;
3563	}
3564
3565	unreserve_pages(count - remaining);
3566
3567	return false;
3568}
3569
3570
3571vm_page *
3572vm_page_allocate_page(vm_page_reservation* reservation, uint32 flags)
3573{
3574	uint32 pageState = flags & VM_PAGE_ALLOC_STATE;
3575	ASSERT(pageState != PAGE_STATE_FREE);
3576	ASSERT(pageState != PAGE_STATE_CLEAR);
3577
3578	ASSERT(reservation->count > 0);
3579	reservation->count--;
3580
3581	VMPageQueue* queue;
3582	VMPageQueue* otherQueue;
3583
3584	if ((flags & VM_PAGE_ALLOC_CLEAR) != 0) {
3585		queue = &sClearPageQueue;
3586		otherQueue = &sFreePageQueue;
3587	} else {
3588		queue = &sFreePageQueue;
3589		otherQueue = &sClearPageQueue;
3590	}
3591
3592	ReadLocker locker(sFreePageQueuesLock);
3593
3594	vm_page* page = queue->RemoveHeadUnlocked();
3595	if (page == NULL) {
3596		// if the primary queue was empty, grab the page from the
3597		// secondary queue
3598		page = otherQueue->RemoveHeadUnlocked();
3599
3600		if (page == NULL) {
3601			// Unlikely, but possible: the page we have reserved has moved
3602			// between the queues after we checked the first queue. Grab the
3603			// write locker to make sure this doesn't happen again.
3604			locker.Unlock();
3605			WriteLocker writeLocker(sFreePageQueuesLock);
3606
3607			page = queue->RemoveHead();
3608			if (page == NULL)
				page = otherQueue->RemoveHead();
3610
3611			if (page == NULL) {
3612				panic("Had reserved page, but there is none!");
3613				return NULL;
3614			}
3615
3616			// downgrade to read lock
3617			locker.Lock();
3618		}
3619	}
3620
3621	if (page->CacheRef() != NULL)
3622		panic("supposed to be free page %p has cache\n", page);
3623
3624	DEBUG_PAGE_ACCESS_START(page);
3625
3626	int oldPageState = page->State();
3627	page->SetState(pageState);
3628	page->busy = (flags & VM_PAGE_ALLOC_BUSY) != 0;
3629	page->usage_count = 0;
3630	page->accessed = false;
3631	page->modified = false;
3632
3633	locker.Unlock();
3634
3635	if (pageState < PAGE_STATE_FIRST_UNQUEUED)
3636		sPageQueues[pageState].AppendUnlocked(page);
3637
3638	// clear the page, if we had to take it from the free queue and a clear
3639	// page was requested
3640	if ((flags & VM_PAGE_ALLOC_CLEAR) != 0 && oldPageState != PAGE_STATE_CLEAR)
3641		clear_page(page);
3642
3643#if VM_PAGE_ALLOCATION_TRACKING_AVAILABLE
3644	page->allocation_tracking_info.Init(
3645		TA(AllocatePage(page->physical_page_number)));
3646#else
3647	TA(AllocatePage(page->physical_page_number));
3648#endif
3649
3650	return page;
3651}
3652
3653
3654static void
3655allocate_page_run_cleanup(VMPageQueue::PageList& freePages,
3656	VMPageQueue::PageList& clearPages)
3657{
3658	// Page lists are sorted, so remove tails before prepending to the respective queue.
3659
3660	while (vm_page* page = freePages.RemoveTail()) {
3661		page->busy = false;
3662		page->SetState(PAGE_STATE_FREE);
3663		DEBUG_PAGE_ACCESS_END(page);
3664		sFreePageQueue.PrependUnlocked(page);
3665	}
3666
3667	while (vm_page* page = clearPages.RemoveTail()) {
3668		page->busy = false;
3669		page->SetState(PAGE_STATE_CLEAR);
3670		DEBUG_PAGE_ACCESS_END(page);
3671		sClearPageQueue.PrependUnlocked(page);
3672	}
3673
3674	sFreePageCondition.NotifyAll();
3675}
3676
3677
/*!	Tries to allocate a contiguous run of \a length pages starting at
3679	index \a start.
3680
3681	The caller must have write-locked the free/clear page queues. The function
3682	will unlock regardless of whether it succeeds or fails.
3683
3684	If the function fails, it cleans up after itself, i.e. it will free all
3685	pages it managed to allocate.
3686
3687	\param start The start index (into \c sPages) of the run.
3688	\param length The number of pages to allocate.
3689	\param flags Page allocation flags. Encodes the state the function shall
3690		set the allocated pages to, whether the pages shall be marked busy
3691		(VM_PAGE_ALLOC_BUSY), and whether the pages shall be cleared
3692		(VM_PAGE_ALLOC_CLEAR).
	\param freeClearQueueLocker WriteLocker for the free/clear page queues,
		in locked state. Will be unlocked by the function.
3695	\return The index of the first page that could not be allocated. \a length
3696		is returned when the function was successful.
3697*/
3698static page_num_t
3699allocate_page_run(page_num_t start, page_num_t length, uint32 flags,
3700	WriteLocker& freeClearQueueLocker)
3701{
3702	uint32 pageState = flags & VM_PAGE_ALLOC_STATE;
3703	ASSERT(pageState != PAGE_STATE_FREE);
3704	ASSERT(pageState != PAGE_STATE_CLEAR);
3705	ASSERT(start + length <= sNumPages);
3706
3707	// Pull the free/clear pages out of their respective queues. Cached pages
3708	// are allocated later.
3709	page_num_t cachedPages = 0;
3710	VMPageQueue::PageList freePages;
3711	VMPageQueue::PageList clearPages;
3712	page_num_t i = 0;
3713	for (; i < length; i++) {
3714		bool pageAllocated = true;
3715		bool noPage = false;
3716		vm_page& page = sPages[start + i];
3717		switch (page.State()) {
3718			case PAGE_STATE_CLEAR:
3719				DEBUG_PAGE_ACCESS_START(&page);
3720				sClearPageQueue.Remove(&page);
3721				clearPages.Add(&page);
3722				break;
3723			case PAGE_STATE_FREE:
3724				DEBUG_PAGE_ACCESS_START(&page);
3725				sFreePageQueue.Remove(&page);
3726				freePages.Add(&page);
3727				break;
3728			case PAGE_STATE_CACHED:
3729				// We allocate cached pages later.
3730				cachedPages++;
3731				pageAllocated = false;
3732				break;
3733
3734			default:
3735				// Probably a page was cached when our caller checked. Now it's
3736				// gone and we have to abort.
3737				noPage = true;
3738				break;
3739		}
3740
3741		if (noPage)
3742			break;
3743
3744		if (pageAllocated) {
3745			page.SetState(flags & VM_PAGE_ALLOC_STATE);
3746			page.busy = (flags & VM_PAGE_ALLOC_BUSY) != 0;
3747			page.usage_count = 0;
3748			page.accessed = false;
3749			page.modified = false;
3750		}
3751	}
3752
3753	if (i < length) {
3754		// failed to allocate a page -- free all that we've got
3755		allocate_page_run_cleanup(freePages, clearPages);
3756		return i;
3757	}
3758
3759	freeClearQueueLocker.Unlock();
3760
3761	if (cachedPages > 0) {
3762		// allocate the pages that weren't free but cached
3763		page_num_t freedCachedPages = 0;
3764		page_num_t nextIndex = start;
3765		vm_page* freePage = freePages.Head();
3766		vm_page* clearPage = clearPages.Head();
3767		while (cachedPages > 0) {
3768			// skip, if we've already got the page
3769			if (freePage != NULL && size_t(freePage - sPages) == nextIndex) {
3770				freePage = freePages.GetNext(freePage);
3771				nextIndex++;
3772				continue;
3773			}
3774			if (clearPage != NULL && size_t(clearPage - sPages) == nextIndex) {
3775				clearPage = clearPages.GetNext(clearPage);
3776				nextIndex++;
3777				continue;
3778			}
3779
3780			// free the page, if it is still cached
3781			vm_page& page = sPages[nextIndex];
3782			if (!free_cached_page(&page, false)) {
3783				// TODO: if the page turns out to have been freed already,
3784				// there would be no need to fail
3785				break;
3786			}
3787
3788			page.SetState(flags & VM_PAGE_ALLOC_STATE);
3789			page.busy = (flags & VM_PAGE_ALLOC_BUSY) != 0;
3790			page.usage_count = 0;
3791			page.accessed = false;
3792			page.modified = false;
3793
3794			freePages.InsertBefore(freePage, &page);
3795			freedCachedPages++;
3796			cachedPages--;
3797			nextIndex++;
3798		}
3799
3800		// If we have freed cached pages, we need to balance things.
3801		if (freedCachedPages > 0)
3802			unreserve_pages(freedCachedPages);
3803
3804		if (nextIndex - start < length) {
3805			// failed to allocate all cached pages -- free all that we've got
3806			freeClearQueueLocker.Lock();
3807			allocate_page_run_cleanup(freePages, clearPages);
3808			freeClearQueueLocker.Unlock();
3809
3810			return nextIndex - start;
3811		}
3812	}
3813
3814	// clear pages, if requested
3815	if ((flags & VM_PAGE_ALLOC_CLEAR) != 0) {
3816		for (VMPageQueue::PageList::Iterator it = freePages.GetIterator();
3817				vm_page* page = it.Next();) {
3818			clear_page(page);
3819		}
3820	}
3821
3822	// add pages to target queue
3823	if (pageState < PAGE_STATE_FIRST_UNQUEUED) {
3824		freePages.MoveFrom(&clearPages);
3825		sPageQueues[pageState].AppendUnlocked(freePages, length);
3826	}
3827
3828	// Note: We don't unreserve the pages since we pulled them out of the
3829	// free/clear queues without adjusting sUnreservedFreePages.
3830
3831#if VM_PAGE_ALLOCATION_TRACKING_AVAILABLE
3832	AbstractTraceEntryWithStackTrace* traceEntry
3833		= TA(AllocatePageRun(start, length));
3834
3835	for (page_num_t i = start; i < start + length; i++)
3836		sPages[i].allocation_tracking_info.Init(traceEntry);
3837#else
3838	TA(AllocatePageRun(start, length));
3839#endif
3840
3841	return length;
3842}
3843
3844
3845/*! Allocate a physically contiguous range of pages.
3846
3847	\param flags Page allocation flags. Encodes the state the function shall
3848		set the allocated pages to, whether the pages shall be marked busy
3849		(VM_PAGE_ALLOC_BUSY), and whether the pages shall be cleared
3850		(VM_PAGE_ALLOC_CLEAR).
3851	\param length The number of contiguous pages to allocate.
3852	\param restrictions Restrictions to the physical addresses of the page run
3853		to allocate, including \c low_address, the first acceptable physical
3854		address where the page run may start, \c high_address, the last
3855		acceptable physical address where the page run may end (i.e. it must
3856		hold \code runStartAddress + length <= high_address \endcode),
3857		\c alignment, the alignment of the page run start address, and
3858		\c boundary, multiples of which the page run must not cross.
3859		Values set to \c 0 are ignored.
3860	\param priority The page reservation priority (as passed to
3861		vm_page_reserve_pages()).
3862	\return The first page of the allocated page run on success; \c NULL
3863		when the allocation failed.
3864*/
3865vm_page*
3866vm_page_allocate_page_run(uint32 flags, page_num_t length,
3867	const physical_address_restrictions* restrictions, int priority)
3868{
3869	// compute start and end page index
3870	page_num_t requestedStart
3871		= std::max(restrictions->low_address / B_PAGE_SIZE, sPhysicalPageOffset)
3872			- sPhysicalPageOffset;
3873	page_num_t start = requestedStart;
3874	page_num_t end;
3875	if (restrictions->high_address > 0) {
3876		end = std::max(restrictions->high_address / B_PAGE_SIZE,
3877				sPhysicalPageOffset)
3878			- sPhysicalPageOffset;
3879		end = std::min(end, sNumPages);
3880	} else
3881		end = sNumPages;
3882
3883	// compute alignment mask
3884	page_num_t alignmentMask
3885		= std::max(restrictions->alignment / B_PAGE_SIZE, (phys_addr_t)1) - 1;
3886	ASSERT(((alignmentMask + 1) & alignmentMask) == 0);
3887		// alignment must be a power of 2
3888
	// compute the boundary mask
	page_num_t boundaryMask = 0;
	if (restrictions->boundary != 0) {
		page_num_t boundary = restrictions->boundary / B_PAGE_SIZE;
		// boundary must be a power of two and not less than alignment and
		// length
		ASSERT(((boundary - 1) & boundary) == 0);
		ASSERT(boundary >= alignmentMask + 1);
		ASSERT(boundary >= length);

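		// Since boundary is a power of two, -boundary == ~(boundary - 1):
		// ANDing a physical page number with this mask rounds it down to the
		// start of its boundary-sized block. Below, XORing the first and last
		// page of a candidate run and masking the result tells us whether the
		// run would cross such a block boundary.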
		boundaryMask = -boundary;
	}

	vm_page_reservation reservation;
	vm_page_reserve_pages(&reservation, length, priority);

	WriteLocker freeClearQueueLocker(sFreePageQueuesLock);

	// First we try to get a run with free pages only. If that fails, we also
	// consider cached pages. If there are only a few free pages and many
	// cached ones, the odds are that we won't find enough contiguous ones, so
	// we skip the first iteration in this case.
	int32 freePages = sUnreservedFreePages;
	int useCached = freePages > 0 && (page_num_t)freePages > 2 * length ? 0 : 1;

	for (;;) {
		if (alignmentMask != 0 || boundaryMask != 0) {
			page_num_t offsetStart = start + sPhysicalPageOffset;

			// enforce alignment
			if ((offsetStart & alignmentMask) != 0)
				offsetStart = (offsetStart + alignmentMask) & ~alignmentMask;

			// enforce boundary
			if (boundaryMask != 0 && ((offsetStart ^ (offsetStart
				+ length - 1)) & boundaryMask) != 0) {
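				// The run would cross a boundary. Rounding its last page down
				// to the boundary yields the single crossed boundary (there
				// can only be one, since boundary >= length), which becomes
				// the new start; the shifted run then lies entirely within
				// one boundary-sized block.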
				offsetStart = (offsetStart + length - 1) & boundaryMask;
			}

			start = offsetStart - sPhysicalPageOffset;
		}

		if (start + length > end) {
			if (useCached == 0) {
				// The first iteration with free pages only was unsuccessful.
				// Try again also considering cached pages.
				useCached = 1;
				start = requestedStart;
				continue;
			}

			dprintf("vm_page_allocate_page_run(): Failed to allocate run of "
				"length %" B_PRIuPHYSADDR " (%" B_PRIuPHYSADDR " %"
				B_PRIuPHYSADDR ") in second iteration (align: %" B_PRIuPHYSADDR
				" boundary: %" B_PRIuPHYSADDR ")!\n", length, requestedStart,
				end, restrictions->alignment, restrictions->boundary);

			freeClearQueueLocker.Unlock();
			vm_page_unreserve_pages(&reservation);
			return NULL;
		}

		bool foundRun = true;
		page_num_t i;
		for (i = 0; i < length; i++) {
			uint32 pageState = sPages[start + i].State();
			if (pageState != PAGE_STATE_FREE
				&& pageState != PAGE_STATE_CLEAR
				&& (pageState != PAGE_STATE_CACHED || useCached == 0)) {
				foundRun = false;
				break;
			}
		}

		if (foundRun) {
			i = allocate_page_run(start, length, flags, freeClearQueueLocker);
			if (i == length)
				return &sPages[start];

			// apparently a cached page couldn't be allocated -- skip it and
			// continue
			freeClearQueueLocker.Lock();
		}

		start += i + 1;
	}
}


vm_page *
vm_page_at_index(int32 index)
{
	return &sPages[index];
}


vm_page *
vm_lookup_page(page_num_t pageNumber)
{
	if (pageNumber < sPhysicalPageOffset)
		return NULL;

	pageNumber -= sPhysicalPageOffset;
	if (pageNumber >= sNumPages)
		return NULL;

	return &sPages[pageNumber];
}


bool
vm_page_is_dummy(struct vm_page *page)
{
	return page < sPages || page >= sPages + sNumPages;
}


/*!	Free the page that belonged to a certain cache.
	You can use vm_page_set_state() manually if you prefer, but only
	if the page's state is not PAGE_STATE_MODIFIED.

	\param cache The cache the page was previously owned by or NULL. The page
		must have been removed from its cache before calling this method in
		either case.
	\param page The page to free.
	\param reservation If not NULL, the page count of the reservation will be
		incremented, allowing another page to be allocated in place of the
		freed one at a later time.
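
	An illustrative sketch of a typical caller (the cache, page, and
	reservation names are hypothetical caller state):
	\code
	// Detach the page from its cache, then return it to the free pool while
	// crediting the caller's reservation, so another page can be allocated
	// in its place later.
	cache->RemovePage(page);
	vm_page_free_etc(cache, page, &reservation);
	\endcode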
*/
void
vm_page_free_etc(VMCache* cache, vm_page* page,
	vm_page_reservation* reservation)
{
	PAGE_ASSERT(page, page->State() != PAGE_STATE_FREE
		&& page->State() != PAGE_STATE_CLEAR);

	if (page->State() == PAGE_STATE_MODIFIED && cache != NULL
		&& cache->temporary) {
		atomic_add(&sModifiedTemporaryPages, -1);
	}

	free_page(page, false);
	if (reservation == NULL)
		unreserve_pages(1);
}


void
vm_page_set_state(vm_page *page, int pageState)
{
	PAGE_ASSERT(page, page->State() != PAGE_STATE_FREE
		&& page->State() != PAGE_STATE_CLEAR);

	if (pageState == PAGE_STATE_FREE || pageState == PAGE_STATE_CLEAR) {
		free_page(page, pageState == PAGE_STATE_CLEAR);
		unreserve_pages(1);
	} else
		set_page_state(page, pageState);
}


/*!	Moves a page to either the tail or the head of its current queue,
	depending on \a tail.
	The page must have a cache and the cache must be locked!
*/
void
vm_page_requeue(struct vm_page *page, bool tail)
{
	PAGE_ASSERT(page, page->Cache() != NULL);
	page->Cache()->AssertLocked();
	// DEBUG_PAGE_ACCESS_CHECK(page);
		// TODO: This assertion cannot be satisfied by idle_scan_active_pages()
		// when it requeues busy pages. The reason is that vm_soft_fault()
		// (respectively fault_get_page()) and the file cache keep newly
		// allocated pages accessed while they are reading them from disk. It
		// would probably be better to change that code and reenable this
		// check.

	VMPageQueue *queue = NULL;

	switch (page->State()) {
		case PAGE_STATE_ACTIVE:
			queue = &sActivePageQueue;
			break;
		case PAGE_STATE_INACTIVE:
			queue = &sInactivePageQueue;
			break;
		case PAGE_STATE_MODIFIED:
			queue = &sModifiedPageQueue;
			break;
		case PAGE_STATE_CACHED:
			queue = &sCachedPageQueue;
			break;
		case PAGE_STATE_FREE:
		case PAGE_STATE_CLEAR:
			panic("vm_page_requeue() called for free/clear page %p", page);
			return;
		case PAGE_STATE_WIRED:
		case PAGE_STATE_UNUSED:
			return;
		default:
			panic("vm_page_requeue: vm_page %p in invalid state %d\n",
				page, page->State());
			break;
	}

	queue->RequeueUnlocked(page, tail);
}


page_num_t
vm_page_num_pages(void)
{
	return sNumPages - sNonExistingPages;
}


/*!	There is a subtle distinction between the page counts returned by
	this function and vm_page_num_free_pages():
	The latter returns the number of pages that are completely uncommitted,
	whereas this one also counts pages that can be made available by being
	reclaimed (in other words, it factors in things like cache pages as
	available).
*/
page_num_t
vm_page_num_available_pages(void)
{
	return vm_available_memory() / B_PAGE_SIZE;
}


page_num_t
vm_page_num_free_pages(void)
{
	int32 count = sUnreservedFreePages + sCachedPageQueue.Count();
	return count > 0 ? count : 0;
}


page_num_t
vm_page_num_unused_pages(void)
{
	int32 count = sUnreservedFreePages;
	return count > 0 ? count : 0;
}


void
vm_page_get_stats(system_info *info)
{
	// Note: there's no locking protecting any of the queues or counters here,
	// so we run the risk of getting bogus values when evaluating them
	// throughout this function. As these stats are for informational purposes
	// only, it is not really worth introducing such locking. Therefore we just
	// ensure that we don't under- or overflow any of the values.

	// The pages used for the block cache buffers. Those should not be counted
	// as used but as cached pages.
	// TODO: We should subtract the blocks that are in use ATM, since those
	// can't really be freed in a low memory situation.
	page_num_t blockCachePages = block_cache_used_memory() / B_PAGE_SIZE;
	info->block_cache_pages = blockCachePages;

	// Non-temporary modified pages are special: they represent pages that can
	// be written back and thus freed if necessary, which for our purposes
	// makes them cached pages with a higher overhead. The modified queue
	// count is therefore split into temporary and non-temporary counts, and
	// the non-temporary count is added to the cached page count.
	page_num_t modifiedNonTemporaryPages
		= (sModifiedPageQueue.Count() - sModifiedTemporaryPages);

	info->max_pages = vm_page_num_pages();
	info->cached_pages = sCachedPageQueue.Count() + modifiedNonTemporaryPages
		+ blockCachePages;

	// max_pages is composed of:
	//	active + inactive + unused + wired + modified + cached + free + clear
	// So taking out the cached (including modified non-temporary), free and
	// clear ones leaves us with all used pages.
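	// (Illustrative numbers only: with max_pages == 1000, cached_pages == 300
	// and 200 free/clear pages, used_pages comes out as 1000 - 500 = 500.)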
	uint32 subtractPages = info->cached_pages + sFreePageQueue.Count()
		+ sClearPageQueue.Count();
	info->used_pages = subtractPages > info->max_pages
		? 0 : info->max_pages - subtractPages;

	if (info->used_pages + info->cached_pages > info->max_pages) {
		// Something was shuffled around while we were summing up the counts.
		// Make the values sane, preferring the worse case of more used pages.
		info->cached_pages = info->max_pages - info->used_pages;
	}

	info->page_faults = vm_num_page_faults();
	info->ignored_pages = sIgnoredPages;

	// TODO: We don't consider pages used for page directories/tables yet.
}


/*!	Returns the greatest address within the last page of accessible physical
	memory.
	The value is inclusive, i.e. in case of a 32 bit phys_addr_t 0xffffffff
	means that the last page ends at exactly 4 GB.
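
	For example (illustrative numbers only): with sPhysicalPageOffset == 0 and
	sNumPages == 0x100000 (4 GB worth of 4 KiB pages), the function returns
	0x100000 * 0x1000 - 1 == 0xffffffff.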
*/
phys_addr_t
vm_page_max_address()
{
	return ((phys_addr_t)sPhysicalPageOffset + sNumPages) * B_PAGE_SIZE - 1;
}


RANGE_MARKER_FUNCTION_END(vm_page)
