/*
 * Copyright 2008, Zhao Shuai, upczhsh@163.com.
 * Copyright 2008-2011, Ingo Weinhold, ingo_weinhold@gmx.de.
 * Copyright 2002-2009, Axel Dörfler, axeld@pinc-software.de.
 * Distributed under the terms of the MIT License.
 *
 * Copyright 2001-2002, Travis Geiselbrecht. All rights reserved.
 * Distributed under the terms of the NewOS License.
 *
 * Copyright 2011-2012 Haiku, Inc. All rights reserved.
 * Distributed under the terms of the MIT License.
 *
 * Authors:
 *		Hamish Morrison, hamish@lavabit.com
 *		Alexander von Gluck IV, kallisti5@unixzen.com
 */


#include "VMAnonymousCache.h"

#include <errno.h>
#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include <FindDirectory.h>
#include <KernelExport.h>
#include <NodeMonitor.h>

#include <arch_config.h>
#include <boot_device.h>
#include <disk_device_manager/KDiskDevice.h>
#include <disk_device_manager/KDiskDeviceManager.h>
#include <disk_device_manager/KDiskSystem.h>
#include <disk_device_manager/KPartitionVisitor.h>
#include <driver_settings.h>
#include <fs/fd.h>
#include <fs/KPath.h>
#include <fs_info.h>
#include <fs_interface.h>
#include <heap.h>
#include <kernel_daemon.h>
#include <slab/Slab.h>
#include <syscalls.h>
#include <system_info.h>
#include <thread.h>
#include <tracing.h>
#include <util/AutoLock.h>
#include <util/Bitmap.h>
#include <util/DoublyLinkedList.h>
#include <util/OpenHashTable.h>
#include <util/RadixBitmap.h>
#include <vfs.h>
#include <vm/vm.h>
#include <vm/vm_page.h>
#include <vm/vm_priv.h>
#include <vm/VMAddressSpace.h>

#include "IORequest.h"
#include "VMUtils.h"


#if	ENABLE_SWAP_SUPPORT

//#define TRACE_VM_ANONYMOUS_CACHE
#ifdef TRACE_VM_ANONYMOUS_CACHE
#	define TRACE(x...) dprintf(x)
#else
#	define TRACE(x...) do { } while (false)
#endif


// minimum number of free swap blocks the object cache shall have in reserve
#define MIN_SWAP_BLOCK_RESERVE	4096

// interval at which the hash resizer is triggered (in 0.1s units)
#define SWAP_HASH_RESIZE_INTERVAL	5

#define INITIAL_SWAP_HASH_SIZE		1024

#define SWAP_SLOT_NONE	RADIX_SLOT_NONE

#define SWAP_BLOCK_PAGES 32
#define SWAP_BLOCK_SHIFT 5		/* 1 << SWAP_BLOCK_SHIFT == SWAP_BLOCK_PAGES */
#define SWAP_BLOCK_MASK  (SWAP_BLOCK_PAGES - 1)

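// Swap offsets are managed in blocks of SWAP_BLOCK_PAGES pages. For example,
// page index 37 belongs to the swap block with base page index 32
// (37 & ~(off_t)SWAP_BLOCK_MASK) and occupies entry 5 within that block
// (37 & SWAP_BLOCK_MASK), since 37 = 32 + 5 and SWAP_BLOCK_PAGES is 32.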

static const char* const kDefaultSwapPath = "/var/swap";

struct swap_file : DoublyLinkedListLinkImpl<swap_file> {
	int				fd;
	struct vnode*	vnode;
	void*			cookie;
	swap_addr_t		first_slot;
	swap_addr_t		last_slot;
	radix_bitmap*	bmp;
};

struct swap_hash_key {
	VMAnonymousCache	*cache;
	off_t				page_index;  // page index in the cache
};

// Each swap block contains swap address information for
// SWAP_BLOCK_PAGES contiguous pages from the same cache.
struct swap_block {
	swap_block*		hash_link;
	swap_hash_key	key;
	uint32			used;
	swap_addr_t		swap_slots[SWAP_BLOCK_PAGES];
};

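// The hash key of a swap block combines the owning cache with the block's
// base page index (page_index >> SWAP_BLOCK_SHIFT), and Compare() masks the
// page index to the block base, so a lookup with any page index covered by
// a block finds that block's single swap_block entry.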
struct SwapHashTableDefinition {
	typedef swap_hash_key KeyType;
	typedef swap_block ValueType;

	SwapHashTableDefinition() {}

	size_t HashKey(const swap_hash_key& key) const
	{
		off_t blockIndex = key.page_index >> SWAP_BLOCK_SHIFT;
		VMAnonymousCache* cache = key.cache;
		return blockIndex ^ (size_t)(int*)cache;
	}

	size_t Hash(const swap_block* value) const
	{
		return HashKey(value->key);
	}

	bool Compare(const swap_hash_key& key, const swap_block* value) const
	{
		return (key.page_index & ~(off_t)SWAP_BLOCK_MASK)
				== (value->key.page_index & ~(off_t)SWAP_BLOCK_MASK)
			&& key.cache == value->key.cache;
	}

	swap_block*& GetLink(swap_block* value) const
	{
		return value->hash_link;
	}
};

typedef BOpenHashTable<SwapHashTableDefinition> SwapHashTable;
typedef DoublyLinkedList<swap_file> SwapFileList;

static SwapHashTable sSwapHashTable;
static rw_lock sSwapHashLock;

static SwapFileList sSwapFileList;
static mutex sSwapFileListLock;
static swap_file* sSwapFileAlloc = NULL; // allocate from here
static uint32 sSwapFileCount = 0;

static off_t sAvailSwapSpace = 0;
static mutex sAvailSwapSpaceLock;

static object_cache* sSwapBlockCache;


#if SWAP_TRACING
namespace SwapTracing {

class SwapTraceEntry : public AbstractTraceEntry {
public:
	SwapTraceEntry(VMAnonymousCache* cache)
		:
		fCache(cache)
	{
	}

protected:
	VMAnonymousCache*	fCache;
};


class ReadPage : public SwapTraceEntry {
public:
	ReadPage(VMAnonymousCache* cache, page_num_t pageIndex,
		swap_addr_t swapSlotIndex)
		:
		SwapTraceEntry(cache),
		fPageIndex(pageIndex),
		fSwapSlotIndex(swapSlotIndex)
	{
		Initialized();
	}

	virtual void AddDump(TraceOutput& out)
	{
		out.Print("swap read:  cache %p, page index: %lu <- swap slot: %lu",
			fCache, fPageIndex, fSwapSlotIndex);
	}

private:
	page_num_t		fPageIndex;
	swap_addr_t		fSwapSlotIndex;
};


class WritePage : public SwapTraceEntry {
public:
	WritePage(VMAnonymousCache* cache, page_num_t pageIndex,
		swap_addr_t swapSlotIndex)
		:
		SwapTraceEntry(cache),
		fPageIndex(pageIndex),
		fSwapSlotIndex(swapSlotIndex)
	{
		Initialized();
	}

	virtual void AddDump(TraceOutput& out)
	{
		out.Print("swap write: cache %p, page index: %lu -> swap slot: %lu",
			fCache, fPageIndex, fSwapSlotIndex);
	}

private:
	page_num_t		fPageIndex;
	swap_addr_t		fSwapSlotIndex;
};

}	// namespace SwapTracing

#	define T(x) new(std::nothrow) SwapTracing::x;
#else
#	define T(x) ;
#endif


static int
dump_swap_info(int argc, char** argv)
{
	swap_addr_t totalSwapPages = 0;
	swap_addr_t freeSwapPages = 0;

	kprintf("swap files:\n");

	for (SwapFileList::Iterator it = sSwapFileList.GetIterator();
		swap_file* file = it.Next();) {
		swap_addr_t total = file->last_slot - file->first_slot;
		kprintf("  vnode: %p, pages: total: %" B_PRIu32 ", free: %" B_PRIu32
			"\n", file->vnode, total, file->bmp->free_slots);

		totalSwapPages += total;
		freeSwapPages += file->bmp->free_slots;
	}

	kprintf("\n");
	kprintf("swap space in pages:\n");
	kprintf("total:     %9" B_PRIu32 "\n", totalSwapPages);
	kprintf("available: %9" B_PRIdOFF "\n", sAvailSwapSpace / B_PAGE_SIZE);
	kprintf("reserved:  %9" B_PRIdOFF "\n",
		totalSwapPages - sAvailSwapSpace / B_PAGE_SIZE);
	kprintf("used:      %9" B_PRIu32 "\n", totalSwapPages - freeSwapPages);
	kprintf("free:      %9" B_PRIu32 "\n", freeSwapPages);

	return 0;
}

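/*!	Allocates \a count contiguous swap slots. The search starts at
	sSwapFileAlloc and round-robins through all swap files; once a file drops
	below 10% free slots, allocation moves on to the next one. Returns
	SWAP_SLOT_NONE if \a count exceeds BITMAP_RADIX, so that the caller can
	retry with a smaller allocation.
*/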
static swap_addr_t
swap_slot_alloc(uint32 count)
{
	mutex_lock(&sSwapFileListLock);

	if (sSwapFileList.IsEmpty()) {
		mutex_unlock(&sSwapFileListLock);
		panic("swap_slot_alloc(): no swap file in the system\n");
		return SWAP_SLOT_NONE;
	}

	// Since the radix bitmap can't handle more than 32 pages, we return
	// SWAP_SLOT_NONE; this forces Write() to adjust the allocation amount.
	if (count > BITMAP_RADIX) {
		mutex_unlock(&sSwapFileListLock);
		return SWAP_SLOT_NONE;
	}

	swap_addr_t j, addr = SWAP_SLOT_NONE;
	for (j = 0; j < sSwapFileCount; j++) {
		if (sSwapFileAlloc == NULL)
			sSwapFileAlloc = sSwapFileList.First();

		addr = radix_bitmap_alloc(sSwapFileAlloc->bmp, count);
		if (addr != SWAP_SLOT_NONE) {
			addr += sSwapFileAlloc->first_slot;
			break;
		}

		// this swap_file is full, find another
		sSwapFileAlloc = sSwapFileList.GetNext(sSwapFileAlloc);
	}

	if (j == sSwapFileCount) {
		mutex_unlock(&sSwapFileListLock);
		panic("swap_slot_alloc: swap space exhausted!\n");
		return SWAP_SLOT_NONE;
	}

	// if this swap file has used more than 90% of its space,
	// switch to another
	if (sSwapFileAlloc->bmp->free_slots
		< (sSwapFileAlloc->last_slot - sSwapFileAlloc->first_slot) / 10) {
		sSwapFileAlloc = sSwapFileList.GetNext(sSwapFileAlloc);
	}

	mutex_unlock(&sSwapFileListLock);

	return addr;
}

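/*!	Returns the swap file that covers the given global slot index. All swap
	files share a single slot namespace: each file owns the half-open range
	[first_slot, last_slot), with a one-page gap between consecutive files.
*/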
static swap_file*
find_swap_file(swap_addr_t slotIndex)
{
	for (SwapFileList::Iterator it = sSwapFileList.GetIterator();
		swap_file* swapFile = it.Next();) {
		if (slotIndex >= swapFile->first_slot
			&& slotIndex < swapFile->last_slot) {
			return swapFile;
		}
	}

	panic("find_swap_file(): can't find swap file for slot %" B_PRIu32 "\n",
		slotIndex);
	return NULL;
}


static void
swap_slot_dealloc(swap_addr_t slotIndex, uint32 count)
{
	if (slotIndex == SWAP_SLOT_NONE)
		return;

	mutex_lock(&sSwapFileListLock);
	swap_file* swapFile = find_swap_file(slotIndex);
	slotIndex -= swapFile->first_slot;
	radix_bitmap_dealloc(swapFile->bmp, slotIndex, count);
	mutex_unlock(&sSwapFileListLock);
}

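/*!	Reserves up to \a amount bytes of swap space and returns how much was
	actually reserved, which may be less than requested when the available
	swap space is nearly exhausted.
*/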
static off_t
swap_space_reserve(off_t amount)
{
	mutex_lock(&sAvailSwapSpaceLock);
	if (sAvailSwapSpace >= amount)
		sAvailSwapSpace -= amount;
	else {
		amount = sAvailSwapSpace;
		sAvailSwapSpace = 0;
	}
	mutex_unlock(&sAvailSwapSpaceLock);

	return amount;
}


static void
swap_space_unreserve(off_t amount)
{
	mutex_lock(&sAvailSwapSpaceLock);
	sAvailSwapSpace += amount;
	mutex_unlock(&sAvailSwapSpaceLock);
}


static void
swap_hash_resizer(void*, int)
{
	WriteLocker locker(sSwapHashLock);

	size_t size;
	void* allocation;

	do {
		size = sSwapHashTable.ResizeNeeded();
		if (size == 0)
			return;

		locker.Unlock();

		allocation = malloc(size);
		if (allocation == NULL)
			return;

		locker.Lock();

	} while (!sSwapHashTable.Resize(allocation, size));
}


// #pragma mark -


class VMAnonymousCache::WriteCallback : public StackableAsyncIOCallback {
public:
	WriteCallback(VMAnonymousCache* cache, AsyncIOCallback* callback)
		:
		StackableAsyncIOCallback(callback),
		fCache(cache)
	{
	}

	void SetTo(page_num_t pageIndex, swap_addr_t slotIndex, bool newSlot)
	{
		fPageIndex = pageIndex;
		fSlotIndex = slotIndex;
		fNewSlot = newSlot;
	}

	virtual void IOFinished(status_t status, bool partialTransfer,
		generic_size_t bytesTransferred)
	{
		if (fNewSlot) {
			if (status == B_OK) {
				fCache->_SwapBlockBuild(fPageIndex, fSlotIndex, 1);
			} else {
				AutoLocker<VMCache> locker(fCache);
				fCache->fAllocatedSwapSize -= B_PAGE_SIZE;
				locker.Unlock();

				swap_slot_dealloc(fSlotIndex, 1);
			}
		}

		fNextCallback->IOFinished(status, partialTransfer, bytesTransferred);

		delete this;
	}

private:
	VMAnonymousCache*	fCache;
	page_num_t			fPageIndex;
	swap_addr_t			fSlotIndex;
	bool				fNewSlot;
};


// #pragma mark -


VMAnonymousCache::~VMAnonymousCache()
{
	delete fNoSwapPages;
	fNoSwapPages = NULL;

	_FreeSwapPageRange(virtual_base, virtual_end, false);
	swap_space_unreserve(fCommittedSwapSize);
	if (committed_size > fCommittedSwapSize)
		vm_unreserve_memory(committed_size - fCommittedSwapSize);
}


status_t
VMAnonymousCache::Init(bool canOvercommit, int32 numPrecommittedPages,
	int32 numGuardPages, uint32 allocationFlags)
{
	TRACE("%p->VMAnonymousCache::Init(canOvercommit = %s, "
		"numPrecommittedPages = %" B_PRId32 ", numGuardPages = %" B_PRId32
		")\n", this, canOvercommit ? "yes" : "no", numPrecommittedPages,
		numGuardPages);

	status_t error = VMCache::Init(CACHE_TYPE_RAM, allocationFlags);
	if (error != B_OK)
		return error;

	fCanOvercommit = canOvercommit;
	fHasPrecommitted = false;
	fPrecommittedPages = min_c(numPrecommittedPages, 255);
	fNoSwapPages = NULL;
	fGuardedSize = numGuardPages * B_PAGE_SIZE;
	fCommittedSwapSize = 0;
	fAllocatedSwapSize = 0;

	return B_OK;
}

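/*!	Marks the pages covering the given range as eligible (or not) for
	swapping. The fNoSwapPages bitmap is allocated lazily on the first
	"no swap" request and released again once no page remains marked.
*/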
status_t
VMAnonymousCache::SetCanSwapPages(off_t base, size_t size, bool canSwap)
{
	const page_num_t first = base >> PAGE_SHIFT;
	const size_t count = PAGE_ALIGN(size + ((first << PAGE_SHIFT) - base)) >> PAGE_SHIFT;

	if (count == 0)
		return B_OK;
	if (canSwap && fNoSwapPages == NULL)
		return B_OK;

	if (fNoSwapPages == NULL)
		fNoSwapPages = new(std::nothrow) Bitmap(0);
	if (fNoSwapPages == NULL)
		return B_NO_MEMORY;

	const page_num_t pageCount = PAGE_ALIGN(virtual_end) >> PAGE_SHIFT;

	if (fNoSwapPages->Resize(pageCount) != B_OK)
		return B_NO_MEMORY;

	for (size_t i = 0; i < count; i++) {
		if (canSwap)
			fNoSwapPages->Clear(first + i);
		else
			fNoSwapPages->Set(first + i);
	}

	if (fNoSwapPages->GetHighestSet() < 0) {
		delete fNoSwapPages;
		fNoSwapPages = NULL;
	}
	return B_OK;
}

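/*!	Frees the swap slots assigned to pages in the range [fromOffset,
	toOffset). If \a skipBusyPages is \c true, slots of busy pages are left
	alone (and thus leaked), since I/O may still be in progress on them.
*/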
void
VMAnonymousCache::_FreeSwapPageRange(off_t fromOffset, off_t toOffset,
	bool skipBusyPages)
{
	swap_block* swapBlock = NULL;
	off_t toIndex = toOffset >> PAGE_SHIFT;
	for (off_t pageIndex = fromOffset >> PAGE_SHIFT;
		pageIndex < toIndex && fAllocatedSwapSize > 0; pageIndex++) {

		WriteLocker locker(sSwapHashLock);

		// Get the swap slot index for the page.
		swap_addr_t blockIndex = pageIndex & SWAP_BLOCK_MASK;
		if (swapBlock == NULL || blockIndex == 0) {
			swap_hash_key key = { this, pageIndex };
			swapBlock = sSwapHashTable.Lookup(key);

			if (swapBlock == NULL) {
				pageIndex = ROUNDUP(pageIndex + 1, SWAP_BLOCK_PAGES) - 1;
				continue;
			}
		}

		swap_addr_t slotIndex = swapBlock->swap_slots[blockIndex];
		if (slotIndex == SWAP_SLOT_NONE)
			continue;

		if (skipBusyPages) {
			vm_page* page = LookupPage(pageIndex * B_PAGE_SIZE);
			if (page != NULL && page->busy) {
				// TODO: We skip (i.e. leak) swap space of busy pages, since
				// there could be I/O going on (paging in/out). Waiting is
				// not an option as 1. unlocking the cache means that new
				// swap pages could be added in a range we've already
				// cleared (since the cache still has the old size) and 2.
				// we'd risk a deadlock in case we come from the file cache
				// and the FS holds the node's write-lock. We should mark
				// the page invalid and let the one responsible clean up.
				// There's just no such mechanism yet.
				continue;
			}
		}

		swap_slot_dealloc(slotIndex, 1);
		fAllocatedSwapSize -= B_PAGE_SIZE;

		swapBlock->swap_slots[blockIndex] = SWAP_SLOT_NONE;
		if (--swapBlock->used == 0) {
			// All swap pages have been freed -- we can discard the swap block.
			sSwapHashTable.RemoveUnchecked(swapBlock);
			object_cache_free(sSwapBlockCache, swapBlock,
				CACHE_DONT_WAIT_FOR_MEMORY | CACHE_DONT_LOCK_KERNEL_SPACE);

			// There are no swap pages for the possibly remaining pages; skip
			// to the next block.
			pageIndex = ROUNDUP(pageIndex + 1, SWAP_BLOCK_PAGES) - 1;
			swapBlock = NULL;
		}
	}
}


status_t
VMAnonymousCache::Resize(off_t newSize, int priority)
{
	if (fNoSwapPages != NULL) {
		if (fNoSwapPages->Resize(PAGE_ALIGN(newSize) >> PAGE_SHIFT) != B_OK)
			return B_NO_MEMORY;
	}

	_FreeSwapPageRange(newSize + B_PAGE_SIZE - 1,
		virtual_end + B_PAGE_SIZE - 1);
	return VMCache::Resize(newSize, priority);
}


status_t
VMAnonymousCache::Rebase(off_t newBase, int priority)
{
	if (fNoSwapPages != NULL) {
		const ssize_t sizeDifference = (newBase >> PAGE_SHIFT) - (virtual_base >> PAGE_SHIFT);
		fNoSwapPages->Shift(sizeDifference);
	}

	_FreeSwapPageRange(virtual_base, newBase);
	return VMCache::Rebase(newBase, priority);
}


status_t
VMAnonymousCache::Discard(off_t offset, off_t size)
{
	_FreeSwapPageRange(offset, offset + size);
	return VMCache::Discard(offset, size);
}


/*!	Moves the swap pages for the given range from the source cache into this
	cache. Both caches must be locked.
*/
status_t
VMAnonymousCache::Adopt(VMCache* _source, off_t offset, off_t size,
	off_t newOffset)
{
	VMAnonymousCache* source = dynamic_cast<VMAnonymousCache*>(_source);
	if (source == NULL) {
		panic("VMAnonymousCache::Adopt(): adopt from incompatible cache %p "
			"requested", _source);
		return B_ERROR;
	}

	off_t pageIndex = newOffset >> PAGE_SHIFT;
	off_t sourcePageIndex = offset >> PAGE_SHIFT;
	off_t sourceEndPageIndex = (offset + size + B_PAGE_SIZE - 1) >> PAGE_SHIFT;
	swap_block* swapBlock = NULL;

	WriteLocker locker(sSwapHashLock);

	while (sourcePageIndex < sourceEndPageIndex
			&& source->fAllocatedSwapSize > 0) {
		swap_addr_t left
			= SWAP_BLOCK_PAGES - (sourcePageIndex & SWAP_BLOCK_MASK);

		swap_hash_key sourceKey = { source, sourcePageIndex };
		swap_block* sourceSwapBlock = sSwapHashTable.Lookup(sourceKey);
		if (sourceSwapBlock == NULL || sourceSwapBlock->used == 0) {
			sourcePageIndex += left;
			pageIndex += left;
			swapBlock = NULL;
			continue;
		}

		for (; left > 0 && sourceSwapBlock->used > 0;
				left--, sourcePageIndex++, pageIndex++) {

			swap_addr_t blockIndex = pageIndex & SWAP_BLOCK_MASK;
			if (swapBlock == NULL || blockIndex == 0) {
				swap_hash_key key = { this, pageIndex };
				swapBlock = sSwapHashTable.Lookup(key);

				if (swapBlock == NULL) {
					swapBlock = (swap_block*)object_cache_alloc(sSwapBlockCache,
						CACHE_DONT_WAIT_FOR_MEMORY
							| CACHE_DONT_LOCK_KERNEL_SPACE);
					if (swapBlock == NULL)
						return B_NO_MEMORY;

					swapBlock->key.cache = this;
					swapBlock->key.page_index
						= pageIndex & ~(off_t)SWAP_BLOCK_MASK;
					swapBlock->used = 0;
					for (uint32 i = 0; i < SWAP_BLOCK_PAGES; i++)
						swapBlock->swap_slots[i] = SWAP_SLOT_NONE;

					sSwapHashTable.InsertUnchecked(swapBlock);
				}
			}

			swap_addr_t sourceBlockIndex = sourcePageIndex & SWAP_BLOCK_MASK;
			swap_addr_t slotIndex
				= sourceSwapBlock->swap_slots[sourceBlockIndex];
			if (slotIndex == SWAP_SLOT_NONE)
				continue;

			ASSERT(swapBlock->swap_slots[blockIndex] == SWAP_SLOT_NONE);

			swapBlock->swap_slots[blockIndex] = slotIndex;
			swapBlock->used++;
			fAllocatedSwapSize += B_PAGE_SIZE;

			sourceSwapBlock->swap_slots[sourceBlockIndex] = SWAP_SLOT_NONE;
			sourceSwapBlock->used--;
			source->fAllocatedSwapSize -= B_PAGE_SIZE;

			TRACE("adopted slot %#" B_PRIx32 " from %p at page %" B_PRIdOFF
				" to %p at page %" B_PRIdOFF "\n", slotIndex, source,
				sourcePageIndex, this, pageIndex);
		}

		if (left > 0) {
			sourcePageIndex += left;
			pageIndex += left;
			swapBlock = NULL;
		}

		if (sourceSwapBlock->used == 0) {
			// All swap pages have been adopted -- we can discard the source
			// swap block.
			sSwapHashTable.RemoveUnchecked(sourceSwapBlock);
			object_cache_free(sSwapBlockCache, sourceSwapBlock,
				CACHE_DONT_WAIT_FOR_MEMORY | CACHE_DONT_LOCK_KERNEL_SPACE);
		}
	}

	locker.Unlock();

	return VMCache::Adopt(source, offset, size, newOffset);
}


status_t
VMAnonymousCache::Commit(off_t size, int priority)
{
	TRACE("%p->VMAnonymousCache::Commit(%" B_PRIdOFF ")\n", this, size);

	// If we can overcommit, we don't commit here, but in Fault(). We do
	// unreserve memory if we're asked to shrink our commitment, though.
	if (fCanOvercommit && size > committed_size) {
		if (fHasPrecommitted)
			return B_OK;

		// pre-commit some pages to make a later failure less probable
		fHasPrecommitted = true;
		uint32 precommitted = fPrecommittedPages * B_PAGE_SIZE;
		if (size > precommitted)
			size = precommitted;
	}

	return _Commit(size, priority);
}


bool
VMAnonymousCache::HasPage(off_t offset)
{
	return _SwapBlockGetAddress(offset >> PAGE_SHIFT) != SWAP_SLOT_NONE;
}


bool
VMAnonymousCache::DebugHasPage(off_t offset)
{
	off_t pageIndex = offset >> PAGE_SHIFT;
	swap_hash_key key = { this, pageIndex };
	swap_block* swap = sSwapHashTable.Lookup(key);
	if (swap == NULL)
		return false;

	return swap->swap_slots[pageIndex & SWAP_BLOCK_MASK] != SWAP_SLOT_NONE;
}


status_t
VMAnonymousCache::Read(off_t offset, const generic_io_vec* vecs, size_t count,
	uint32 flags, generic_size_t* _numBytes)
{
	off_t pageIndex = offset >> PAGE_SHIFT;

	for (uint32 i = 0, j = 0; i < count; i = j) {
		swap_addr_t startSlotIndex = _SwapBlockGetAddress(pageIndex + i);
		for (j = i + 1; j < count; j++) {
			swap_addr_t slotIndex = _SwapBlockGetAddress(pageIndex + j);
			if (slotIndex != startSlotIndex + j - i)
				break;
		}

		T(ReadPage(this, pageIndex, startSlotIndex));
			// TODO: Assumes that only one page is read.

		swap_file* swapFile = find_swap_file(startSlotIndex);

		off_t pos = (off_t)(startSlotIndex - swapFile->first_slot)
			* B_PAGE_SIZE;

		status_t status = vfs_read_pages(swapFile->vnode, swapFile->cookie, pos,
			vecs + i, j - i, flags, _numBytes);
		if (status != B_OK)
			return status;
	}

	return B_OK;
}

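/*!	Writes the given vectors to swap. Works in two phases: first, any swap
	slots previously assigned to the pages are freed and the new allocation
	is charged against the committed swap size; then slots are allocated and
	written, halving the requested run length on allocation failure until
	single slots are reached.
*/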
status_t
VMAnonymousCache::Write(off_t offset, const generic_io_vec* vecs, size_t count,
	uint32 flags, generic_size_t* _numBytes)
{
	off_t pageIndex = offset >> PAGE_SHIFT;

	AutoLocker<VMCache> locker(this);

	page_num_t totalPages = 0;
	for (uint32 i = 0; i < count; i++) {
		page_num_t pageCount = (vecs[i].length + B_PAGE_SIZE - 1) >> PAGE_SHIFT;
		swap_addr_t slotIndex = _SwapBlockGetAddress(pageIndex + totalPages);
		if (slotIndex != SWAP_SLOT_NONE) {
			swap_slot_dealloc(slotIndex, pageCount);
			_SwapBlockFree(pageIndex + totalPages, pageCount);
			fAllocatedSwapSize -= pageCount * B_PAGE_SIZE;
		}

		totalPages += pageCount;
	}

	off_t totalSize = totalPages * B_PAGE_SIZE;
	if (fAllocatedSwapSize + totalSize > fCommittedSwapSize)
		return B_ERROR;

	fAllocatedSwapSize += totalSize;
	locker.Unlock();

	page_num_t pagesLeft = totalPages;
	totalPages = 0;

	for (uint32 i = 0; i < count; i++) {
		page_num_t pageCount = (vecs[i].length + B_PAGE_SIZE - 1) >> PAGE_SHIFT;

		generic_addr_t vectorBase = vecs[i].base;
		generic_size_t vectorLength = vecs[i].length;
		page_num_t n = pageCount;

		for (page_num_t j = 0; j < pageCount; j += n) {
			swap_addr_t slotIndex;
			// try to allocate n slots; if that fails, try n/2, and so on
			while ((slotIndex = swap_slot_alloc(n)) == SWAP_SLOT_NONE && n >= 2)
				n >>= 1;

			if (slotIndex == SWAP_SLOT_NONE)
				panic("VMAnonymousCache::Write(): can't allocate swap space\n");

			T(WritePage(this, pageIndex, slotIndex));
				// TODO: Assumes that only one page is written.

			swap_file* swapFile = find_swap_file(slotIndex);

			off_t pos = (off_t)(slotIndex - swapFile->first_slot) * B_PAGE_SIZE;

			generic_size_t length = (phys_addr_t)n * B_PAGE_SIZE;
			generic_io_vec vector[1];
			vector->base = vectorBase;
			vector->length = length;

			status_t status = vfs_write_pages(swapFile->vnode, swapFile->cookie,
				pos, vector, 1, flags, &length);
			if (status != B_OK) {
				locker.Lock();
				fAllocatedSwapSize -= (off_t)pagesLeft * B_PAGE_SIZE;
				locker.Unlock();

				swap_slot_dealloc(slotIndex, n);
				return status;
			}

			_SwapBlockBuild(pageIndex + totalPages, slotIndex, n);
			pagesLeft -= n;

			if (n != pageCount) {
				vectorBase = vectorBase + n * B_PAGE_SIZE;
				vectorLength -= n * B_PAGE_SIZE;
			}
		}

		totalPages += pageCount;
	}

	ASSERT(pagesLeft == 0);
	return B_OK;
}


status_t
VMAnonymousCache::WriteAsync(off_t offset, const generic_io_vec* vecs,
	size_t count, generic_size_t numBytes, uint32 flags,
	AsyncIOCallback* _callback)
{
	// TODO: Currently this method is only used for single pages. Either make
	// more flexible use of it or change the interface!
	// This implementation relies on the current usage!
	ASSERT(count == 1);
	ASSERT(numBytes <= B_PAGE_SIZE);

	page_num_t pageIndex = offset >> PAGE_SHIFT;
	swap_addr_t slotIndex = _SwapBlockGetAddress(pageIndex);
	bool newSlot = slotIndex == SWAP_SLOT_NONE;

	// If the page doesn't have any swap space yet, allocate it.
	if (newSlot) {
		AutoLocker<VMCache> locker(this);
		if (fAllocatedSwapSize + B_PAGE_SIZE > fCommittedSwapSize) {
			_callback->IOFinished(B_ERROR, true, 0);
			return B_ERROR;
		}

		fAllocatedSwapSize += B_PAGE_SIZE;

		slotIndex = swap_slot_alloc(1);
	}

	// create our callback
	WriteCallback* callback = (flags & B_VIP_IO_REQUEST) != 0
		? new(malloc_flags(HEAP_PRIORITY_VIP)) WriteCallback(this, _callback)
		: new(std::nothrow) WriteCallback(this, _callback);
	if (callback == NULL) {
		if (newSlot) {
			AutoLocker<VMCache> locker(this);
			fAllocatedSwapSize -= B_PAGE_SIZE;
			locker.Unlock();

			swap_slot_dealloc(slotIndex, 1);
		}
		_callback->IOFinished(B_NO_MEMORY, true, 0);
		return B_NO_MEMORY;
	}
	// TODO: If the page already had swap space assigned, we don't need a
	// callback of our own.

	callback->SetTo(pageIndex, slotIndex, newSlot);

	T(WritePage(this, pageIndex, slotIndex));

	// write the page asynchronously
	swap_file* swapFile = find_swap_file(slotIndex);
	off_t pos = (off_t)(slotIndex - swapFile->first_slot) * B_PAGE_SIZE;

	return vfs_asynchronous_write_pages(swapFile->vnode, swapFile->cookie, pos,
		vecs, 1, numBytes, flags, callback);
}


bool
VMAnonymousCache::CanWritePage(off_t offset)
{
	const off_t pageIndex = offset >> PAGE_SHIFT;
	if (fNoSwapPages != NULL && fNoSwapPages->Get(pageIndex))
		return false;

	// We can write the page if we have not used all of our committed swap
	// space, or if the page already has a swap slot assigned.
	return fAllocatedSwapSize < fCommittedSwapSize
		|| _SwapBlockGetAddress(pageIndex) != SWAP_SLOT_NONE;
}


int32
VMAnonymousCache::MaxPagesPerAsyncWrite() const
{
	return 1;
}

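/*!	Called on page faults in this cache. Reports guard page hits as
	B_BAD_ADDRESS and, for overcommitting caches, commits memory for the
	faulting page on demand (swap space first, then RAM). Returning
	B_BAD_HANDLER lets vm_soft_fault() continue with the actual fault
	handling.
*/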
status_t
VMAnonymousCache::Fault(struct VMAddressSpace* aspace, off_t offset)
{
	if (fGuardedSize > 0) {
		uint32 guardOffset;

#ifdef STACK_GROWS_DOWNWARDS
		guardOffset = 0;
#elif defined(STACK_GROWS_UPWARDS)
		guardOffset = virtual_size - fGuardedSize;
#else
#	error Stack direction has not been defined in arch_config.h
#endif
		// report stack fault, guard page hit!
		if (offset >= guardOffset && offset < guardOffset + fGuardedSize) {
			TRACE("stack overflow!\n");
			return B_BAD_ADDRESS;
		}
	}

	if (fCanOvercommit && LookupPage(offset) == NULL && !HasPage(offset)) {
		if (fPrecommittedPages == 0) {
			// never commit more than needed
			if (committed_size / B_PAGE_SIZE > page_count)
				return B_BAD_HANDLER;

			// try to commit additional swap space/memory
			if (swap_space_reserve(B_PAGE_SIZE) == B_PAGE_SIZE) {
				fCommittedSwapSize += B_PAGE_SIZE;
			} else {
				int priority = aspace == VMAddressSpace::Kernel()
					? VM_PRIORITY_SYSTEM : VM_PRIORITY_USER;
				if (vm_try_reserve_memory(B_PAGE_SIZE, priority, 0) != B_OK) {
					dprintf("%p->VMAnonymousCache::Fault(): Failed to reserve "
						"%d bytes of RAM.\n", this, (int)B_PAGE_SIZE);
					return B_NO_MEMORY;
				}
			}

			committed_size += B_PAGE_SIZE;
		} else
			fPrecommittedPages--;
	}

	// This will cause vm_soft_fault() to handle the fault
	return B_BAD_HANDLER;
}


void
VMAnonymousCache::Merge(VMCache* _source)
{
	VMAnonymousCache* source = dynamic_cast<VMAnonymousCache*>(_source);
	if (source == NULL) {
		panic("VMAnonymousCache::Merge(): merge with incompatible cache "
			"%p requested", _source);
		return;
	}

	// take over the source's committed size
	fCommittedSwapSize += source->fCommittedSwapSize;
	source->fCommittedSwapSize = 0;
	committed_size += source->committed_size;
	source->committed_size = 0;

	off_t actualSize = virtual_end - virtual_base;
	if (committed_size > actualSize)
		_Commit(actualSize, VM_PRIORITY_USER);

	// Move all non-shadowed swap pages from the source to the consumer cache.
	// Also remove all source pages that are shadowed by consumer swap pages.
	_MergeSwapPages(source);

	// Move all non-shadowed pages from the source to the consumer cache.
	if (source->page_count < page_count)
		_MergePagesSmallerSource(source);
	else
		_MergePagesSmallerConsumer(source);
}


void
VMAnonymousCache::DeleteObject()
{
	object_cache_delete(gAnonymousCacheObjectCache, this);
}

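/*!	Records that the \a count pages starting at \a startPageIndex are stored
	in the contiguous swap slots starting at \a startSlotIndex. Missing swap
	blocks are allocated on the fly; if the object cache is out of memory,
	the hash lock is dropped, the thread snoozes briefly and then retries.
*/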
void
VMAnonymousCache::_SwapBlockBuild(off_t startPageIndex,
	swap_addr_t startSlotIndex, uint32 count)
{
	WriteLocker locker(sSwapHashLock);

	uint32 left = count;
	for (uint32 i = 0, j = 0; i < count; i += j) {
		off_t pageIndex = startPageIndex + i;
		swap_addr_t slotIndex = startSlotIndex + i;

		swap_hash_key key = { this, pageIndex };

		swap_block* swap = sSwapHashTable.Lookup(key);
		while (swap == NULL) {
			swap = (swap_block*)object_cache_alloc(sSwapBlockCache,
				CACHE_DONT_WAIT_FOR_MEMORY | CACHE_DONT_LOCK_KERNEL_SPACE);
			if (swap == NULL) {
				// Wait a short time until memory is available again.
				locker.Unlock();
				snooze(10000);
				locker.Lock();
				swap = sSwapHashTable.Lookup(key);
				continue;
			}

			swap->key.cache = this;
			swap->key.page_index = pageIndex & ~(off_t)SWAP_BLOCK_MASK;
			swap->used = 0;
			for (uint32 i = 0; i < SWAP_BLOCK_PAGES; i++)
				swap->swap_slots[i] = SWAP_SLOT_NONE;

			sSwapHashTable.InsertUnchecked(swap);
		}

		swap_addr_t blockIndex = pageIndex & SWAP_BLOCK_MASK;
		for (j = 0; blockIndex < SWAP_BLOCK_PAGES && left > 0; j++) {
			swap->swap_slots[blockIndex++] = slotIndex + j;
			left--;
		}

		swap->used += j;
	}
}


void
VMAnonymousCache::_SwapBlockFree(off_t startPageIndex, uint32 count)
{
	WriteLocker locker(sSwapHashLock);

	uint32 left = count;
	for (uint32 i = 0, j = 0; i < count; i += j) {
		off_t pageIndex = startPageIndex + i;
		swap_hash_key key = { this, pageIndex };
		swap_block* swap = sSwapHashTable.Lookup(key);

		ASSERT(swap != NULL);

		swap_addr_t blockIndex = pageIndex & SWAP_BLOCK_MASK;
		for (j = 0; blockIndex < SWAP_BLOCK_PAGES && left > 0; j++) {
			swap->swap_slots[blockIndex++] = SWAP_SLOT_NONE;
			left--;
		}

		swap->used -= j;
		if (swap->used == 0) {
			sSwapHashTable.RemoveUnchecked(swap);
			object_cache_free(sSwapBlockCache, swap,
				CACHE_DONT_WAIT_FOR_MEMORY | CACHE_DONT_LOCK_KERNEL_SPACE);
		}
	}
}


swap_addr_t
VMAnonymousCache::_SwapBlockGetAddress(off_t pageIndex)
{
	ReadLocker locker(sSwapHashLock);

	swap_hash_key key = { this, pageIndex };
	swap_block* swap = sSwapHashTable.Lookup(key);
	swap_addr_t slotIndex = SWAP_SLOT_NONE;

	if (swap != NULL) {
		swap_addr_t blockIndex = pageIndex & SWAP_BLOCK_MASK;
		slotIndex = swap->swap_slots[blockIndex];
	}

	return slotIndex;
}


status_t
VMAnonymousCache::_Commit(off_t size, int priority)
{
	TRACE("%p->VMAnonymousCache::_Commit(%" B_PRIdOFF "), already committed: "
		"%" B_PRIdOFF " (%" B_PRIdOFF " swap)\n", this, size, committed_size,
		fCommittedSwapSize);

	// Basic strategy: reserve swap space first; only when running out of swap
	// space, reserve real memory.

	off_t committedMemory = committed_size - fCommittedSwapSize;

	// Regardless of whether we're asked to grow or shrink the commitment,
	// we always try to reserve as much as possible of the final commitment
	// in the swap space.
	if (size > fCommittedSwapSize) {
		fCommittedSwapSize += swap_space_reserve(size - fCommittedSwapSize);
		committed_size = fCommittedSwapSize + committedMemory;
		if (size > fCommittedSwapSize) {
			TRACE("%p->VMAnonymousCache::_Commit(%" B_PRIdOFF "), reserved "
				"only %" B_PRIdOFF " swap\n", this, size, fCommittedSwapSize);
		}
	}

	if (committed_size == size)
		return B_OK;

	if (committed_size > size) {
		// The commitment shrinks -- unreserve real memory first.
		off_t toUnreserve = committed_size - size;
		if (committedMemory > 0) {
			off_t unreserved = min_c(toUnreserve, committedMemory);
			vm_unreserve_memory(unreserved);
			committedMemory -= unreserved;
			committed_size -= unreserved;
			toUnreserve -= unreserved;
		}

		// Unreserve swap space.
		if (toUnreserve > 0) {
			swap_space_unreserve(toUnreserve);
			fCommittedSwapSize -= toUnreserve;
			committed_size -= toUnreserve;
		}

		return B_OK;
	}

	// The commitment grows -- we have already tried to reserve swap space at
	// the start of the method, so we try to reserve real memory now.

	off_t toReserve = size - committed_size;
	if (vm_try_reserve_memory(toReserve, priority, 1000000) != B_OK) {
		dprintf("%p->VMAnonymousCache::_Commit(%" B_PRIdOFF "): Failed to "
			"reserve %" B_PRIdOFF " bytes of RAM\n", this, size, toReserve);
		return B_NO_MEMORY;
	}

	committed_size = size;
	return B_OK;
}


void
VMAnonymousCache::_MergePagesSmallerSource(VMAnonymousCache* source)
{
	// The source cache has fewer pages than the consumer (this cache), so we
	// iterate through the source's pages and move the ones that are not
	// shadowed up to the consumer.

	for (VMCachePagesTree::Iterator it = source->pages.GetIterator();
			vm_page* page = it.Next();) {
		// Note: Removing the current node while iterating through an
		// IteratableSplayTree is safe.
		vm_page* consumerPage = LookupPage(
			(off_t)page->cache_offset << PAGE_SHIFT);
		if (consumerPage == NULL) {
			// the page is not yet in the consumer cache - move it upwards
			ASSERT_PRINT(!page->busy, "page: %p", page);
			MovePage(page);
		}
	}
}


void
VMAnonymousCache::_MergePagesSmallerConsumer(VMAnonymousCache* source)
{
	// The consumer (this cache) has fewer pages than the source, so we move
	// the consumer's pages to the source (freeing shadowed ones) and finally
	// move all pages of the source back to the consumer.

	for (VMCachePagesTree::Iterator it = pages.GetIterator();
		vm_page* page = it.Next();) {
		// If a source page is in the way, remove and free it.
		vm_page* sourcePage = source->LookupPage(
			(off_t)page->cache_offset << PAGE_SHIFT);
		if (sourcePage != NULL) {
			DEBUG_PAGE_ACCESS_START(sourcePage);
			ASSERT_PRINT(!sourcePage->busy, "page: %p", sourcePage);
			ASSERT_PRINT(sourcePage->WiredCount() == 0
					&& sourcePage->mappings.IsEmpty(),
				"sourcePage: %p, page: %p", sourcePage, page);
			source->RemovePage(sourcePage);
			vm_page_free(source, sourcePage);
		}

		// Note: Removing the current node while iterating through an
		// IteratableSplayTree is safe.
		source->MovePage(page);
	}

	MoveAllPages(source);
}

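/*!	Merges the source cache's swap pages into this cache, walking the range
	one swap block at a time. Source slots shadowed by consumer pages or
	consumer swap slots are freed; the remaining slots are taken over, either
	by re-keying the whole source swap block or by copying its slots into the
	existing consumer block.
*/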
void
VMAnonymousCache::_MergeSwapPages(VMAnonymousCache* source)
{
	// If neither the source nor the consumer has swap pages, we don't have
	// to do anything.
	if (source->fAllocatedSwapSize == 0 && fAllocatedSwapSize == 0)
		return;

	for (off_t offset = source->virtual_base
		& ~(off_t)(B_PAGE_SIZE * SWAP_BLOCK_PAGES - 1);
		offset < source->virtual_end;
		offset += B_PAGE_SIZE * SWAP_BLOCK_PAGES) {

		WriteLocker locker(sSwapHashLock);

		off_t swapBlockPageIndex = offset >> PAGE_SHIFT;
		swap_hash_key key = { source, swapBlockPageIndex };
		swap_block* sourceSwapBlock = sSwapHashTable.Lookup(key);

		// remove the source swap block -- we will either take over the swap
		// space (and the block) or free it
		if (sourceSwapBlock != NULL)
			sSwapHashTable.RemoveUnchecked(sourceSwapBlock);

		key.cache = this;
		swap_block* swapBlock = sSwapHashTable.Lookup(key);

		locker.Unlock();

		// remove all source pages that are shadowed by consumer swap pages
		if (swapBlock != NULL) {
			for (uint32 i = 0; i < SWAP_BLOCK_PAGES; i++) {
				if (swapBlock->swap_slots[i] != SWAP_SLOT_NONE) {
					vm_page* page = source->LookupPage(
						(off_t)(swapBlockPageIndex + i) << PAGE_SHIFT);
					if (page != NULL) {
						DEBUG_PAGE_ACCESS_START(page);
						ASSERT_PRINT(!page->busy, "page: %p", page);
						source->RemovePage(page);
						vm_page_free(source, page);
					}
				}
			}
		}

		if (sourceSwapBlock == NULL)
			continue;

		for (uint32 i = 0; i < SWAP_BLOCK_PAGES; i++) {
			off_t pageIndex = swapBlockPageIndex + i;
			swap_addr_t sourceSlotIndex = sourceSwapBlock->swap_slots[i];

			if (sourceSlotIndex == SWAP_SLOT_NONE)
				continue;

			if ((swapBlock != NULL
					&& swapBlock->swap_slots[i] != SWAP_SLOT_NONE)
				|| LookupPage((off_t)pageIndex << PAGE_SHIFT) != NULL) {
				// The consumer already has a page or a swapped out page
				// at this index. So we can free the source swap space.
				swap_slot_dealloc(sourceSlotIndex, 1);
				sourceSwapBlock->swap_slots[i] = SWAP_SLOT_NONE;
				sourceSwapBlock->used--;
			}

			// We've either freed the source swap page or are going to move it
			// to the consumer. At any rate, the source cache doesn't own it
			// anymore.
			source->fAllocatedSwapSize -= B_PAGE_SIZE;
		}

		// All source swap pages that have not been freed yet are taken over by
		// the consumer.
		fAllocatedSwapSize += B_PAGE_SIZE * (off_t)sourceSwapBlock->used;

		if (sourceSwapBlock->used == 0) {
			// All swap pages have been freed -- we can discard the source swap
			// block.
			object_cache_free(sSwapBlockCache, sourceSwapBlock,
				CACHE_DONT_WAIT_FOR_MEMORY | CACHE_DONT_LOCK_KERNEL_SPACE);
		} else if (swapBlock == NULL) {
			// We need to take over some of the source's swap pages and there's
			// no swap block in the consumer cache. Just take over the source
			// swap block.
			sourceSwapBlock->key.cache = this;
			locker.Lock();
			sSwapHashTable.InsertUnchecked(sourceSwapBlock);
			locker.Unlock();
		} else {
			// We need to take over some of the source's swap pages and there's
			// already a swap block in the consumer cache. Copy the respective
			// swap addresses and discard the source swap block.
			for (uint32 i = 0; i < SWAP_BLOCK_PAGES; i++) {
				if (sourceSwapBlock->swap_slots[i] != SWAP_SLOT_NONE)
					swapBlock->swap_slots[i] = sourceSwapBlock->swap_slots[i];
			}

			object_cache_free(sSwapBlockCache, sourceSwapBlock,
				CACHE_DONT_WAIT_FOR_MEMORY | CACHE_DONT_LOCK_KERNEL_SPACE);
		}
	}
}


// #pragma mark -


// TODO: This can be removed if we get BFS uuid's
struct VolumeInfo {
	char name[B_FILE_NAME_LENGTH];
	char device[B_FILE_NAME_LENGTH];
	char filesystem[B_OS_NAME_LENGTH];
	off_t capacity;
};

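// Scores each visited partition against the configured swap volume: a
// matching content name counts 4, the device path 3, the capacity 2 and the
// file system short name 1. A partition qualifies with a score of at least
// 4, and the highest-scoring one wins.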
class PartitionScorer : public KPartitionVisitor {
public:
	PartitionScorer(VolumeInfo& volumeInfo)
		:
		fBestPartition(NULL),
		fBestScore(-1),
		fVolumeInfo(volumeInfo)
	{
	}

	virtual bool VisitPre(KPartition* partition)
	{
		if (!partition->ContainsFileSystem())
			return false;

		KPath path;
		partition->GetPath(&path);

		int score = 0;
		if (strcmp(fVolumeInfo.name, partition->ContentName()) == 0)
			score += 4;
		if (strcmp(fVolumeInfo.device, path.Path()) == 0)
			score += 3;
		if (fVolumeInfo.capacity == partition->Size())
			score += 2;
		if (strcmp(fVolumeInfo.filesystem,
			partition->DiskSystem()->ShortName()) == 0) {
			score += 1;
		}
		if (score >= 4 && score > fBestScore) {
			fBestPartition = partition;
			fBestScore = score;
		}

		return false;
	}

	KPartition* fBestPartition;

private:
	int32		fBestScore;
	VolumeInfo&	fVolumeInfo;
};

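/*!	Registers the given file as a swap file. The file must be a regular file,
	character device or block device of at least one page in size. Its pages
	are appended to the global slot namespace, leaving a one-page gap after
	the previous swap file.
*/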
status_t
swap_file_add(const char* path)
{
	// open the file
	int fd = open(path, O_RDWR | O_NOCACHE, S_IRUSR | S_IWUSR);
	if (fd < 0)
		return errno;

	// fstat() it and check whether we can use it
	struct stat st;
	if (fstat(fd, &st) < 0) {
		close(fd);
		return errno;
	}

	if (!(S_ISREG(st.st_mode) || S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode))) {
		close(fd);
		return B_BAD_VALUE;
	}

	if (st.st_size < B_PAGE_SIZE) {
		close(fd);
		return B_BAD_VALUE;
	}

	// get file descriptor, vnode, and cookie
	file_descriptor* descriptor = get_fd(get_current_io_context(true), fd);
	put_fd(descriptor);

	vnode* node = fd_vnode(descriptor);
	if (node == NULL) {
		close(fd);
		return B_BAD_VALUE;
	}

	// do the allocations and prepare the swap_file structure
	swap_file* swap = new(std::nothrow) swap_file;
	if (swap == NULL) {
		close(fd);
		return B_NO_MEMORY;
	}

	swap->fd = fd;
	swap->vnode = node;
	swap->cookie = descriptor->cookie;

	uint32 pageCount = st.st_size >> PAGE_SHIFT;
	swap->bmp = radix_bitmap_create(pageCount);
	if (swap->bmp == NULL) {
		delete swap;
		close(fd);
		return B_NO_MEMORY;
	}

	// set slot index and add this file to swap file list
	mutex_lock(&sSwapFileListLock);
	// TODO: Also check whether the swap file is already registered!
	if (sSwapFileList.IsEmpty()) {
		swap->first_slot = 0;
		swap->last_slot = pageCount;
	} else {
		// leave a one-page gap between two swap files
		swap->first_slot = sSwapFileList.Last()->last_slot + 1;
		swap->last_slot = swap->first_slot + pageCount;
	}
	sSwapFileList.Add(swap);
	sSwapFileCount++;
	mutex_unlock(&sSwapFileListLock);

	mutex_lock(&sAvailSwapSpaceLock);
	sAvailSwapSpace += (off_t)pageCount * B_PAGE_SIZE;
	mutex_unlock(&sAvailSwapSpaceLock);

	return B_OK;
}


status_t
swap_file_delete(const char* path)
{
	vnode* node = NULL;
	status_t status = vfs_get_vnode_from_path(path, true, &node);
	if (status != B_OK)
		return status;

	MutexLocker locker(sSwapFileListLock);

	swap_file* swapFile = NULL;
	for (SwapFileList::Iterator it = sSwapFileList.GetIterator();
			(swapFile = it.Next()) != NULL;) {
		if (swapFile->vnode == node)
			break;
	}

	vfs_put_vnode(node);

	if (swapFile == NULL)
		return B_ERROR;

	// if this file is currently in use, we can't delete it
	// TODO: mark this swap file as deleting, and remove it after releasing
	// all the swap space
	if (swapFile->bmp->free_slots < swapFile->last_slot - swapFile->first_slot)
		return B_ERROR;

	sSwapFileList.Remove(swapFile);
	sSwapFileCount--;
	locker.Unlock();

	mutex_lock(&sAvailSwapSpaceLock);
	sAvailSwapSpace -= (off_t)(swapFile->last_slot - swapFile->first_slot)
		* B_PAGE_SIZE;
	mutex_unlock(&sAvailSwapSpaceLock);

	close(swapFile->fd);
	radix_bitmap_destroy(swapFile->bmp);
	delete swapFile;

	return B_OK;
}


void
swap_init(void)
{
	// create swap block cache
	sSwapBlockCache = create_object_cache("swapblock", sizeof(swap_block),
		sizeof(void*), NULL, NULL, NULL);
	if (sSwapBlockCache == NULL)
		panic("swap_init(): can't create object cache for swap blocks\n");

	status_t error = object_cache_set_minimum_reserve(sSwapBlockCache,
		MIN_SWAP_BLOCK_RESERVE);
	if (error != B_OK) {
		panic("swap_init(): object_cache_set_minimum_reserve() failed: %s",
			strerror(error));
	}

	// init swap hash table
	sSwapHashTable.Init(INITIAL_SWAP_HASH_SIZE);
	rw_lock_init(&sSwapHashLock, "swaphash");

	error = register_resource_resizer(swap_hash_resizer, NULL,
		SWAP_HASH_RESIZE_INTERVAL);
	if (error != B_OK) {
		panic("swap_init(): Failed to register swap hash resizer: %s",
			strerror(error));
	}

	// init swap file list
	mutex_init(&sSwapFileListLock, "swaplist");
	sSwapFileAlloc = NULL;
	sSwapFileCount = 0;

	// init available swap space
	mutex_init(&sAvailSwapSpaceLock, "avail swap space");
	sAvailSwapSpace = 0;

	add_debugger_command_etc("swap", &dump_swap_info,
		"Print info about the swap usage",
		"\n"
		"Print info about the swap usage.\n", 0);
}


void
swap_init_post_modules()
{
	// Never try to create a swap file on a read-only device -- when booting
	// from CD, the write overlay is used.
	if (gReadOnlyBootDevice)
		return;

	bool swapEnabled = true;
	bool swapAutomatic = true;
	off_t swapSize = 0;

	dev_t swapDeviceID = -1;
	VolumeInfo selectedVolume = {};

	void* settings = load_driver_settings("virtual_memory");

	if (settings != NULL) {
		// We pass a lot of information about the swap device; this is mostly
		// to ensure that we are dealing with the same device that was
		// configured.

		// TODO: Some kind of BFS uuid would be great here :)
		const char* enabled = get_driver_parameter(settings, "vm", NULL, NULL);

		if (enabled != NULL) {
			swapEnabled = get_driver_boolean_parameter(settings, "vm",
				true, false);
			swapAutomatic = get_driver_boolean_parameter(settings, "swap_auto",
				true, false);

			if (swapEnabled && !swapAutomatic) {
				const char* size = get_driver_parameter(settings, "swap_size",
					NULL, NULL);
				const char* volume = get_driver_parameter(settings,
					"swap_volume_name", NULL, NULL);
				const char* device = get_driver_parameter(settings,
					"swap_volume_device", NULL, NULL);
				const char* filesystem = get_driver_parameter(settings,
					"swap_volume_filesystem", NULL, NULL);
				const char* capacity = get_driver_parameter(settings,
					"swap_volume_capacity", NULL, NULL);

				if (size != NULL && device != NULL && volume != NULL
					&& filesystem != NULL && capacity != NULL) {
					// User specified a size / volume that seems valid
					swapAutomatic = false;
					swapSize = atoll(size);
					strlcpy(selectedVolume.name, volume,
						sizeof(selectedVolume.name));
					strlcpy(selectedVolume.device, device,
						sizeof(selectedVolume.device));
					strlcpy(selectedVolume.filesystem, filesystem,
						sizeof(selectedVolume.filesystem));
					selectedVolume.capacity = atoll(capacity);
				} else {
					// Something isn't right with the swap config, go automatic
					swapAutomatic = true;
					dprintf("%s: virtual_memory configuration is invalid, "
						"using automatic swap\n", __func__);
				}
			}
		}
		unload_driver_settings(settings);
	}

	if (swapAutomatic) {
		swapSize = (off_t)vm_page_num_pages() * B_PAGE_SIZE;
		if (swapSize <= (1024 * 1024 * 1024)) {
			// Memory under 1GB? Double the swap.
			swapSize *= 2;
		}
		// Automatic swap defaults to the boot device
		swapDeviceID = gBootDevice;
	}

	if (!swapEnabled || swapSize < B_PAGE_SIZE) {
		dprintf("%s: virtual_memory is disabled\n", __func__);
		return;
	}

	if (!swapAutomatic && swapDeviceID < 0) {
		// If user-specified swap, and no swap device has been chosen yet...
		KDiskDeviceManager::CreateDefault();
		KDiskDeviceManager* manager = KDiskDeviceManager::Default();
		PartitionScorer visitor(selectedVolume);

		KDiskDevice* device;
		int32 cookie = 0;
		while ((device = manager->NextDevice(&cookie)) != NULL) {
			if (device->IsReadOnlyMedia() || device->IsWriteOnce()
				|| device->IsRemovable()) {
				continue;
			}
			device->VisitEachDescendant(&visitor);
		}

		if (!visitor.fBestPartition) {
			dprintf("%s: Can't find configured swap partition '%s'\n",
				__func__, selectedVolume.name);
		} else {
			if (visitor.fBestPartition->IsMounted())
				swapDeviceID = visitor.fBestPartition->VolumeID();
			else {
				KPath devPath, mountPoint;
				visitor.fBestPartition->GetPath(&devPath);
				get_mount_point(visitor.fBestPartition, &mountPoint);
				const char* mountPath = mountPoint.Path();
				mkdir(mountPath, S_IRWXU | S_IRWXG | S_IRWXO);
				swapDeviceID = _kern_mount(mountPath, devPath.Path(),
					NULL, 0, NULL, 0);
				if (swapDeviceID < 0) {
					dprintf("%s: Can't mount configured swap partition '%s'\n",
						__func__, selectedVolume.name);
				}
			}
		}
	}

	if (swapDeviceID < 0)
		swapDeviceID = gBootDevice;

	// We now have a swapDeviceID which is used for the swap file

	KPath path;
	struct fs_info info;
	_kern_read_fs_info(swapDeviceID, &info);
	if (swapDeviceID == gBootDevice)
		path = kDefaultSwapPath;
	else {
		vfs_entry_ref_to_path(info.dev, info.root, ".", true, path.LockBuffer(),
			path.BufferSize());
		path.UnlockBuffer();
		path.Append("swap");
	}

	const char* swapPath = path.Path();

	// Swap size limits prevent oversized swap files
	if (swapAutomatic) {
		off_t existingSwapSize = 0;
		struct stat existingSwapStat;
		if (stat(swapPath, &existingSwapStat) == 0)
			existingSwapSize = existingSwapStat.st_size;

		off_t freeSpace = info.free_blocks * info.block_size + existingSwapSize;

		// Adjust automatic swap to a maximum of 25% of the free space
		if (swapSize > (freeSpace / 4))
			swapSize = (freeSpace / 4);
	}

	// Create swap file
	int fd = open(swapPath, O_RDWR | O_CREAT | O_NOCACHE, S_IRUSR | S_IWUSR);
	if (fd < 0) {
		dprintf("%s: Can't open/create %s: %s\n", __func__,
			swapPath, strerror(errno));
		return;
	}

	struct stat stat;
	stat.st_size = swapSize;
	status_t error = _kern_write_stat(fd, NULL, false, &stat,
		sizeof(struct stat), B_STAT_SIZE | B_STAT_SIZE_INSECURE);
	if (error != B_OK) {
		dprintf("%s: Failed to resize %s to %" B_PRIdOFF " bytes: %s\n",
			__func__, swapPath, swapSize, strerror(error));
	}

	close(fd);

	error = swap_file_add(swapPath);
	if (error != B_OK) {
		dprintf("%s: Failed to add swap file %s: %s\n", __func__, swapPath,
			strerror(error));
	}
}


//! Used by the page daemon to free swap space.
bool
swap_free_page_swap_space(vm_page* page)
{
	VMAnonymousCache* cache = dynamic_cast<VMAnonymousCache*>(page->Cache());
	if (cache == NULL)
		return false;

	swap_addr_t slotIndex = cache->_SwapBlockGetAddress(page->cache_offset);
	if (slotIndex == SWAP_SLOT_NONE)
		return false;

	swap_slot_dealloc(slotIndex, 1);
	cache->fAllocatedSwapSize -= B_PAGE_SIZE;
	cache->_SwapBlockFree(page->cache_offset, 1);

	return true;
}


uint32
swap_available_pages()
{
	mutex_lock(&sAvailSwapSpaceLock);
	uint32 avail = sAvailSwapSpace >> PAGE_SHIFT;
	mutex_unlock(&sAvailSwapSpaceLock);

	return avail;
}


uint32
swap_total_swap_pages()
{
	mutex_lock(&sSwapFileListLock);

	uint32 totalSwapSlots = 0;
	for (SwapFileList::Iterator it = sSwapFileList.GetIterator();
		swap_file* swapFile = it.Next();) {
		totalSwapSlots += swapFile->last_slot - swapFile->first_slot;
	}

	mutex_unlock(&sSwapFileListLock);

	return totalSwapSlots;
}


#endif	// ENABLE_SWAP_SUPPORT


void
swap_get_info(system_info* info)
{
#if ENABLE_SWAP_SUPPORT
	MutexLocker locker(sSwapFileListLock);
	for (SwapFileList::Iterator it = sSwapFileList.GetIterator();
		swap_file* swapFile = it.Next();) {
		info->max_swap_pages += swapFile->last_slot - swapFile->first_slot;
		info->free_swap_pages += swapFile->bmp->free_slots;
	}
#else
	info->max_swap_pages = 0;
	info->free_swap_pages = 0;
#endif
}
