/*
 * Copyright 2008, Zhao Shuai, upczhsh@163.com.
 * Copyright 2008-2011, Ingo Weinhold, ingo_weinhold@gmx.de.
 * Copyright 2002-2009, Axel Dörfler, axeld@pinc-software.de.
 * Distributed under the terms of the MIT License.
 *
 * Copyright 2001-2002, Travis Geiselbrecht. All rights reserved.
 * Distributed under the terms of the NewOS License.
 *
 * Copyright 2011-2012 Haiku, Inc. All rights reserved.
 * Distributed under the terms of the MIT License.
 *
 * Authors:
 *		Hamish Morrison, hamish@lavabit.com
 *		Alexander von Gluck IV, kallisti5@unixzen.com
 */


#include "VMAnonymousCache.h"

#include <errno.h>
#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include <FindDirectory.h>
#include <KernelExport.h>
#include <NodeMonitor.h>

#include <arch_config.h>
#include <boot_device.h>
#include <disk_device_manager/KDiskDevice.h>
#include <disk_device_manager/KDiskDeviceManager.h>
#include <disk_device_manager/KDiskSystem.h>
#include <disk_device_manager/KPartitionVisitor.h>
#include <driver_settings.h>
#include <fs/fd.h>
#include <fs/KPath.h>
#include <fs_info.h>
#include <fs_interface.h>
#include <heap.h>
#include <kernel_daemon.h>
#include <slab/Slab.h>
#include <syscalls.h>
#include <system_info.h>
#include <tracing.h>
#include <util/AutoLock.h>
#include <util/DoublyLinkedList.h>
#include <util/OpenHashTable.h>
#include <util/RadixBitmap.h>
#include <vfs.h>
#include <vm/vm.h>
#include <vm/vm_page.h>
#include <vm/vm_priv.h>
#include <vm/VMAddressSpace.h>

#include "IORequest.h"


#if	ENABLE_SWAP_SUPPORT

//#define TRACE_VM_ANONYMOUS_CACHE
#ifdef TRACE_VM_ANONYMOUS_CACHE
#	define TRACE(x...) dprintf(x)
#else
#	define TRACE(x...) do { } while (false)
#endif


// minimum number of free swap blocks the object cache shall keep in reserve
#define MIN_SWAP_BLOCK_RESERVE	4096

// interval at which the hash resizer is triggered (in 0.1s units)
#define SWAP_HASH_RESIZE_INTERVAL	5

#define INITIAL_SWAP_HASH_SIZE		1024

#define SWAP_SLOT_NONE	RADIX_SLOT_NONE

#define SWAP_BLOCK_PAGES 32
#define SWAP_BLOCK_SHIFT 5		/* 1 << SWAP_BLOCK_SHIFT == SWAP_BLOCK_PAGES */
#define SWAP_BLOCK_MASK  (SWAP_BLOCK_PAGES - 1)
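
// A cache page index decomposes into a swap block and a slot within that
// block: the block is keyed by (page_index & ~SWAP_BLOCK_MASK), the slot
// within it is (page_index & SWAP_BLOCK_MASK). For example, page index 37
// belongs to the block keyed by page index 32, at slot offset 5.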


static const char* const kDefaultSwapPath = "/var/swap";

struct swap_file : DoublyLinkedListLinkImpl<swap_file> {
	int				fd;
	struct vnode*	vnode;
	void*			cookie;
	swap_addr_t		first_slot;
	swap_addr_t		last_slot;
	radix_bitmap*	bmp;
};

struct swap_hash_key {
	VMAnonymousCache	*cache;
	off_t				page_index;  // page index in the cache
};

// Each swap block contains swap address information for
// SWAP_BLOCK_PAGES contiguous pages from the same cache
struct swap_block {
	swap_block*		hash_link;
	swap_hash_key	key;
	uint32			used;
	swap_addr_t		swap_slots[SWAP_BLOCK_PAGES];
};

struct SwapHashTableDefinition {
	typedef swap_hash_key KeyType;
	typedef swap_block ValueType;

	SwapHashTableDefinition() {}

	size_t HashKey(const swap_hash_key& key) const
	{
		off_t blockIndex = key.page_index >> SWAP_BLOCK_SHIFT;
		VMAnonymousCache* cache = key.cache;
		return blockIndex ^ (size_t)(int*)cache;
	}

	size_t Hash(const swap_block* value) const
	{
		return HashKey(value->key);
	}

	bool Compare(const swap_hash_key& key, const swap_block* value) const
	{
		return (key.page_index & ~(off_t)SWAP_BLOCK_MASK)
				== (value->key.page_index & ~(off_t)SWAP_BLOCK_MASK)
			&& key.cache == value->key.cache;
	}

	swap_block*& GetLink(swap_block* value) const
	{
		return value->hash_link;
	}
};

typedef BOpenHashTable<SwapHashTableDefinition> SwapHashTable;
typedef DoublyLinkedList<swap_file> SwapFileList;

static SwapHashTable sSwapHashTable;
static rw_lock sSwapHashLock;

static SwapFileList sSwapFileList;
static mutex sSwapFileListLock;
static swap_file* sSwapFileAlloc = NULL; // allocate from here
static uint32 sSwapFileCount = 0;

static off_t sAvailSwapSpace = 0;
static mutex sAvailSwapSpaceLock;

static object_cache* sSwapBlockCache;


#if SWAP_TRACING
namespace SwapTracing {

class SwapTraceEntry : public AbstractTraceEntry {
public:
	SwapTraceEntry(VMAnonymousCache* cache)
		:
		fCache(cache)
	{
	}

protected:
	VMAnonymousCache*	fCache;
};


class ReadPage : public SwapTraceEntry {
public:
	ReadPage(VMAnonymousCache* cache, page_num_t pageIndex,
		swap_addr_t swapSlotIndex)
		:
		SwapTraceEntry(cache),
		fPageIndex(pageIndex),
		fSwapSlotIndex(swapSlotIndex)
	{
		Initialized();
	}

	virtual void AddDump(TraceOutput& out)
	{
		out.Print("swap read:  cache %p, page index: %lu <- swap slot: %lu",
			fCache, fPageIndex, fSwapSlotIndex);
	}

private:
	page_num_t		fPageIndex;
	swap_addr_t		fSwapSlotIndex;
};


class WritePage : public SwapTraceEntry {
public:
	WritePage(VMAnonymousCache* cache, page_num_t pageIndex,
		swap_addr_t swapSlotIndex)
		:
		SwapTraceEntry(cache),
		fPageIndex(pageIndex),
		fSwapSlotIndex(swapSlotIndex)
	{
		Initialized();
	}

	virtual void AddDump(TraceOutput& out)
	{
		out.Print("swap write: cache %p, page index: %lu -> swap slot: %lu",
			fCache, fPageIndex, fSwapSlotIndex);
	}

private:
	page_num_t		fPageIndex;
	swap_addr_t		fSwapSlotIndex;
};

}	// namespace SwapTracing

#	define T(x) new(std::nothrow) SwapTracing::x;
#else
#	define T(x) ;
#endif


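//!	KDL "swap" command: dumps the registered swap files and aggregate usage.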
static int
dump_swap_info(int argc, char** argv)
{
	swap_addr_t totalSwapPages = 0;
	swap_addr_t freeSwapPages = 0;

	kprintf("swap files:\n");

	for (SwapFileList::Iterator it = sSwapFileList.GetIterator();
		swap_file* file = it.Next();) {
		swap_addr_t total = file->last_slot - file->first_slot;
		kprintf("  vnode: %p, pages: total: %" B_PRIu32 ", free: %" B_PRIu32
			"\n", file->vnode, total, file->bmp->free_slots);

		totalSwapPages += total;
		freeSwapPages += file->bmp->free_slots;
	}

	kprintf("\n");
	kprintf("swap space in pages:\n");
	kprintf("total:     %9" B_PRIu32 "\n", totalSwapPages);
	kprintf("available: %9" B_PRIdOFF "\n", sAvailSwapSpace / B_PAGE_SIZE);
	kprintf("reserved:  %9" B_PRIdOFF "\n",
		totalSwapPages - sAvailSwapSpace / B_PAGE_SIZE);
	kprintf("used:      %9" B_PRIu32 "\n", totalSwapPages - freeSwapPages);
	kprintf("free:      %9" B_PRIu32 "\n", freeSwapPages);

	return 0;
}


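/*!	Allocates \a count contiguous slots from one of the registered swap
	files. The returned index is globally unique: each file owns the range
	[first_slot, last_slot), and the per-file bitmap result is offset by
	first_slot. Returns SWAP_SLOT_NONE if the request cannot be satisfied.
*/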
static swap_addr_t
swap_slot_alloc(uint32 count)
{
	mutex_lock(&sSwapFileListLock);

	if (sSwapFileList.IsEmpty()) {
		mutex_unlock(&sSwapFileListLock);
		panic("swap_slot_alloc(): no swap file in the system\n");
		return SWAP_SLOT_NONE;
	}

	// Since the radix bitmap cannot handle more than 32 pages, we return
	// SWAP_SLOT_NONE here; this forces Write() to adjust the allocation
	// amount.
	if (count > BITMAP_RADIX) {
		mutex_unlock(&sSwapFileListLock);
		return SWAP_SLOT_NONE;
	}

	swap_addr_t j, addr = SWAP_SLOT_NONE;
	for (j = 0; j < sSwapFileCount; j++) {
		if (sSwapFileAlloc == NULL)
			sSwapFileAlloc = sSwapFileList.First();

		addr = radix_bitmap_alloc(sSwapFileAlloc->bmp, count);
		if (addr != SWAP_SLOT_NONE) {
			addr += sSwapFileAlloc->first_slot;
			break;
		}

		// this swap_file is full, find another
		sSwapFileAlloc = sSwapFileList.GetNext(sSwapFileAlloc);
	}

	if (j == sSwapFileCount) {
		mutex_unlock(&sSwapFileListLock);
		panic("swap_slot_alloc: swap space exhausted!\n");
		return SWAP_SLOT_NONE;
	}

	// if this swap file has used more than 90% of its space,
	// switch to another
	if (sSwapFileAlloc->bmp->free_slots
		< (sSwapFileAlloc->last_slot - sSwapFileAlloc->first_slot) / 10) {
		sSwapFileAlloc = sSwapFileList.GetNext(sSwapFileAlloc);
	}

	mutex_unlock(&sSwapFileListLock);

	return addr;
}


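/*!	Returns the swap file whose slot range [first_slot, last_slot) contains
	\a slotIndex; panics (and returns NULL) if there is none.
*/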
static swap_file*
find_swap_file(swap_addr_t slotIndex)
{
	for (SwapFileList::Iterator it = sSwapFileList.GetIterator();
		swap_file* swapFile = it.Next();) {
		if (slotIndex >= swapFile->first_slot
			&& slotIndex < swapFile->last_slot) {
			return swapFile;
		}
	}

	panic("find_swap_file(): can't find swap file for slot %" B_PRIu32 "\n",
		slotIndex);
	return NULL;
}


static void
swap_slot_dealloc(swap_addr_t slotIndex, uint32 count)
{
	if (slotIndex == SWAP_SLOT_NONE)
		return;

	mutex_lock(&sSwapFileListLock);
	swap_file* swapFile = find_swap_file(slotIndex);
	slotIndex -= swapFile->first_slot;
	radix_bitmap_dealloc(swapFile->bmp, slotIndex, count);
	mutex_unlock(&sSwapFileListLock);
}


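/*!	Reserves up to \a amount bytes of swap space and returns the amount
	actually reserved, which may be less if the available space runs out.
*/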
static off_t
swap_space_reserve(off_t amount)
{
	mutex_lock(&sAvailSwapSpaceLock);
	if (sAvailSwapSpace >= amount)
		sAvailSwapSpace -= amount;
	else {
		amount = sAvailSwapSpace;
		sAvailSwapSpace = 0;
	}
	mutex_unlock(&sAvailSwapSpaceLock);

	return amount;
}


static void
swap_space_unreserve(off_t amount)
{
	mutex_lock(&sAvailSwapSpaceLock);
	sAvailSwapSpace += amount;
	mutex_unlock(&sAvailSwapSpaceLock);
}


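/*!	Kernel daemon hook that grows the swap hash table when needed. The
	allocation happens with the hash lock dropped, so the needed size is
	re-checked until a sufficiently large buffer could be installed.
*/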
static void
swap_hash_resizer(void*, int)
{
	WriteLocker locker(sSwapHashLock);

	size_t size;
	void* allocation;

	do {
		size = sSwapHashTable.ResizeNeeded();
		if (size == 0)
			return;

		locker.Unlock();

		allocation = malloc(size);
		if (allocation == NULL)
			return;

		locker.Lock();

	} while (!sSwapHashTable.Resize(allocation, size));
}


// #pragma mark -


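// Stacked asynchronous I/O callback that finalizes the swap bookkeeping for
// WriteAsync(): on success a newly allocated slot is registered in the swap
// hash; on failure it is released again. The next callback in the chain is
// notified either way.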
class VMAnonymousCache::WriteCallback : public StackableAsyncIOCallback {
public:
	WriteCallback(VMAnonymousCache* cache, AsyncIOCallback* callback)
		:
		StackableAsyncIOCallback(callback),
		fCache(cache)
	{
	}

	void SetTo(page_num_t pageIndex, swap_addr_t slotIndex, bool newSlot)
	{
		fPageIndex = pageIndex;
		fSlotIndex = slotIndex;
		fNewSlot = newSlot;
	}

	virtual void IOFinished(status_t status, bool partialTransfer,
		generic_size_t bytesTransferred)
	{
		if (fNewSlot) {
			if (status == B_OK) {
				fCache->_SwapBlockBuild(fPageIndex, fSlotIndex, 1);
			} else {
				AutoLocker<VMCache> locker(fCache);
				fCache->fAllocatedSwapSize -= B_PAGE_SIZE;
				locker.Unlock();

				swap_slot_dealloc(fSlotIndex, 1);
			}
		}

		fNextCallback->IOFinished(status, partialTransfer, bytesTransferred);

		delete this;
	}

private:
	VMAnonymousCache*	fCache;
	page_num_t			fPageIndex;
	swap_addr_t			fSlotIndex;
	bool				fNewSlot;
};


// #pragma mark -


VMAnonymousCache::~VMAnonymousCache()
{
	// free the allocated swap space and swap blocks
	for (off_t offset = virtual_base, toFree = fAllocatedSwapSize;
		offset < virtual_end && toFree > 0; offset += B_PAGE_SIZE) {
		swap_addr_t slotIndex = _SwapBlockGetAddress(offset >> PAGE_SHIFT);
		if (slotIndex == SWAP_SLOT_NONE)
			continue;

		swap_slot_dealloc(slotIndex, 1);
		_SwapBlockFree(offset >> PAGE_SHIFT, 1);
		toFree -= B_PAGE_SIZE;
	}

	swap_space_unreserve(fCommittedSwapSize);
	if (committed_size > fCommittedSwapSize)
		vm_unreserve_memory(committed_size - fCommittedSwapSize);
}


status_t
VMAnonymousCache::Init(bool canOvercommit, int32 numPrecommittedPages,
	int32 numGuardPages, uint32 allocationFlags)
{
	TRACE("%p->VMAnonymousCache::Init(canOvercommit = %s, "
		"numPrecommittedPages = %" B_PRId32 ", numGuardPages = %" B_PRId32
		")\n", this, canOvercommit ? "yes" : "no", numPrecommittedPages,
		numGuardPages);

	status_t error = VMCache::Init(CACHE_TYPE_RAM, allocationFlags);
	if (error != B_OK)
		return error;

	fCanOvercommit = canOvercommit;
	fHasPrecommitted = false;
	fPrecommittedPages = min_c(numPrecommittedPages, 255);
	fGuardedSize = numGuardPages * B_PAGE_SIZE;
	fCommittedSwapSize = 0;
	fAllocatedSwapSize = 0;

	return B_OK;
}


status_t
VMAnonymousCache::Resize(off_t newSize, int priority)
{
	// If the cache size shrinks, drop all swap pages beyond the new size.
	if (fAllocatedSwapSize > 0) {
		page_num_t oldPageCount = (virtual_end + B_PAGE_SIZE - 1) >> PAGE_SHIFT;
		swap_block* swapBlock = NULL;

		for (page_num_t pageIndex = (newSize + B_PAGE_SIZE - 1) >> PAGE_SHIFT;
			pageIndex < oldPageCount && fAllocatedSwapSize > 0; pageIndex++) {

			WriteLocker locker(sSwapHashLock);

			// Get the swap slot index for the page.
			swap_addr_t blockIndex = pageIndex & SWAP_BLOCK_MASK;
			if (swapBlock == NULL || blockIndex == 0) {
				swap_hash_key key = { this, pageIndex };
				swapBlock = sSwapHashTable.Lookup(key);

				if (swapBlock == NULL) {
					pageIndex = ROUNDUP(pageIndex + 1, SWAP_BLOCK_PAGES);
					continue;
				}
			}

			swap_addr_t slotIndex = swapBlock->swap_slots[blockIndex];
			vm_page* page;
			if (slotIndex != SWAP_SLOT_NONE
				&& ((page = LookupPage((off_t)pageIndex * B_PAGE_SIZE)) == NULL
					|| !page->busy)) {
					// TODO: We skip (i.e. leak) swap space of busy pages, since
					// there could be I/O going on (paging in/out). Waiting is
					// not an option as 1. unlocking the cache means that new
					// swap pages could be added in a range we've already
					// cleared (since the cache still has the old size) and 2.
					// we'd risk a deadlock in case we come from the file cache
					// and the FS holds the node's write-lock. We should mark
					// the page invalid and let the one responsible clean up.
					// There's just no such mechanism yet.
				swap_slot_dealloc(slotIndex, 1);
				fAllocatedSwapSize -= B_PAGE_SIZE;

				swapBlock->swap_slots[blockIndex] = SWAP_SLOT_NONE;
				if (--swapBlock->used == 0) {
					// All swap pages have been freed -- we can discard the swap
					// block.
					sSwapHashTable.RemoveUnchecked(swapBlock);
					object_cache_free(sSwapBlockCache, swapBlock,
						CACHE_DONT_WAIT_FOR_MEMORY
							| CACHE_DONT_LOCK_KERNEL_SPACE);
				}
			}
		}
	}

	return VMCache::Resize(newSize, priority);
}


status_t
VMAnonymousCache::Commit(off_t size, int priority)
{
	TRACE("%p->VMAnonymousCache::Commit(%" B_PRIdOFF ")\n", this, size);

	// If we can overcommit, we don't commit here, but in Fault(). We always
	// unreserve memory, if we're asked to shrink our commitment, though.
	if (fCanOvercommit && size > committed_size) {
		if (fHasPrecommitted)
			return B_OK;

		// pre-commit some pages to make a later failure less probable
		fHasPrecommitted = true;
		uint32 precommitted = fPrecommittedPages * B_PAGE_SIZE;
		if (size > precommitted)
			size = precommitted;
	}

	return _Commit(size, priority);
}


bool
VMAnonymousCache::HasPage(off_t offset)
{
	if (_SwapBlockGetAddress(offset >> PAGE_SHIFT) != SWAP_SLOT_NONE)
		return true;

	return false;
}


bool
VMAnonymousCache::DebugHasPage(off_t offset)
{
	page_num_t pageIndex = offset >> PAGE_SHIFT;
	swap_hash_key key = { this, pageIndex };
	swap_block* swap = sSwapHashTable.Lookup(key);
	if (swap == NULL)
		return false;

	return swap->swap_slots[pageIndex & SWAP_BLOCK_MASK] != SWAP_SLOT_NONE;
}


status_t
VMAnonymousCache::Read(off_t offset, const generic_io_vec* vecs, size_t count,
	uint32 flags, generic_size_t* _numBytes)
{
	off_t pageIndex = offset >> PAGE_SHIFT;

	for (uint32 i = 0, j = 0; i < count; i = j) {
		swap_addr_t startSlotIndex = _SwapBlockGetAddress(pageIndex + i);
		for (j = i + 1; j < count; j++) {
			swap_addr_t slotIndex = _SwapBlockGetAddress(pageIndex + j);
			if (slotIndex != startSlotIndex + j - i)
				break;
		}

		T(ReadPage(this, pageIndex, startSlotIndex));
			// TODO: Assumes that only one page is read.

		swap_file* swapFile = find_swap_file(startSlotIndex);

		off_t pos = (off_t)(startSlotIndex - swapFile->first_slot)
			* B_PAGE_SIZE;

		status_t status = vfs_read_pages(swapFile->vnode, swapFile->cookie, pos,
			vecs + i, j - i, flags, _numBytes);
		if (status != B_OK)
			return status;
	}

	return B_OK;
}


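/*!	Writes the given page range to swap. Swap slots previously assigned to
	the range are freed first, then new slots are allocated and written
	through the VFS. If a contiguous run of slots cannot be found, the
	allocation size is halved until it succeeds (see the retry loop below).
*/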
status_t
VMAnonymousCache::Write(off_t offset, const generic_io_vec* vecs, size_t count,
	uint32 flags, generic_size_t* _numBytes)
{
	off_t pageIndex = offset >> PAGE_SHIFT;

	AutoLocker<VMCache> locker(this);

	page_num_t totalPages = 0;
	for (uint32 i = 0; i < count; i++) {
		page_num_t pageCount = (vecs[i].length + B_PAGE_SIZE - 1) >> PAGE_SHIFT;
		swap_addr_t slotIndex = _SwapBlockGetAddress(pageIndex + totalPages);
		if (slotIndex != SWAP_SLOT_NONE) {
			swap_slot_dealloc(slotIndex, pageCount);
			_SwapBlockFree(pageIndex + totalPages, pageCount);
			fAllocatedSwapSize -= pageCount * B_PAGE_SIZE;
		}

		totalPages += pageCount;
	}

	off_t totalSize = totalPages * B_PAGE_SIZE;
	if (fAllocatedSwapSize + totalSize > fCommittedSwapSize)
		return B_ERROR;

	fAllocatedSwapSize += totalSize;
	locker.Unlock();

	page_num_t pagesLeft = totalPages;
	totalPages = 0;

	for (uint32 i = 0; i < count; i++) {
		page_num_t pageCount = (vecs[i].length + B_PAGE_SIZE - 1) >> PAGE_SHIFT;

		generic_addr_t vectorBase = vecs[i].base;
		generic_size_t vectorLength = vecs[i].length;
		page_num_t n = pageCount;

		for (page_num_t j = 0; j < pageCount; j += n) {
			swap_addr_t slotIndex;
			// try to allocate n slots; if that fails, try n/2, n/4, etc.
			while ((slotIndex = swap_slot_alloc(n)) == SWAP_SLOT_NONE && n >= 2)
				n >>= 1;

			if (slotIndex == SWAP_SLOT_NONE)
				panic("VMAnonymousCache::Write(): can't allocate swap space\n");

			T(WritePage(this, pageIndex, slotIndex));
				// TODO: Assumes that only one page is written.

			swap_file* swapFile = find_swap_file(slotIndex);

			off_t pos = (off_t)(slotIndex - swapFile->first_slot) * B_PAGE_SIZE;

			generic_size_t length = (phys_addr_t)n * B_PAGE_SIZE;
			generic_io_vec vector[1];
			vector->base = vectorBase;
			vector->length = length;

			status_t status = vfs_write_pages(swapFile->vnode, swapFile->cookie,
				pos, vector, 1, flags, &length);
			if (status != B_OK) {
				locker.Lock();
				fAllocatedSwapSize -= (off_t)pagesLeft * B_PAGE_SIZE;
				locker.Unlock();

				swap_slot_dealloc(slotIndex, n);
				return status;
			}

			_SwapBlockBuild(pageIndex + totalPages, slotIndex, n);
			pagesLeft -= n;

			if (n != pageCount) {
				vectorBase = vectorBase + n * B_PAGE_SIZE;
				vectorLength -= n * B_PAGE_SIZE;
			}
		}

		totalPages += pageCount;
	}

	ASSERT(pagesLeft == 0);
	return B_OK;
}


status_t
VMAnonymousCache::WriteAsync(off_t offset, const generic_io_vec* vecs,
	size_t count, generic_size_t numBytes, uint32 flags,
	AsyncIOCallback* _callback)
{
	// TODO: Currently this method is only used for single pages. Either make
	// more flexible use of it or change the interface!
	// This implementation relies on the current usage!
	ASSERT(count == 1);
	ASSERT(numBytes <= B_PAGE_SIZE);

	page_num_t pageIndex = offset >> PAGE_SHIFT;
	swap_addr_t slotIndex = _SwapBlockGetAddress(pageIndex);
	bool newSlot = slotIndex == SWAP_SLOT_NONE;

	// If the page doesn't have any swap space yet, allocate it.
	if (newSlot) {
		AutoLocker<VMCache> locker(this);
		if (fAllocatedSwapSize + B_PAGE_SIZE > fCommittedSwapSize) {
			_callback->IOFinished(B_ERROR, true, 0);
			return B_ERROR;
		}

		fAllocatedSwapSize += B_PAGE_SIZE;

		slotIndex = swap_slot_alloc(1);
	}

	// create our callback
	WriteCallback* callback = (flags & B_VIP_IO_REQUEST) != 0
		? new(malloc_flags(HEAP_PRIORITY_VIP)) WriteCallback(this, _callback)
		: new(std::nothrow) WriteCallback(this, _callback);
	if (callback == NULL) {
		if (newSlot) {
			AutoLocker<VMCache> locker(this);
			fAllocatedSwapSize -= B_PAGE_SIZE;
			locker.Unlock();

			swap_slot_dealloc(slotIndex, 1);
		}
		_callback->IOFinished(B_NO_MEMORY, true, 0);
		return B_NO_MEMORY;
	}
	// TODO: If the page already had swap space assigned, we don't need a
	// callback of our own.

	callback->SetTo(pageIndex, slotIndex, newSlot);

	T(WritePage(this, pageIndex, slotIndex));

	// write the page asynchronously
	swap_file* swapFile = find_swap_file(slotIndex);
	off_t pos = (off_t)(slotIndex - swapFile->first_slot) * B_PAGE_SIZE;

	return vfs_asynchronous_write_pages(swapFile->vnode, swapFile->cookie, pos,
		vecs, 1, numBytes, flags, callback);
}


bool
VMAnonymousCache::CanWritePage(off_t offset)
{
	// We can write the page, if we have not used all of our committed swap
	// space or the page already has a swap slot assigned.
	return fAllocatedSwapSize < fCommittedSwapSize
		|| _SwapBlockGetAddress(offset >> PAGE_SHIFT) != SWAP_SLOT_NONE;
}


int32
VMAnonymousCache::MaxPagesPerAsyncWrite() const
{
	return 1;
}


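/*!	For overcommitting caches the real commitment happens here, when a page
	is faulted in for the first time: one page worth of swap space or,
	failing that, RAM is reserved. Returning B_BAD_HANDLER lets
	vm_soft_fault() continue with the regular fault handling.
*/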
status_t
VMAnonymousCache::Fault(struct VMAddressSpace* aspace, off_t offset)
{
	if (fGuardedSize > 0) {
		uint32 guardOffset;

#ifdef STACK_GROWS_DOWNWARDS
		guardOffset = 0;
#elif defined(STACK_GROWS_UPWARDS)
		guardOffset = virtual_size - fGuardedSize;
#else
#	error Stack direction has not been defined in arch_config.h
#endif
		// report stack fault, guard page hit!
		if (offset >= guardOffset && offset < guardOffset + fGuardedSize) {
			TRACE(("stack overflow!\n"));
			return B_BAD_ADDRESS;
		}
	}

	if (fCanOvercommit && LookupPage(offset) == NULL && !HasPage(offset)) {
		if (fPrecommittedPages == 0) {
			// never commit more than needed
			if (committed_size / B_PAGE_SIZE > page_count)
				return B_BAD_HANDLER;

			// try to commit additional swap space/memory
			if (swap_space_reserve(B_PAGE_SIZE) == B_PAGE_SIZE) {
				fCommittedSwapSize += B_PAGE_SIZE;
			} else {
				int priority = aspace == VMAddressSpace::Kernel()
					? VM_PRIORITY_SYSTEM : VM_PRIORITY_USER;
				if (vm_try_reserve_memory(B_PAGE_SIZE, priority, 0) != B_OK) {
					dprintf("%p->VMAnonymousCache::Fault(): Failed to reserve "
						"%d bytes of RAM.\n", this, (int)B_PAGE_SIZE);
					return B_NO_MEMORY;
				}
			}

			committed_size += B_PAGE_SIZE;
		} else
			fPrecommittedPages--;
	}

	// This will cause vm_soft_fault() to handle the fault
	return B_BAD_HANDLER;
}


void
VMAnonymousCache::Merge(VMCache* _source)
{
	VMAnonymousCache* source = dynamic_cast<VMAnonymousCache*>(_source);
	if (source == NULL) {
		panic("VMAnonymousCache::MergeStore(): merge with incompatible cache "
			"%p requested", _source);
		return;
	}

	// take over the source's committed size
	fCommittedSwapSize += source->fCommittedSwapSize;
	source->fCommittedSwapSize = 0;
	committed_size += source->committed_size;
	source->committed_size = 0;

	off_t actualSize = virtual_end - virtual_base;
	if (committed_size > actualSize)
		_Commit(actualSize, VM_PRIORITY_USER);

	// Move all not shadowed swap pages from the source to the consumer cache.
	// Also remove all source pages that are shadowed by consumer swap pages.
	_MergeSwapPages(source);

	// Move all not shadowed pages from the source to the consumer cache.
	if (source->page_count < page_count)
		_MergePagesSmallerSource(source);
	else
		_MergePagesSmallerConsumer(source);
}


void
VMAnonymousCache::DeleteObject()
{
	object_cache_delete(gAnonymousCacheObjectCache, this);
}


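/*!	Records that the \a count pages starting at \a startPageIndex are backed
	by the contiguous swap slots starting at \a startSlotIndex. Swap blocks
	are allocated on demand, retrying until memory becomes available.
*/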
void
VMAnonymousCache::_SwapBlockBuild(off_t startPageIndex,
	swap_addr_t startSlotIndex, uint32 count)
{
	WriteLocker locker(sSwapHashLock);

	uint32 left = count;
	for (uint32 i = 0, j = 0; i < count; i += j) {
		off_t pageIndex = startPageIndex + i;
		swap_addr_t slotIndex = startSlotIndex + i;

		swap_hash_key key = { this, pageIndex };

		swap_block* swap = sSwapHashTable.Lookup(key);
		while (swap == NULL) {
			swap = (swap_block*)object_cache_alloc(sSwapBlockCache,
				CACHE_DONT_WAIT_FOR_MEMORY | CACHE_DONT_LOCK_KERNEL_SPACE);
			if (swap == NULL) {
				// Wait a short time until memory is available again.
				locker.Unlock();
				snooze(10000);
				locker.Lock();
				swap = sSwapHashTable.Lookup(key);
				continue;
			}

			swap->key.cache = this;
			swap->key.page_index = pageIndex & ~(off_t)SWAP_BLOCK_MASK;
			swap->used = 0;
			for (uint32 i = 0; i < SWAP_BLOCK_PAGES; i++)
				swap->swap_slots[i] = SWAP_SLOT_NONE;

			sSwapHashTable.InsertUnchecked(swap);
		}

		swap_addr_t blockIndex = pageIndex & SWAP_BLOCK_MASK;
		for (j = 0; blockIndex < SWAP_BLOCK_PAGES && left > 0; j++) {
			swap->swap_slots[blockIndex++] = slotIndex + j;
			left--;
		}

		swap->used += j;
	}
}


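/*!	Clears the swap slot assignments of the \a count pages starting at
	\a startPageIndex, discarding swap blocks that become empty.
*/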
void
VMAnonymousCache::_SwapBlockFree(off_t startPageIndex, uint32 count)
{
	WriteLocker locker(sSwapHashLock);

	uint32 left = count;
	for (uint32 i = 0, j = 0; i < count; i += j) {
		off_t pageIndex = startPageIndex + i;
		swap_hash_key key = { this, pageIndex };
		swap_block* swap = sSwapHashTable.Lookup(key);

		ASSERT(swap != NULL);

		swap_addr_t blockIndex = pageIndex & SWAP_BLOCK_MASK;
		for (j = 0; blockIndex < SWAP_BLOCK_PAGES && left > 0; j++) {
			swap->swap_slots[blockIndex++] = SWAP_SLOT_NONE;
			left--;
		}

		swap->used -= j;
		if (swap->used == 0) {
			sSwapHashTable.RemoveUnchecked(swap);
			object_cache_free(sSwapBlockCache, swap,
				CACHE_DONT_WAIT_FOR_MEMORY | CACHE_DONT_LOCK_KERNEL_SPACE);
		}
	}
}


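//!	Returns the swap slot assigned to \a pageIndex, or SWAP_SLOT_NONE.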
swap_addr_t
VMAnonymousCache::_SwapBlockGetAddress(off_t pageIndex)
{
	ReadLocker locker(sSwapHashLock);

	swap_hash_key key = { this, pageIndex };
	swap_block* swap = sSwapHashTable.Lookup(key);
	swap_addr_t slotIndex = SWAP_SLOT_NONE;

	if (swap != NULL) {
		swap_addr_t blockIndex = pageIndex & SWAP_BLOCK_MASK;
		slotIndex = swap->swap_slots[blockIndex];
	}

	return slotIndex;
}


status_t
VMAnonymousCache::_Commit(off_t size, int priority)
{
	TRACE("%p->VMAnonymousCache::_Commit(%" B_PRIdOFF "), already committed: "
		"%" B_PRIdOFF " (%" B_PRIdOFF " swap)\n", this, size, committed_size,
		fCommittedSwapSize);

	// Basic strategy: reserve swap space first, only when running out of swap
	// space, reserve real memory.

	off_t committedMemory = committed_size - fCommittedSwapSize;

	// Regardless of whether we're asked to grow or shrink the commitment,
	// we always try to reserve as much as possible of the final commitment
	// in the swap space.
	if (size > fCommittedSwapSize) {
		fCommittedSwapSize += swap_space_reserve(size - fCommittedSwapSize);
		committed_size = fCommittedSwapSize + committedMemory;
		if (size > fCommittedSwapSize) {
			TRACE("%p->VMAnonymousCache::_Commit(%" B_PRIdOFF "), reserved "
				"only %" B_PRIdOFF " swap\n", this, size, fCommittedSwapSize);
		}
	}

	if (committed_size == size)
		return B_OK;

	if (committed_size > size) {
		// The commitment shrinks -- unreserve real memory first.
		off_t toUnreserve = committed_size - size;
		if (committedMemory > 0) {
			off_t unreserved = min_c(toUnreserve, committedMemory);
			vm_unreserve_memory(unreserved);
			committedMemory -= unreserved;
			committed_size -= unreserved;
			toUnreserve -= unreserved;
		}

		// Unreserve swap space.
		if (toUnreserve > 0) {
			swap_space_unreserve(toUnreserve);
			fCommittedSwapSize -= toUnreserve;
			committed_size -= toUnreserve;
		}

		return B_OK;
	}

	// The commitment grows -- we have already tried to reserve swap space at
	// the start of the method, so we try to reserve real memory, now.

	off_t toReserve = size - committed_size;
	if (vm_try_reserve_memory(toReserve, priority, 1000000) != B_OK) {
		dprintf("%p->VMAnonymousCache::_Commit(%" B_PRIdOFF "): Failed to "
			"reserve %" B_PRIdOFF " bytes of RAM\n", this, size, toReserve);
		return B_NO_MEMORY;
	}

	committed_size = size;
	return B_OK;
}


void
VMAnonymousCache::_MergePagesSmallerSource(VMAnonymousCache* source)
{
	// The source cache has fewer pages than the consumer (this cache), so we
	// iterate through the source's pages and move the ones that are not
	// shadowed up to the consumer.

	for (VMCachePagesTree::Iterator it = source->pages.GetIterator();
			vm_page* page = it.Next();) {
		// Note: Removing the current node while iterating through an
		// IteratableSplayTree is safe.
		vm_page* consumerPage = LookupPage(
			(off_t)page->cache_offset << PAGE_SHIFT);
		if (consumerPage == NULL) {
			// the page is not yet in the consumer cache - move it upwards
			ASSERT_PRINT(!page->busy, "page: %p", page);
			MovePage(page);
		}
	}
}


void
VMAnonymousCache::_MergePagesSmallerConsumer(VMAnonymousCache* source)
{
	// The consumer (this cache) has fewer pages than the source, so we move
	// the consumer's pages to the source (freeing shadowed ones) and finally
	// just all pages of the source back to the consumer.

	for (VMCachePagesTree::Iterator it = pages.GetIterator();
		vm_page* page = it.Next();) {
		// If a source page is in the way, remove and free it.
		vm_page* sourcePage = source->LookupPage(
			(off_t)page->cache_offset << PAGE_SHIFT);
		if (sourcePage != NULL) {
			DEBUG_PAGE_ACCESS_START(sourcePage);
			ASSERT_PRINT(!sourcePage->busy, "page: %p", sourcePage);
			source->RemovePage(sourcePage);
			vm_page_free(source, sourcePage);
		}

		// Note: Removing the current node while iterating through an
		// IteratableSplayTree is safe.
		source->MovePage(page);
	}

	MoveAllPages(source);
}


void
VMAnonymousCache::_MergeSwapPages(VMAnonymousCache* source)
{
	// If neither source nor consumer have swap pages, we don't have to do
	// anything.
	if (source->fAllocatedSwapSize == 0 && fAllocatedSwapSize == 0)
		return;

	for (off_t offset = source->virtual_base
		& ~(off_t)(B_PAGE_SIZE * SWAP_BLOCK_PAGES - 1);
		offset < source->virtual_end;
		offset += B_PAGE_SIZE * SWAP_BLOCK_PAGES) {

		WriteLocker locker(sSwapHashLock);

		page_num_t swapBlockPageIndex = offset >> PAGE_SHIFT;
		swap_hash_key key = { source, swapBlockPageIndex };
		swap_block* sourceSwapBlock = sSwapHashTable.Lookup(key);

		// remove the source swap block -- we will either take over the swap
		// space (and the block) or free it
		if (sourceSwapBlock != NULL)
			sSwapHashTable.RemoveUnchecked(sourceSwapBlock);

		key.cache = this;
		swap_block* swapBlock = sSwapHashTable.Lookup(key);

		locker.Unlock();

		// remove all source pages that are shadowed by consumer swap pages
		if (swapBlock != NULL) {
			for (uint32 i = 0; i < SWAP_BLOCK_PAGES; i++) {
				if (swapBlock->swap_slots[i] != SWAP_SLOT_NONE) {
					vm_page* page = source->LookupPage(
						(off_t)(swapBlockPageIndex + i) << PAGE_SHIFT);
					if (page != NULL) {
						DEBUG_PAGE_ACCESS_START(page);
						ASSERT_PRINT(!page->busy, "page: %p", page);
						source->RemovePage(page);
						vm_page_free(source, page);
					}
				}
			}
		}

		if (sourceSwapBlock == NULL)
			continue;

		for (uint32 i = 0; i < SWAP_BLOCK_PAGES; i++) {
			off_t pageIndex = swapBlockPageIndex + i;
			swap_addr_t sourceSlotIndex = sourceSwapBlock->swap_slots[i];

			if (sourceSlotIndex == SWAP_SLOT_NONE)
				continue;

			if ((swapBlock != NULL
					&& swapBlock->swap_slots[i] != SWAP_SLOT_NONE)
				|| LookupPage((off_t)pageIndex << PAGE_SHIFT) != NULL) {
				// The consumer already has a page or a swapped out page
				// at this index. So we can free the source swap space.
				swap_slot_dealloc(sourceSlotIndex, 1);
				sourceSwapBlock->swap_slots[i] = SWAP_SLOT_NONE;
				sourceSwapBlock->used--;
			}

			// We've either freed the source swap page or are going to move it
			// to the consumer. At any rate, the source cache doesn't own it
			// anymore.
			source->fAllocatedSwapSize -= B_PAGE_SIZE;
		}

		// All source swap pages that have not been freed yet are taken over by
		// the consumer.
		fAllocatedSwapSize += B_PAGE_SIZE * (off_t)sourceSwapBlock->used;

		if (sourceSwapBlock->used == 0) {
			// All swap pages have been freed -- we can discard the source swap
			// block.
			object_cache_free(sSwapBlockCache, sourceSwapBlock,
				CACHE_DONT_WAIT_FOR_MEMORY | CACHE_DONT_LOCK_KERNEL_SPACE);
		} else if (swapBlock == NULL) {
			// We need to take over some of the source's swap pages and there's
			// no swap block in the consumer cache. Just take over the source
			// swap block.
			sourceSwapBlock->key.cache = this;
			locker.Lock();
			sSwapHashTable.InsertUnchecked(sourceSwapBlock);
			locker.Unlock();
		} else {
			// We need to take over some of the source's swap pages and there's
			// already a swap block in the consumer cache. Copy the respective
			// swap addresses and discard the source swap block.
			for (uint32 i = 0; i < SWAP_BLOCK_PAGES; i++) {
				if (sourceSwapBlock->swap_slots[i] != SWAP_SLOT_NONE)
					swapBlock->swap_slots[i] = sourceSwapBlock->swap_slots[i];
			}

			object_cache_free(sSwapBlockCache, sourceSwapBlock,
				CACHE_DONT_WAIT_FOR_MEMORY | CACHE_DONT_LOCK_KERNEL_SPACE);
		}
	}
}


// #pragma mark -


// TODO: This can be removed if we get BFS UUIDs.
struct VolumeInfo {
	char name[B_FILE_NAME_LENGTH];
	char device[B_FILE_NAME_LENGTH];
	char filesystem[B_OS_NAME_LENGTH];
	off_t capacity;
};


class PartitionScorer : public KPartitionVisitor {
public:
	PartitionScorer(VolumeInfo& volumeInfo)
		:
		fBestPartition(NULL),
		fBestScore(-1),
		fVolumeInfo(volumeInfo)
	{
	}

	virtual bool VisitPre(KPartition* partition)
	{
		if (!partition->ContainsFileSystem())
			return false;

		KPath path;
		partition->GetPath(&path);

		int score = 0;
		if (strcmp(fVolumeInfo.name, partition->ContentName()) == 0)
			score += 4;
		if (strcmp(fVolumeInfo.device, path.Path()) == 0)
			score += 3;
		if (fVolumeInfo.capacity == partition->Size())
			score += 2;
		if (strcmp(fVolumeInfo.filesystem,
			partition->DiskSystem()->ShortName()) == 0) {
			score += 1;
		}
		if (score >= 4 && score > fBestScore) {
			fBestPartition = partition;
			fBestScore = score;
		}

		return false;
	}

	KPartition* fBestPartition;

private:
	int32		fBestScore;
	VolumeInfo	fVolumeInfo;
};


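/*!	Derives a mount point path from the partition's content name (falling
	back to its name, then to "unnamed volume"), replacing '/' characters
	and appending a numeric suffix until the path doesn't exist yet.
*/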
status_t
get_mount_point(KPartition* partition, KPath* mountPoint)
{
	if (!mountPoint || !partition->ContainsFileSystem())
		return B_BAD_VALUE;

	const char* volumeName = partition->ContentName();
	if (!volumeName || strlen(volumeName) == 0)
		volumeName = partition->Name();
	if (!volumeName || strlen(volumeName) == 0)
		volumeName = "unnamed volume";

	char basePath[B_PATH_NAME_LENGTH];
	int32 len = snprintf(basePath, sizeof(basePath), "/%s", volumeName);
	for (int32 i = 1; i < len; i++) {
		if (basePath[i] == '/')
			basePath[i] = '-';
	}

	char* path = mountPoint->LockBuffer();
	int32 pathLen = mountPoint->BufferSize();
	strncpy(path, basePath, pathLen);

	struct stat dummy;
	for (int i = 1; ; i++) {
		if (stat(path, &dummy) != 0)
			break;
		snprintf(path, pathLen, "%s%d", basePath, i);
	}

	mountPoint->UnlockBuffer();
	return B_OK;
}


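/*!	Registers the file at \a path as a swap file: the file is opened
	uncached, validated, and appended to the swap file list with a fresh
	global slot range; the available swap space grows accordingly.
*/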
status_t
swap_file_add(const char* path)
{
	// open the file
	int fd = open(path, O_RDWR | O_NOCACHE, S_IRUSR | S_IWUSR);
	if (fd < 0)
		return errno;

	// fstat() it and check whether we can use it
	struct stat st;
	if (fstat(fd, &st) < 0) {
		close(fd);
		return errno;
	}

	if (!(S_ISREG(st.st_mode) || S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode))) {
		close(fd);
		return B_BAD_VALUE;
	}

	if (st.st_size < B_PAGE_SIZE) {
		close(fd);
		return B_BAD_VALUE;
	}

	// get file descriptor, vnode, and cookie
	file_descriptor* descriptor = get_fd(get_current_io_context(true), fd);
	put_fd(descriptor);

	vnode* node = fd_vnode(descriptor);
	if (node == NULL) {
		close(fd);
		return B_BAD_VALUE;
	}

	// do the allocations and prepare the swap_file structure
	swap_file* swap = (swap_file*)malloc(sizeof(swap_file));
	if (swap == NULL) {
		close(fd);
		return B_NO_MEMORY;
	}

	swap->fd = fd;
	swap->vnode = node;
	swap->cookie = descriptor->cookie;

	uint32 pageCount = st.st_size >> PAGE_SHIFT;
	swap->bmp = radix_bitmap_create(pageCount);
	if (swap->bmp == NULL) {
		free(swap);
		close(fd);
		return B_NO_MEMORY;
	}

	// set slot index and add this file to swap file list
	mutex_lock(&sSwapFileListLock);
	// TODO: Also check whether the swap file is already registered!
	if (sSwapFileList.IsEmpty()) {
		swap->first_slot = 0;
		swap->last_slot = pageCount;
	} else {
		// leave a one-page gap between two swap files
		swap->first_slot = sSwapFileList.Last()->last_slot + 1;
		swap->last_slot = swap->first_slot + pageCount;
	}
	sSwapFileList.Add(swap);
	sSwapFileCount++;
	mutex_unlock(&sSwapFileListLock);

	mutex_lock(&sAvailSwapSpaceLock);
	sAvailSwapSpace += (off_t)pageCount * B_PAGE_SIZE;
	mutex_unlock(&sAvailSwapSpaceLock);

	return B_OK;
}


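/*!	Unregisters the swap file at \a path. Fails if any of its slots are
	still in use.
*/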
status_t
swap_file_delete(const char* path)
{
	vnode* node = NULL;
	status_t status = vfs_get_vnode_from_path(path, true, &node);
	if (status != B_OK)
		return status;

	MutexLocker locker(sSwapFileListLock);

	swap_file* swapFile = NULL;
	for (SwapFileList::Iterator it = sSwapFileList.GetIterator();
			(swapFile = it.Next()) != NULL;) {
		if (swapFile->vnode == node)
			break;
	}

	vfs_put_vnode(node);

	if (swapFile == NULL)
		return B_ERROR;

	// if this file is currently in use, we can't delete it
	// TODO: mark this swap file as deleting, and remove it after releasing
	// all the swap space
	if (swapFile->bmp->free_slots < swapFile->last_slot - swapFile->first_slot)
		return B_ERROR;

	sSwapFileList.Remove(swapFile);
	sSwapFileCount--;
	locker.Unlock();

	mutex_lock(&sAvailSwapSpaceLock);
	sAvailSwapSpace -= (off_t)(swapFile->last_slot - swapFile->first_slot)
		* B_PAGE_SIZE;
	mutex_unlock(&sAvailSwapSpaceLock);

	close(swapFile->fd);
	radix_bitmap_destroy(swapFile->bmp);
	free(swapFile);

	return B_OK;
}


void
swap_init(void)
{
	// create swap block cache
	sSwapBlockCache = create_object_cache("swapblock", sizeof(swap_block),
		sizeof(void*), NULL, NULL, NULL);
	if (sSwapBlockCache == NULL)
		panic("swap_init(): can't create object cache for swap blocks\n");

	status_t error = object_cache_set_minimum_reserve(sSwapBlockCache,
		MIN_SWAP_BLOCK_RESERVE);
	if (error != B_OK) {
		panic("swap_init(): object_cache_set_minimum_reserve() failed: %s",
			strerror(error));
	}

	// init swap hash table
	sSwapHashTable.Init(INITIAL_SWAP_HASH_SIZE);
	rw_lock_init(&sSwapHashLock, "swaphash");

	error = register_resource_resizer(swap_hash_resizer, NULL,
		SWAP_HASH_RESIZE_INTERVAL);
	if (error != B_OK) {
		panic("swap_init(): Failed to register swap hash resizer: %s",
			strerror(error));
	}

	// init swap file list
	mutex_init(&sSwapFileListLock, "swaplist");
	sSwapFileAlloc = NULL;
	sSwapFileCount = 0;

	// init available swap space
	mutex_init(&sAvailSwapSpaceLock, "avail swap space");
	sAvailSwapSpace = 0;

	add_debugger_command_etc("swap", &dump_swap_info,
		"Print information about swap usage",
		"\n"
		"Print information about swap usage.\n", 0);
}


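/*!	Creates and registers the swap file, either as configured in the
	"virtual_memory" driver settings or automatically (on the boot volume,
	sized relative to the installed RAM).
*/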
void
swap_init_post_modules()
{
	// Never try to create a swap file on a read-only device - when booting
	// from CD, the write overlay is used.
	if (gReadOnlyBootDevice)
		return;

	bool swapEnabled = true;
	bool swapAutomatic = true;
	off_t swapSize = 0;

	dev_t swapDeviceID = -1;
	VolumeInfo selectedVolume = {};

	void* settings = load_driver_settings("virtual_memory");

	if (settings != NULL) {
		// We pass a lot of information on the swap device, this is mostly to
		// ensure that we are dealing with the same device that was configured.

		// TODO: Some kind of BFS uuid would be great here :)
		const char* enabled = get_driver_parameter(settings, "vm", NULL, NULL);

		if (enabled != NULL) {
			swapEnabled = get_driver_boolean_parameter(settings, "vm",
				true, false);
			swapAutomatic = get_driver_boolean_parameter(settings, "swap_auto",
				true, false);

			if (swapEnabled && !swapAutomatic) {
				const char* size = get_driver_parameter(settings, "swap_size",
					NULL, NULL);
				const char* volume = get_driver_parameter(settings,
					"swap_volume_name", NULL, NULL);
				const char* device = get_driver_parameter(settings,
					"swap_volume_device", NULL, NULL);
				const char* filesystem = get_driver_parameter(settings,
					"swap_volume_filesystem", NULL, NULL);
				const char* capacity = get_driver_parameter(settings,
					"swap_volume_capacity", NULL, NULL);

				if (size != NULL && device != NULL && volume != NULL
					&& filesystem != NULL && capacity != NULL) {
					// User specified a size / volume that seems valid
					swapAutomatic = false;
					swapSize = atoll(size);
					strncpy(selectedVolume.name, volume,
						sizeof(selectedVolume.name));
					strncpy(selectedVolume.device, device,
						sizeof(selectedVolume.device));
					strncpy(selectedVolume.filesystem, filesystem,
						sizeof(selectedVolume.filesystem));
					selectedVolume.capacity = atoll(capacity);
				} else {
					// Something isn't right with the swap config, go auto
					swapAutomatic = true;
					dprintf("%s: virtual_memory configuration is invalid, "
						"using automatic swap\n", __func__);
				}
			}
			unload_driver_settings(settings);
		}
	}

	if (swapAutomatic) {
		swapSize = (off_t)vm_page_num_pages() * B_PAGE_SIZE;
		if (swapSize <= (1024 * 1024 * 1024)) {
			// With 1 GB of RAM or less, use twice the RAM size as swap.
			swapSize *= 2;
		}
		// Automatic swap defaults to the boot device
		swapDeviceID = gBootDevice;
	}

	if (!swapEnabled || swapSize < B_PAGE_SIZE) {
		dprintf("%s: virtual_memory is disabled\n", __func__);
		return;
	}

	if (!swapAutomatic && swapDeviceID < 0) {
		// If user-specified swap, and no swap device has been chosen yet...
		KDiskDeviceManager::CreateDefault();
		KDiskDeviceManager* manager = KDiskDeviceManager::Default();
		PartitionScorer visitor(selectedVolume);

		KDiskDevice* device;
		int32 cookie = 0;
		while ((device = manager->NextDevice(&cookie)) != NULL) {
			if (device->IsReadOnlyMedia() || device->IsWriteOnce()
				|| device->IsRemovable()) {
				continue;
			}
			device->VisitEachDescendant(&visitor);
		}

		if (!visitor.fBestPartition) {
			dprintf("%s: Can't find configured swap partition '%s'\n",
				__func__, selectedVolume.name);
		} else {
			if (visitor.fBestPartition->IsMounted())
				swapDeviceID = visitor.fBestPartition->VolumeID();
			else {
				KPath devPath, mountPoint;
				visitor.fBestPartition->GetPath(&devPath);
				get_mount_point(visitor.fBestPartition, &mountPoint);
				const char* mountPath = mountPoint.Path();
				mkdir(mountPath, S_IRWXU | S_IRWXG | S_IRWXO);
				swapDeviceID = _kern_mount(mountPath, devPath.Path(),
					NULL, 0, NULL, 0);
				if (swapDeviceID < 0) {
					dprintf("%s: Can't mount configured swap partition '%s'\n",
						__func__, selectedVolume.name);
				}
			}
		}
	}

	if (swapDeviceID < 0)
		swapDeviceID = gBootDevice;

	// We now have a swapDeviceID which is used for the swap file

	KPath path;
	struct fs_info info;
	_kern_read_fs_info(swapDeviceID, &info);
	if (swapDeviceID == gBootDevice)
		path = kDefaultSwapPath;
	else {
		vfs_entry_ref_to_path(info.dev, info.root,
			".", path.LockBuffer(), path.BufferSize());
		path.UnlockBuffer();
		path.Append("swap");
	}

	const char* swapPath = path.Path();

	// Swap size limits prevent oversized swap files
	if (swapAutomatic) {
		off_t existingSwapSize = 0;
		struct stat existingSwapStat;
		if (stat(swapPath, &existingSwapStat) == 0)
			existingSwapSize = existingSwapStat.st_size;

		off_t freeSpace = info.free_blocks * info.block_size + existingSwapSize;

		// Adjust automatic swap to a maximum of 25% of the free space
		if (swapSize > (freeSpace / 4))
			swapSize = (freeSpace / 4);
	}

	// Create the swap file
	int fd = open(swapPath, O_RDWR | O_CREAT | O_NOCACHE, S_IRUSR | S_IWUSR);
	if (fd < 0) {
		dprintf("%s: Can't open/create %s: %s\n", __func__,
			swapPath, strerror(errno));
		return;
	}

	struct stat stat;
	stat.st_size = swapSize;
	status_t error = _kern_write_stat(fd, NULL, false, &stat,
		sizeof(struct stat), B_STAT_SIZE | B_STAT_SIZE_INSECURE);
	if (error != B_OK) {
		dprintf("%s: Failed to resize %s to %" B_PRIdOFF " bytes: %s\n",
			__func__, swapPath, swapSize, strerror(error));
	}

	close(fd);

	error = swap_file_add(swapPath);
	if (error != B_OK) {
		dprintf("%s: Failed to add swap file %s: %s\n", __func__, swapPath,
			strerror(error));
	}
}


//!	Used by the page daemon to free swap space.
bool
swap_free_page_swap_space(vm_page* page)
{
	VMAnonymousCache* cache = dynamic_cast<VMAnonymousCache*>(page->Cache());
	if (cache == NULL)
		return false;

	swap_addr_t slotIndex = cache->_SwapBlockGetAddress(page->cache_offset);
	if (slotIndex == SWAP_SLOT_NONE)
		return false;

	swap_slot_dealloc(slotIndex, 1);
	cache->fAllocatedSwapSize -= B_PAGE_SIZE;
	cache->_SwapBlockFree(page->cache_offset, 1);

	return true;
}


uint32
swap_available_pages()
{
	mutex_lock(&sAvailSwapSpaceLock);
	uint32 avail = sAvailSwapSpace >> PAGE_SHIFT;
	mutex_unlock(&sAvailSwapSpaceLock);

	return avail;
}


uint32
swap_total_swap_pages()
{
	mutex_lock(&sSwapFileListLock);

	uint32 totalSwapSlots = 0;
	for (SwapFileList::Iterator it = sSwapFileList.GetIterator();
		swap_file* swapFile = it.Next();) {
		totalSwapSlots += swapFile->last_slot - swapFile->first_slot;
	}

	mutex_unlock(&sSwapFileListLock);

	return totalSwapSlots;
}


#endif	// ENABLE_SWAP_SUPPORT


void
swap_get_info(struct system_memory_info* info)
{
#if ENABLE_SWAP_SUPPORT
	info->max_swap_space = (uint64)swap_total_swap_pages() * B_PAGE_SIZE;
	info->free_swap_space = (uint64)swap_available_pages() * B_PAGE_SIZE;
#else
	info->max_swap_space = 0;
	info->free_swap_space = 0;
#endif
}
