/*
 * Copyright 2004-2009, Axel Dörfler, axeld@pinc-software.de.
 * Distributed under the terms of the MIT License.
 */


#include "vnode_store.h"

#include <unistd.h>
#include <stdlib.h>
#include <string.h>

#include <KernelExport.h>
#include <fs_cache.h>

#include <condition_variable.h>
#include <file_cache.h>
#include <generic_syscall.h>
#include <low_resource_manager.h>
#include <thread.h>
#include <util/AutoLock.h>
#include <util/kernel_cpp.h>
#include <vfs.h>
#include <vm/vm.h>
#include <vm/vm_page.h>
#include <vm/VMCache.h>

#include "IORequest.h"


//#define TRACE_FILE_CACHE
#ifdef TRACE_FILE_CACHE
#	define TRACE(x) dprintf x
#else
#	define TRACE(x) ;
#endif

// maximum number of iovecs per request
#define MAX_IO_VECS			32	// 128 kB
#define MAX_FILE_IO_VECS	32

#define BYPASS_IO_SIZE		65536
#define LAST_ACCESSES		3

struct file_cache_ref {
	VMCache			*cache;
	struct vnode	*vnode;
	off_t			last_access[LAST_ACCESSES];
		// TODO: it would probably be enough to only store the least
		//	significant 31 bits, and make this uint32 (one bit for
		//	write vs. read)
	int32			last_access_index;
	uint16			disabled_count;

	inline void SetLastAccess(int32 index, off_t access, bool isWrite)
	{
		// we remember writes as negative offsets
		last_access[index] = isWrite ? -access : access;
	}

	inline off_t LastAccess(int32 index, bool isWrite) const
	{
		return isWrite ? -last_access[index] : last_access[index];
	}

	inline uint32 LastAccessPageOffset(int32 index, bool isWrite)
	{
		return LastAccess(index, isWrite) >> PAGE_SHIFT;
	}
};

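/*!	Asynchronously reads a range of a file into newly allocated, busy cache
	pages. The object deletes itself once IOFinished() has been called.
*/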
class PrecacheIO : public AsyncIOCallback {
public:
								PrecacheIO(file_cache_ref* ref, off_t offset,
									generic_size_t size);
								~PrecacheIO();

			status_t			Prepare(vm_page_reservation* reservation);
			void				ReadAsync();

	virtual	void				IOFinished(status_t status,
									bool partialTransfer,
									generic_size_t bytesTransferred);

private:
			file_cache_ref*		fRef;
			VMCache*			fCache;
			vm_page**			fPages;
			size_t				fPageCount;
			ConditionVariable*	fBusyConditions;
			generic_io_vec*		fVecs;
			off_t				fOffset;
			uint32				fVecCount;
			generic_size_t		fSize;
#if DEBUG_PAGE_ACCESS
			thread_id			fAllocatingThread;
#endif
};

typedef status_t (*cache_func)(file_cache_ref* ref, void* cookie, off_t offset,
	int32 pageOffset, addr_t buffer, size_t bufferSize, bool useBuffer,
	vm_page_reservation* reservation, size_t reservePages);

static void add_to_iovec(generic_io_vec* vecs, uint32 &index, uint32 max,
	generic_addr_t address, generic_size_t size);


static struct cache_module_info* sCacheModule;


static const uint32 kZeroVecCount = 32;
static const size_t kZeroVecSize = kZeroVecCount * B_PAGE_SIZE;
static phys_addr_t sZeroPage;	// physical address
static generic_io_vec sZeroVecs[kZeroVecCount];


//	#pragma mark -


PrecacheIO::PrecacheIO(file_cache_ref* ref, off_t offset, generic_size_t size)
	:
	fRef(ref),
	fCache(ref->cache),
	fPages(NULL),
	fVecs(NULL),
	fOffset(offset),
	fVecCount(0),
	fSize(size)
{
	fPageCount = (size + B_PAGE_SIZE - 1) / B_PAGE_SIZE;
	fCache->AcquireRefLocked();
}


PrecacheIO::~PrecacheIO()
{
	delete[] fPages;
	delete[] fVecs;
	fCache->ReleaseRefLocked();
}


status_t
PrecacheIO::Prepare(vm_page_reservation* reservation)
{
	if (fPageCount == 0)
		return B_BAD_VALUE;

	fPages = new(std::nothrow) vm_page*[fPageCount];
	if (fPages == NULL)
		return B_NO_MEMORY;

	fVecs = new(std::nothrow) generic_io_vec[fPageCount];
	if (fVecs == NULL)
		return B_NO_MEMORY;

	// allocate pages for the cache and mark them busy
	uint32 i = 0;
	for (generic_size_t pos = 0; pos < fSize; pos += B_PAGE_SIZE) {
		vm_page* page = vm_page_allocate_page(reservation,
			PAGE_STATE_CACHED | VM_PAGE_ALLOC_BUSY);

		fCache->InsertPage(page, fOffset + pos);

		add_to_iovec(fVecs, fVecCount, fPageCount,
			page->physical_page_number * B_PAGE_SIZE, B_PAGE_SIZE);
		fPages[i++] = page;
	}

#if DEBUG_PAGE_ACCESS
	fAllocatingThread = find_thread(NULL);
#endif

	return B_OK;
}


void
PrecacheIO::ReadAsync()
{
	// This object is going to be deleted after the I/O request has been
	// fulfilled
	vfs_asynchronous_read_pages(fRef->vnode, NULL, fOffset, fVecs, fVecCount,
		fSize, B_PHYSICAL_IO_REQUEST, this);
}


void
PrecacheIO::IOFinished(status_t status, bool partialTransfer,
	generic_size_t bytesTransferred)
{
	AutoLocker<VMCache> locker(fCache);

	// Make successfully loaded pages accessible again (partially
	// transferred pages are considered failed)
	phys_size_t pagesTransferred
		= (bytesTransferred + B_PAGE_SIZE - 1) / B_PAGE_SIZE;

	if (fOffset + (off_t)bytesTransferred > fCache->virtual_end)
		bytesTransferred = fCache->virtual_end - fOffset;

	for (uint32 i = 0; i < pagesTransferred; i++) {
		if (i == pagesTransferred - 1
			&& (bytesTransferred % B_PAGE_SIZE) != 0) {
			// clear partial page
			size_t bytesTouched = bytesTransferred % B_PAGE_SIZE;
			vm_memset_physical(
				((phys_addr_t)fPages[i]->physical_page_number << PAGE_SHIFT)
					+ bytesTouched,
				0, B_PAGE_SIZE - bytesTouched);
		}

		DEBUG_PAGE_ACCESS_TRANSFER(fPages[i], fAllocatingThread);

		fCache->MarkPageUnbusy(fPages[i]);

		DEBUG_PAGE_ACCESS_END(fPages[i]);
	}

	// Free pages after failed I/O
	for (uint32 i = pagesTransferred; i < fPageCount; i++) {
		DEBUG_PAGE_ACCESS_TRANSFER(fPages[i], fAllocatingThread);
		fCache->NotifyPageEvents(fPages[i], PAGE_EVENT_NOT_BUSY);
		fCache->RemovePage(fPages[i]);
		vm_page_set_state(fPages[i], PAGE_STATE_FREE);
	}

	delete this;
}


//	#pragma mark -


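/*!	Appends the given physical range to the I/O vector array, merging it with
	the previous entry if the two ranges are contiguous. Panics if more than
	\a max vectors would be needed.
*/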
static void
add_to_iovec(generic_io_vec* vecs, uint32 &index, uint32 max,
	generic_addr_t address, generic_size_t size)
{
	if (index > 0 && vecs[index - 1].base + vecs[index - 1].length == address) {
		// the iovec can be combined with the previous one
		vecs[index - 1].length += size;
		return;
	}

	if (index == max)
		panic("no more space for iovecs!");

	// we need to start a new iovec
	vecs[index].base = address;
	vecs[index].length = size;
	index++;
}


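/*!	Returns whether the oldest entry in the access history is still set, which
	is only the case when the last few requests followed each other
	sequentially (a non-sequential request clears its predecessor's entry,
	see push_access()).
*/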
static inline bool
access_is_sequential(file_cache_ref* ref)
{
	return ref->last_access[ref->last_access_index] != 0;
}


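/*!	Records the end offset of the given request in the access history ring
	buffer. If the request does not continue where the previous request of the
	same kind (read or write) ended, the previous entry is invalidated.
*/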
static inline void
push_access(file_cache_ref* ref, off_t offset, generic_size_t bytes,
	bool isWrite)
{
	TRACE(("%p: push %Ld, %ld, %s\n", ref, offset, bytes,
		isWrite ? "write" : "read"));

	int32 index = ref->last_access_index;
	int32 previous = index - 1;
	if (previous < 0)
		previous = LAST_ACCESSES - 1;

	if (offset != ref->LastAccess(previous, isWrite))
		ref->last_access[previous] = 0;

	ref->SetLastAccess(index, offset + bytes, isWrite);

	if (++index >= LAST_ACCESSES)
		index = 0;
	ref->last_access_index = index;
}


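/*!	Reserves \a reservePages pages for the upcoming transfer. If the system is
	low on pages and this cache is unmapped and accessed sequentially, it first
	tries to relieve the pressure itself: for writes it flushes the recently
	modified range, for reads it frees clean, cached pages of this cache.
*/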
static void
reserve_pages(file_cache_ref* ref, vm_page_reservation* reservation,
	size_t reservePages, bool isWrite)
{
	if (low_resource_state(B_KERNEL_RESOURCE_PAGES) != B_NO_LOW_RESOURCE) {
		VMCache* cache = ref->cache;
		cache->Lock();

		if (cache->consumers.IsEmpty() && cache->areas == NULL
			&& access_is_sequential(ref)) {
			// we are not mapped, and we're accessed sequentially

			if (isWrite) {
				// Just write some pages back, and actually wait until they
				// have been written back in order to relieve the page pressure
				// a bit.
				int32 index = ref->last_access_index;
				int32 previous = index - 1;
				if (previous < 0)
					previous = LAST_ACCESSES - 1;

				vm_page_write_modified_page_range(cache,
					ref->LastAccessPageOffset(previous, true),
					ref->LastAccessPageOffset(index, true));
			} else {
				// free some pages from our cache
				// TODO: start with oldest
				uint32 left = reservePages;
				vm_page* page;
				for (VMCachePagesTree::Iterator it = cache->pages.GetIterator();
						(page = it.Next()) != NULL && left > 0;) {
					if (page->State() == PAGE_STATE_CACHED && !page->busy) {
						DEBUG_PAGE_ACCESS_START(page);
						ASSERT(!page->IsMapped());
						ASSERT(!page->modified);
						cache->RemovePage(page);
						vm_page_set_state(page, PAGE_STATE_FREE);
						left--;
					}
				}
			}
		}
		cache->Unlock();
	}

	vm_page_reserve_pages(reservation, reservePages, VM_PRIORITY_USER);
}


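/*!	Reads the given vectors from the file and zeroes out whatever part of the
	vectors the read (or the end of the file) did not cover, so that the cache
	never exposes stale page contents.
*/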
static inline status_t
read_pages_and_clear_partial(file_cache_ref* ref, void* cookie, off_t offset,
	const generic_io_vec* vecs, size_t count, uint32 flags,
	generic_size_t* _numBytes)
{
	generic_size_t bytesUntouched = *_numBytes;

	status_t status = vfs_read_pages(ref->vnode, cookie, offset, vecs, count,
		flags, _numBytes);

	generic_size_t bytesEnd = *_numBytes;

	if (offset + (off_t)bytesEnd > ref->cache->virtual_end)
		bytesEnd = ref->cache->virtual_end - offset;

	if (status == B_OK && bytesEnd < bytesUntouched) {
		// Clear out any leftovers that were not touched by the above read.
		// We're doing this here so that not every file system/device has to
		// implement this.
		bytesUntouched -= bytesEnd;

		for (int32 i = count; i-- > 0 && bytesUntouched != 0; ) {
			generic_size_t length = min_c(bytesUntouched, vecs[i].length);
			vm_memset_physical(vecs[i].base + vecs[i].length - length, 0,
				length);

			bytesUntouched -= length;
		}
	}

	return status;
}


/*!	Reads the requested amount of data into the cache, and allocates the
	pages needed to fulfill that request. This function is called by cache_io().
	It can only handle requests of up to MAX_IO_VECS pages at a time; the
	caller must make sure the request stays within that limit.
	The cache_ref lock must be held when calling this function; it will be
	unlocked temporarily while the I/O is in progress.
*/
static status_t
read_into_cache(file_cache_ref* ref, void* cookie, off_t offset,
	int32 pageOffset, addr_t buffer, size_t bufferSize, bool useBuffer,
	vm_page_reservation* reservation, size_t reservePages)
{
	TRACE(("read_into_cache(offset = %Ld, pageOffset = %ld, buffer = %#lx, "
		"bufferSize = %lu\n", offset, pageOffset, buffer, bufferSize));

	VMCache* cache = ref->cache;

	// TODO: We're using way too much stack! Rather allocate a sufficiently
	// large chunk on the heap.
	generic_io_vec vecs[MAX_IO_VECS];
	uint32 vecCount = 0;

	generic_size_t numBytes = PAGE_ALIGN(pageOffset + bufferSize);
	vm_page* pages[MAX_IO_VECS];
	int32 pageIndex = 0;

	// allocate pages for the cache and mark them busy
	for (generic_size_t pos = 0; pos < numBytes; pos += B_PAGE_SIZE) {
		vm_page* page = pages[pageIndex++] = vm_page_allocate_page(
			reservation, PAGE_STATE_CACHED | VM_PAGE_ALLOC_BUSY);

		cache->InsertPage(page, offset + pos);

		add_to_iovec(vecs, vecCount, MAX_IO_VECS,
			page->physical_page_number * B_PAGE_SIZE, B_PAGE_SIZE);
			// TODO: check if the array is large enough (currently panics)!
	}

	push_access(ref, offset, bufferSize, false);
	cache->Unlock();
	vm_page_unreserve_pages(reservation);

	// read file into reserved pages
	status_t status = read_pages_and_clear_partial(ref, cookie, offset, vecs,
		vecCount, B_PHYSICAL_IO_REQUEST, &numBytes);
	if (status != B_OK) {
		// reading failed, free allocated pages

		dprintf("file_cache: read pages failed: %s\n", strerror(status));

		cache->Lock();

		for (int32 i = 0; i < pageIndex; i++) {
			cache->NotifyPageEvents(pages[i], PAGE_EVENT_NOT_BUSY);
			cache->RemovePage(pages[i]);
			vm_page_set_state(pages[i], PAGE_STATE_FREE);
		}

		return status;
	}

	// copy the pages if needed and unmap them again

	for (int32 i = 0; i < pageIndex; i++) {
		if (useBuffer && bufferSize != 0) {
			size_t bytes = min_c(bufferSize, (size_t)B_PAGE_SIZE - pageOffset);

			vm_memcpy_from_physical((void*)buffer,
				pages[i]->physical_page_number * B_PAGE_SIZE + pageOffset,
				bytes, true);

			buffer += bytes;
			bufferSize -= bytes;
			pageOffset = 0;
		}
	}

	reserve_pages(ref, reservation, reservePages, false);
	cache->Lock();

	// make the pages accessible in the cache
	for (int32 i = pageIndex; i-- > 0;) {
		DEBUG_PAGE_ACCESS_END(pages[i]);

		cache->MarkPageUnbusy(pages[i]);
	}

	return B_OK;
}


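/*!	Reads the requested data directly from the file into the buffer, bypassing
	the cache; used by cache_io() when memory is tight. Like read_into_cache()
	it temporarily unlocks the cache and reserves pages for the next chunk.
*/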
static status_t
read_from_file(file_cache_ref* ref, void* cookie, off_t offset,
	int32 pageOffset, addr_t buffer, size_t bufferSize, bool useBuffer,
	vm_page_reservation* reservation, size_t reservePages)
{
	TRACE(("read_from_file(offset = %Ld, pageOffset = %ld, buffer = %#lx, "
		"bufferSize = %lu\n", offset, pageOffset, buffer, bufferSize));

	if (!useBuffer)
		return B_OK;

	generic_io_vec vec;
	vec.base = buffer;
	vec.length = bufferSize;

	push_access(ref, offset, bufferSize, false);
	ref->cache->Unlock();
	vm_page_unreserve_pages(reservation);

	generic_size_t toRead = bufferSize;
	status_t status = vfs_read_pages(ref->vnode, cookie, offset + pageOffset,
		&vec, 1, 0, &toRead);

	if (status == B_OK)
		reserve_pages(ref, reservation, reservePages, false);

	ref->cache->Lock();

	return status;
}


/*!	Like read_into_cache(), but writes data into the cache.
	If only partial pages get written, it also reads the affected pages into
	the cache first to preserve data consistency.
	The same restrictions apply.
*/
static status_t
write_to_cache(file_cache_ref* ref, void* cookie, off_t offset,
	int32 pageOffset, addr_t buffer, size_t bufferSize, bool useBuffer,
	vm_page_reservation* reservation, size_t reservePages)
{
	// TODO: We're using way too much stack! Rather allocate a sufficiently
	// large chunk on the heap.
	generic_io_vec vecs[MAX_IO_VECS];
	uint32 vecCount = 0;
	generic_size_t numBytes = PAGE_ALIGN(pageOffset + bufferSize);
	vm_page* pages[MAX_IO_VECS];
	int32 pageIndex = 0;
	status_t status = B_OK;

	// ToDo: this should be settable somewhere
	bool writeThrough = false;

	// allocate pages for the cache and mark them busy
	for (generic_size_t pos = 0; pos < numBytes; pos += B_PAGE_SIZE) {
		// TODO: if space is becoming tight, and this cache is already grown
		//	big - shouldn't we better steal the pages directly in that case?
		//	(a working set like approach for the file cache)
		// TODO: the pages we allocate here should have been reserved upfront
		//	in cache_io()
		vm_page* page = pages[pageIndex++] = vm_page_allocate_page(
			reservation,
			(writeThrough ? PAGE_STATE_CACHED : PAGE_STATE_MODIFIED)
				| VM_PAGE_ALLOC_BUSY);

		page->modified = !writeThrough;

		ref->cache->InsertPage(page, offset + pos);

		add_to_iovec(vecs, vecCount, MAX_IO_VECS,
			page->physical_page_number * B_PAGE_SIZE, B_PAGE_SIZE);
	}

	push_access(ref, offset, bufferSize, true);
	ref->cache->Unlock();
	vm_page_unreserve_pages(reservation);

	// copy contents (and read in partially written pages first)

	if (pageOffset != 0) {
		// This is only a partial write, so we have to read the rest of the page
		// from the file to have consistent data in the cache
		generic_io_vec readVec = { vecs[0].base, B_PAGE_SIZE };
		generic_size_t bytesRead = B_PAGE_SIZE;

		status = vfs_read_pages(ref->vnode, cookie, offset, &readVec, 1,
			B_PHYSICAL_IO_REQUEST, &bytesRead);
		// ToDo: handle errors for real!
		if (status < B_OK)
			panic("1. vfs_read_pages() failed: %s!\n", strerror(status));
	}

	size_t lastPageOffset = (pageOffset + bufferSize) % B_PAGE_SIZE;
	if (lastPageOffset != 0) {
		// get the last page in the I/O vectors
		generic_addr_t last = vecs[vecCount - 1].base
			+ vecs[vecCount - 1].length - B_PAGE_SIZE;

		if ((off_t)(offset + pageOffset + bufferSize) == ref->cache->virtual_end) {
			// the space in the page after this write action needs to be cleaned
			vm_memset_physical(last + lastPageOffset, 0,
				B_PAGE_SIZE - lastPageOffset);
		} else {
			// the end of this write does not happen on a page boundary, so we
			// need to fetch the last page before we can update it
			generic_io_vec readVec = { last, B_PAGE_SIZE };
			generic_size_t bytesRead = B_PAGE_SIZE;

			status = vfs_read_pages(ref->vnode, cookie,
				PAGE_ALIGN(offset + pageOffset + bufferSize) - B_PAGE_SIZE,
				&readVec, 1, B_PHYSICAL_IO_REQUEST, &bytesRead);
			// ToDo: handle errors for real!
			if (status < B_OK)
				panic("vfs_read_pages() failed: %s!\n", strerror(status));

			if (bytesRead < B_PAGE_SIZE) {
				// the space beyond the file size needs to be cleaned
				vm_memset_physical(last + bytesRead, 0,
					B_PAGE_SIZE - bytesRead);
			}
		}
	}

	for (uint32 i = 0; i < vecCount; i++) {
		generic_addr_t base = vecs[i].base;
		generic_size_t bytes = min_c((generic_size_t)bufferSize,
			generic_size_t(vecs[i].length - pageOffset));

		if (useBuffer) {
			// copy data from user buffer
			vm_memcpy_to_physical(base + pageOffset, (void*)buffer, bytes,
				true);
		} else {
			// clear buffer instead
			vm_memset_physical(base + pageOffset, 0, bytes);
		}

		bufferSize -= bytes;
		if (bufferSize == 0)
			break;

		buffer += bytes;
		pageOffset = 0;
	}

	if (writeThrough) {
		// write cached pages back to the file if we were asked to do that
		status_t status = vfs_write_pages(ref->vnode, cookie, offset, vecs,
			vecCount, B_PHYSICAL_IO_REQUEST, &numBytes);
		if (status < B_OK) {
			// ToDo: remove allocated pages, ...?
			panic("file_cache: remove allocated pages! write pages failed: %s\n",
				strerror(status));
		}
	}

	if (status == B_OK)
		reserve_pages(ref, reservation, reservePages, true);

	ref->cache->Lock();

	// make the pages accessible in the cache
	for (int32 i = pageIndex; i-- > 0;) {
		ref->cache->MarkPageUnbusy(pages[i]);

		DEBUG_PAGE_ACCESS_END(pages[i]);
	}

	return status;
}


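/*!	Writes the data directly to the file, bypassing the cache; used by
	cache_io() when memory is tight. If \a useBuffer is false, the respective
	file range is zeroed out by writing from the shared zero page vectors.
*/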
static status_t
write_to_file(file_cache_ref* ref, void* cookie, off_t offset, int32 pageOffset,
	addr_t buffer, size_t bufferSize, bool useBuffer,
	vm_page_reservation* reservation, size_t reservePages)
{
	push_access(ref, offset, bufferSize, true);
	ref->cache->Unlock();
	vm_page_unreserve_pages(reservation);

	status_t status = B_OK;

	if (!useBuffer) {
		while (bufferSize > 0) {
			generic_size_t written = min_c(bufferSize, kZeroVecSize);
			status = vfs_write_pages(ref->vnode, cookie, offset + pageOffset,
				sZeroVecs, kZeroVecCount, B_PHYSICAL_IO_REQUEST, &written);
			if (status != B_OK)
				return status;
			if (written == 0)
				return B_ERROR;

			bufferSize -= written;
			pageOffset += written;
		}
	} else {
		generic_io_vec vec;
		vec.base = buffer;
		vec.length = bufferSize;
		generic_size_t toWrite = bufferSize;
		status = vfs_write_pages(ref->vnode, cookie, offset + pageOffset,
			&vec, 1, 0, &toWrite);
	}

	if (status == B_OK)
		reserve_pages(ref, reservation, reservePages, true);

	ref->cache->Lock();

	return status;
}


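/*!	Carries out the I/O for the gap that has accumulated between \a lastBuffer
	and \a buffer using the given cache function, and resets the "last*"
	bookkeeping so a new gap starts at the current position. A no-op if no
	bytes have accumulated yet.
*/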
static inline status_t
satisfy_cache_io(file_cache_ref* ref, void* cookie, cache_func function,
	off_t offset, addr_t buffer, bool useBuffer, int32 &pageOffset,
	size_t bytesLeft, size_t &reservePages, off_t &lastOffset,
	addr_t &lastBuffer, int32 &lastPageOffset, size_t &lastLeft,
	size_t &lastReservedPages, vm_page_reservation* reservation)
{
	if (lastBuffer == buffer)
		return B_OK;

	size_t requestSize = buffer - lastBuffer;
	reservePages = min_c(MAX_IO_VECS, (lastLeft - requestSize
		+ lastPageOffset + B_PAGE_SIZE - 1) >> PAGE_SHIFT);

	status_t status = function(ref, cookie, lastOffset, lastPageOffset,
		lastBuffer, requestSize, useBuffer, reservation, reservePages);
	if (status == B_OK) {
		lastReservedPages = reservePages;
		lastBuffer = buffer;
		lastLeft = bytesLeft;
		lastOffset = offset;
		lastPageOffset = 0;
		pageOffset = 0;
	}
	return status;
}


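/*!	The generic cache I/O function behind file_cache_read() and
	file_cache_write(). It walks the request page-wise: ranges already present
	in the cache are copied directly, while gaps are collected and handed to
	one of the read/write hooks above, which either go through the cache or,
	for large requests in low memory situations, bypass it.
*/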
static status_t
cache_io(void* _cacheRef, void* cookie, off_t offset, addr_t buffer,
	size_t* _size, bool doWrite)
{
	if (_cacheRef == NULL)
		panic("cache_io() called with NULL ref!\n");

	file_cache_ref* ref = (file_cache_ref*)_cacheRef;
	VMCache* cache = ref->cache;
	off_t fileSize = cache->virtual_end;
	bool useBuffer = buffer != 0;

	TRACE(("cache_io(ref = %p, offset = %Ld, buffer = %p, size = %lu, %s)\n",
		ref, offset, (void*)buffer, *_size, doWrite ? "write" : "read"));

	// out of bounds access?
	if (offset >= fileSize || offset < 0) {
		*_size = 0;
		return B_OK;
	}

	int32 pageOffset = offset & (B_PAGE_SIZE - 1);
	size_t size = *_size;
	offset -= pageOffset;

	if ((off_t)(offset + pageOffset + size) > fileSize) {
		// adapt size to be within the file's offsets
		size = fileSize - pageOffset - offset;
		*_size = size;
	}
	if (size == 0)
		return B_OK;

	// "offset" and "lastOffset" are always aligned to B_PAGE_SIZE,
	// the "last*" variables always point to the end of the last
	// satisfied request part

	const uint32 kMaxChunkSize = MAX_IO_VECS * B_PAGE_SIZE;
	size_t bytesLeft = size, lastLeft = size;
	int32 lastPageOffset = pageOffset;
	addr_t lastBuffer = buffer;
	off_t lastOffset = offset;
	size_t lastReservedPages = min_c(MAX_IO_VECS, (pageOffset + bytesLeft
		+ B_PAGE_SIZE - 1) >> PAGE_SHIFT);
	size_t reservePages = 0;
	size_t pagesProcessed = 0;
	cache_func function = NULL;

	vm_page_reservation reservation;
	reserve_pages(ref, &reservation, lastReservedPages, doWrite);

	AutoLocker<VMCache> locker(cache);

	while (bytesLeft > 0) {
		// Periodically reevaluate the low memory situation and select the
		// read/write hook accordingly
		if (pagesProcessed % 32 == 0) {
			if (size >= BYPASS_IO_SIZE
				&& low_resource_state(B_KERNEL_RESOURCE_PAGES)
					!= B_NO_LOW_RESOURCE) {
				// In low memory situations we bypass the cache beyond a
				// certain I/O size.
				function = doWrite ? write_to_file : read_from_file;
			} else
				function = doWrite ? write_to_cache : read_into_cache;
		}

		// check if this page is already in memory
		vm_page* page = cache->LookupPage(offset);
		if (page != NULL) {
			// The page may be busy - since we need to unlock the cache sometime
			// in the near future, we need to satisfy the request of the pages
			// we didn't get yet (to make sure no one else interferes in the
			// meantime).
			status_t status = satisfy_cache_io(ref, cookie, function, offset,
				buffer, useBuffer, pageOffset, bytesLeft, reservePages,
				lastOffset, lastBuffer, lastPageOffset, lastLeft,
				lastReservedPages, &reservation);
			if (status != B_OK)
				return status;

			// Since satisfy_cache_io() unlocks the cache, we need to look up
			// the page again.
			page = cache->LookupPage(offset);
			if (page != NULL && page->busy) {
				cache->WaitForPageEvents(page, PAGE_EVENT_NOT_BUSY, true);
				continue;
			}
		}

		size_t bytesInPage = min_c(size_t(B_PAGE_SIZE - pageOffset), bytesLeft);

		TRACE(("lookup page from offset %Ld: %p, size = %lu, pageOffset "
			"= %lu\n", offset, page, bytesLeft, pageOffset));

		if (page != NULL) {
			if (doWrite || useBuffer) {
				// Since the following user_mem{cpy,set}() might cause a page
				// fault, which in turn might cause pages to be reserved, we
				// need to unlock the cache temporarily to avoid a potential
				// deadlock. To make sure that our page doesn't go away, we mark
				// it busy for the time.
				page->busy = true;
				locker.Unlock();

				// copy the contents of the page already in memory
				phys_addr_t pageAddress
					= (phys_addr_t)page->physical_page_number * B_PAGE_SIZE
						+ pageOffset;
				if (doWrite) {
					if (useBuffer) {
						vm_memcpy_to_physical(pageAddress, (void*)buffer,
							bytesInPage, true);
					} else {
						vm_memset_physical(pageAddress, 0, bytesInPage);
					}
				} else if (useBuffer) {
					vm_memcpy_from_physical((void*)buffer, pageAddress,
						bytesInPage, true);
				}

				locker.Lock();

				if (doWrite) {
					DEBUG_PAGE_ACCESS_START(page);

					page->modified = true;

					if (page->State() != PAGE_STATE_MODIFIED)
						vm_page_set_state(page, PAGE_STATE_MODIFIED);

					DEBUG_PAGE_ACCESS_END(page);
				}

				cache->MarkPageUnbusy(page);
			}

			// If it is cached only, requeue the page, so the respective queue
			// roughly remains LRU first sorted.
			if (page->State() == PAGE_STATE_CACHED
					|| page->State() == PAGE_STATE_MODIFIED) {
				DEBUG_PAGE_ACCESS_START(page);
				vm_page_requeue(page, true);
				DEBUG_PAGE_ACCESS_END(page);
			}

			if (bytesLeft <= bytesInPage) {
				// we've read the last page, so we're done!
				locker.Unlock();
				vm_page_unreserve_pages(&reservation);
				return B_OK;
			}

			// prepare a potential gap request
			lastBuffer = buffer + bytesInPage;
			lastLeft = bytesLeft - bytesInPage;
			lastOffset = offset + B_PAGE_SIZE;
			lastPageOffset = 0;
		}

		if (bytesLeft <= bytesInPage)
			break;

		buffer += bytesInPage;
		bytesLeft -= bytesInPage;
		pageOffset = 0;
		offset += B_PAGE_SIZE;
		pagesProcessed++;

		if (buffer - lastBuffer + lastPageOffset >= kMaxChunkSize) {
			status_t status = satisfy_cache_io(ref, cookie, function, offset,
				buffer, useBuffer, pageOffset, bytesLeft, reservePages,
				lastOffset, lastBuffer, lastPageOffset, lastLeft,
				lastReservedPages, &reservation);
			if (status != B_OK)
				return status;
		}
	}

	// fill the last remaining bytes of the request (either write or read)

	return function(ref, cookie, lastOffset, lastPageOffset, lastBuffer,
		lastLeft, useBuffer, &reservation, 0);
}


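/*!	Handler for the CACHE_SYSCALLS generic syscall: CACHE_CLEAR is currently
	only logged, CACHE_SET_MODULE unloads the current cache module and,
	optionally, loads the one named in the user buffer.
*/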
static status_t
file_cache_control(const char* subsystem, uint32 function, void* buffer,
	size_t bufferSize)
{
	switch (function) {
		case CACHE_CLEAR:
			// ToDo: clear the cache
			dprintf("cache_control: clear cache!\n");
			return B_OK;

		case CACHE_SET_MODULE:
		{
			cache_module_info* module = sCacheModule;

			// unset previous module

			if (sCacheModule != NULL) {
				sCacheModule = NULL;
				snooze(100000);	// 0.1 secs
				put_module(module->info.name);
			}

			// get new module, if any

			if (buffer == NULL)
				return B_OK;

			char name[B_FILE_NAME_LENGTH];
			if (!IS_USER_ADDRESS(buffer)
				|| user_strlcpy(name, (char*)buffer,
						B_FILE_NAME_LENGTH) < B_OK)
				return B_BAD_ADDRESS;

			if (strncmp(name, CACHE_MODULES_NAME, strlen(CACHE_MODULES_NAME)))
				return B_BAD_VALUE;

			dprintf("cache_control: set module %s!\n", name);

			status_t status = get_module(name, (module_info**)&module);
			if (status == B_OK)
				sCacheModule = module;

			return status;
		}
	}

	return B_BAD_HANDLER;
}


//	#pragma mark - private kernel API


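/*!	Asynchronously reads the given file range into the cache, page by page,
	skipping ranges that are already cached. Does nothing if the system is
	short on free pages or if the cache already holds more than 2/3 of the
	file.
*/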
extern "C" void
cache_prefetch_vnode(struct vnode* vnode, off_t offset, size_t size)
{
	if (size == 0)
		return;

	VMCache* cache;
	if (vfs_get_vnode_cache(vnode, &cache, false) != B_OK)
		return;

	file_cache_ref* ref = ((VMVnodeCache*)cache)->FileCacheRef();
	off_t fileSize = cache->virtual_end;

	if ((off_t)(offset + size) > fileSize)
		size = fileSize - offset;

	// "offset" and "size" are always aligned to B_PAGE_SIZE,
	offset = ROUNDDOWN(offset, B_PAGE_SIZE);
	size = ROUNDUP(size, B_PAGE_SIZE);

	size_t reservePages = size / B_PAGE_SIZE;

	// Don't do anything if we don't have the resources left, or the cache
	// already contains more than 2/3 of its pages
	if (offset >= fileSize || vm_page_num_unused_pages() < 2 * reservePages
		|| 3 * cache->page_count > 2 * fileSize / B_PAGE_SIZE) {
		cache->ReleaseRef();
		return;
	}

	size_t bytesToRead = 0;
	off_t lastOffset = offset;

	vm_page_reservation reservation;
	vm_page_reserve_pages(&reservation, reservePages, VM_PRIORITY_USER);

	cache->Lock();

	while (true) {
		// check if this page is already in memory
		if (size > 0) {
			vm_page* page = cache->LookupPage(offset);

			offset += B_PAGE_SIZE;
			size -= B_PAGE_SIZE;

			if (page == NULL) {
				bytesToRead += B_PAGE_SIZE;
				continue;
			}
		}
		if (bytesToRead != 0) {
			// read the part before the current page (or the end of the request)
			PrecacheIO* io = new(std::nothrow) PrecacheIO(ref, lastOffset,
				bytesToRead);
			if (io == NULL || io->Prepare(&reservation) != B_OK) {
				delete io;
				break;
			}

			// we must not have the cache locked during I/O
			cache->Unlock();
			io->ReadAsync();
			cache->Lock();

			bytesToRead = 0;
		}

		if (size == 0) {
			// we have reached the end of the request
			break;
		}

		lastOffset = offset;
	}

	cache->ReleaseRefAndUnlock();
	vm_page_unreserve_pages(&reservation);
}


extern "C" void
cache_prefetch(dev_t mountID, ino_t vnodeID, off_t offset, size_t size)
{
	// ToDo: schedule prefetch

	TRACE(("cache_prefetch(vnode %ld:%Ld)\n", mountID, vnodeID));

	// get the vnode for the object, this also grabs a ref to it
	struct vnode* vnode;
	if (vfs_get_vnode(mountID, vnodeID, true, &vnode) != B_OK)
		return;

	cache_prefetch_vnode(vnode, offset, size);
	vfs_put_vnode(vnode);
}


extern "C" void
cache_node_opened(struct vnode* vnode, int32 fdType, VMCache* cache,
	dev_t mountID, ino_t parentID, ino_t vnodeID, const char* name)
{
	if (sCacheModule == NULL || sCacheModule->node_opened == NULL)
		return;

	off_t size = -1;
	if (cache != NULL) {
		file_cache_ref* ref = ((VMVnodeCache*)cache)->FileCacheRef();
		if (ref != NULL)
			size = cache->virtual_end;
	}

	sCacheModule->node_opened(vnode, fdType, mountID, parentID, vnodeID, name,
		size);
}


extern "C" void
cache_node_closed(struct vnode* vnode, int32 fdType, VMCache* cache,
	dev_t mountID, ino_t vnodeID)
{
	if (sCacheModule == NULL || sCacheModule->node_closed == NULL)
		return;

	int32 accessType = 0;
	if (cache != NULL) {
		// ToDo: set accessType
	}

	sCacheModule->node_closed(vnode, fdType, mountID, vnodeID, accessType);
}


extern "C" void
cache_node_launched(size_t argCount, char*  const* args)
{
	if (sCacheModule == NULL || sCacheModule->node_launched == NULL)
		return;

	sCacheModule->node_launched(argCount, args);
}


extern "C" status_t
file_cache_init_post_boot_device(void)
{
	// ToDo: get cache module out of driver settings

	if (get_module("file_cache/launch_speedup/v1",
			(module_info**)&sCacheModule) == B_OK) {
		dprintf("** opened launch speedup: %" B_PRId64 "\n", system_time());
	}
	return B_OK;
}


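/*!	Allocates the zero page used for writing zeroes via sZeroVecs and registers
	the CACHE_SYSCALLS generic syscall.
*/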
extern "C" status_t
file_cache_init(void)
{
	// allocate a clean page we can use for writing zeroes
	vm_page_reservation reservation;
	vm_page_reserve_pages(&reservation, 1, VM_PRIORITY_SYSTEM);
	vm_page* page = vm_page_allocate_page(&reservation,
		PAGE_STATE_WIRED | VM_PAGE_ALLOC_CLEAR);
	vm_page_unreserve_pages(&reservation);

	sZeroPage = (phys_addr_t)page->physical_page_number * B_PAGE_SIZE;

	for (uint32 i = 0; i < kZeroVecCount; i++) {
		sZeroVecs[i].base = sZeroPage;
		sZeroVecs[i].length = B_PAGE_SIZE;
	}

	register_generic_syscall(CACHE_SYSCALLS, file_cache_control, 1, 0);
	return B_OK;
}


//	#pragma mark - public FS API


extern "C" void*
file_cache_create(dev_t mountID, ino_t vnodeID, off_t size)
{
	TRACE(("file_cache_create(mountID = %ld, vnodeID = %Ld, size = %Ld)\n",
		mountID, vnodeID, size));

	file_cache_ref* ref = new file_cache_ref;
	if (ref == NULL)
		return NULL;

	memset(ref->last_access, 0, sizeof(ref->last_access));
	ref->last_access_index = 0;
	ref->disabled_count = 0;

	// TODO: delay VMCache creation until data is
	//	requested/written for the first time? Listing lots of
	//	files in Tracker (and elsewhere) could be slowed down.
	//	Since the file_cache_ref itself doesn't have a lock,
	//	we would need to "rent" one during construction, possibly
	//	the vnode lock, maybe a dedicated one.
	//	As there shouldn't be too much contention, we could also
	//	use atomic_test_and_set(), and free the resources again
	//	when that fails...

	// Get the vnode for the object
	// (note, this does not grab a reference to the node)
	if (vfs_lookup_vnode(mountID, vnodeID, &ref->vnode) != B_OK)
		goto err1;

	// Gets (usually creates) the cache for the node
	if (vfs_get_vnode_cache(ref->vnode, &ref->cache, true) != B_OK)
		goto err1;

	ref->cache->virtual_end = size;
	((VMVnodeCache*)ref->cache)->SetFileCacheRef(ref);
	return ref;

err1:
	delete ref;
	return NULL;
}


extern "C" void
file_cache_delete(void* _cacheRef)
{
	file_cache_ref* ref = (file_cache_ref*)_cacheRef;

	if (ref == NULL)
		return;

	TRACE(("file_cache_delete(ref = %p)\n", ref));

	ref->cache->ReleaseRef();
	delete ref;
}


extern "C" void
file_cache_enable(void* _cacheRef)
{
	file_cache_ref* ref = (file_cache_ref*)_cacheRef;

	AutoLocker<VMCache> _(ref->cache);

	if (ref->disabled_count == 0) {
		panic("Unbalanced file_cache_enable()!");
		return;
	}

	ref->disabled_count--;
}


extern "C" status_t
file_cache_disable(void* _cacheRef)
{
	// TODO: This function only removes all pages from the cache and prevents
	// that the file cache functions add any new ones until re-enabled. The
	// VM (on page fault) can still add pages, if the file is mmap()ed. We
	// should mark the cache to prevent shared mappings of the file and fix
	// the page fault code to deal correctly with private mappings (i.e. only
	// insert pages in consumer caches).

	file_cache_ref* ref = (file_cache_ref*)_cacheRef;

	AutoLocker<VMCache> _(ref->cache);

	// If already disabled, there's nothing to do for us.
	if (ref->disabled_count > 0) {
		ref->disabled_count++;
		return B_OK;
	}

	// The file cache is not yet disabled. We need to evict all cached pages.
	status_t error = ref->cache->FlushAndRemoveAllPages();
	if (error != B_OK)
		return error;

	ref->disabled_count++;
	return B_OK;
}


extern "C" bool
file_cache_is_enabled(void* _cacheRef)
{
	file_cache_ref* ref = (file_cache_ref*)_cacheRef;
	AutoLocker<VMCache> _(ref->cache);

	return ref->disabled_count == 0;
}


extern "C" status_t
file_cache_set_size(void* _cacheRef, off_t newSize)
{
	file_cache_ref* ref = (file_cache_ref*)_cacheRef;

	TRACE(("file_cache_set_size(ref = %p, size = %Ld)\n", ref, newSize));

	if (ref == NULL)
		return B_OK;

	VMCache* cache = ref->cache;
	AutoLocker<VMCache> _(cache);

	off_t oldSize = cache->virtual_end;
	status_t status = cache->Resize(newSize, VM_PRIORITY_USER);
		// Note, the priority doesn't really matter, since this cache doesn't
		// reserve any memory.
	if (status == B_OK && newSize < oldSize) {
		// We may have a new partial page at the end of the cache that must be
		// cleared.
		uint32 partialBytes = newSize % B_PAGE_SIZE;
		if (partialBytes != 0) {
			vm_page* page = cache->LookupPage(newSize - partialBytes);
			if (page != NULL) {
				vm_memset_physical(page->physical_page_number * B_PAGE_SIZE
					+ partialBytes, 0, B_PAGE_SIZE - partialBytes);
			}
		}
	}

	return status;
}


extern "C" status_t
file_cache_sync(void* _cacheRef)
{
	file_cache_ref* ref = (file_cache_ref*)_cacheRef;
	if (ref == NULL)
		return B_BAD_VALUE;

	return ref->cache->WriteModified();
}


extern "C" status_t
file_cache_read(void* _cacheRef, void* cookie, off_t offset, void* buffer,
	size_t* _size)
{
	file_cache_ref* ref = (file_cache_ref*)_cacheRef;

	TRACE(("file_cache_read(ref = %p, offset = %Ld, buffer = %p, size = %lu)\n",
		ref, offset, buffer, *_size));

	if (ref->disabled_count > 0) {
		// Caching is disabled -- read directly from the file.
		generic_io_vec vec;
		vec.base = (addr_t)buffer;
		generic_size_t size = vec.length = *_size;
		status_t error = vfs_read_pages(ref->vnode, cookie, offset, &vec, 1, 0,
			&size);
		*_size = size;
		return error;
	}

	return cache_io(ref, cookie, offset, (addr_t)buffer, _size, false);
}


extern "C" status_t
file_cache_write(void* _cacheRef, void* cookie, off_t offset,
	const void* buffer, size_t* _size)
{
	file_cache_ref* ref = (file_cache_ref*)_cacheRef;

	if (ref->disabled_count > 0) {
		// Caching is disabled -- write directly to the file.

		if (buffer != NULL) {
			generic_io_vec vec;
			vec.base = (addr_t)buffer;
			generic_size_t size = vec.length = *_size;

			status_t error = vfs_write_pages(ref->vnode, cookie, offset, &vec,
				1, 0, &size);
			*_size = size;
			return error;
		}

		// NULL buffer -- use a dummy buffer to write zeroes
		size_t size = *_size;
		while (size > 0) {
			size_t toWrite = min_c(size, kZeroVecSize);
			generic_size_t written = toWrite;
			status_t error = vfs_write_pages(ref->vnode, cookie, offset,
				sZeroVecs, kZeroVecCount, B_PHYSICAL_IO_REQUEST, &written);
			if (error != B_OK)
				return error;
			if (written == 0)
				break;

			offset += written;
			size -= written;
		}

		*_size -= size;
		return B_OK;
	}

	status_t status = cache_io(ref, cookie, offset,
		(addr_t)const_cast<void*>(buffer), _size, true);

	TRACE(("file_cache_write(ref = %p, offset = %Ld, buffer = %p, size = %lu)"
		" = %ld\n", ref, offset, buffer, *_size, status));

	return status;
}