/*
 * Copyright 2004-2009, Axel Dörfler, axeld@pinc-software.de.
 * Distributed under the terms of the MIT License.
 */


#include "vnode_store.h"

#include <unistd.h>
#include <stdlib.h>
#include <string.h>

#include <KernelExport.h>
#include <fs_cache.h>

#include <condition_variable.h>
#include <file_cache.h>
#include <generic_syscall.h>
#include <low_resource_manager.h>
#include <thread.h>
#include <util/AutoLock.h>
#include <util/kernel_cpp.h>
#include <vfs.h>
#include <vm/vm.h>
#include <vm/vm_page.h>
#include <vm/VMCache.h>

#include "IORequest.h"


//#define TRACE_FILE_CACHE
#ifdef TRACE_FILE_CACHE
#	define TRACE(x) dprintf x
#else
#	define TRACE(x) ;
#endif

// maximum number of iovecs per request
#define MAX_IO_VECS			32	// 128 kB

#define BYPASS_IO_SIZE		65536
#define LAST_ACCESSES		3

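// The last_access ring buffer below stores the end offset of the most recent
// requests, with writes negated so that a read and a write ending at the same
// offset can be told apart (see SetLastAccess()/LastAccess()). For example, a
// read ending at 0x2000 is stored as 0x2000, a write ending there as -0x2000;
// push_access() further below compares the start of a new request against the
// stored end of the previous one to detect sequential access.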
struct file_cache_ref {
	VMCache			*cache;
	struct vnode	*vnode;
	off_t			last_access[LAST_ACCESSES];
		// TODO: it would probably be enough to only store the least
		//	significant 31 bits, and make this uint32 (one bit for
		//	write vs. read)
	int32			last_access_index;
	uint16			disabled_count;

	inline void SetLastAccess(int32 index, off_t access, bool isWrite)
	{
		// we remember writes as negative offsets
		last_access[index] = isWrite ? -access : access;
	}

	inline off_t LastAccess(int32 index, bool isWrite) const
	{
		return isWrite ? -last_access[index] : last_access[index];
	}

	inline uint32 LastAccessPageOffset(int32 index, bool isWrite)
	{
		return LastAccess(index, isWrite) >> PAGE_SHIFT;
	}
};

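// PrecacheIO life cycle: cache_prefetch_vnode() creates an instance, calls
// Prepare() with the cache locked to allocate busy pages and build the I/O
// vectors, and then ReadAsync() with the cache unlocked. When the asynchronous
// read completes, IOFinished() makes the successfully read pages accessible
// again (or removes them on failure) and deletes the object.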
class PrecacheIO : public AsyncIOCallback {
public:
								PrecacheIO(file_cache_ref* ref, off_t offset,
									generic_size_t size);
								~PrecacheIO();

			status_t			Prepare(vm_page_reservation* reservation);
			void				ReadAsync();

	virtual	void				IOFinished(status_t status,
									bool partialTransfer,
									generic_size_t bytesTransferred);

private:
			file_cache_ref*		fRef;
			VMCache*			fCache;
			vm_page**			fPages;
			size_t				fPageCount;
			ConditionVariable*	fBusyConditions;
			generic_io_vec*		fVecs;
			off_t				fOffset;
			uint32				fVecCount;
			generic_size_t		fSize;
#if DEBUG_PAGE_ACCESS
			thread_id			fAllocatingThread;
#endif
};

typedef status_t (*cache_func)(file_cache_ref* ref, void* cookie, off_t offset,
	int32 pageOffset, addr_t buffer, size_t bufferSize, bool useBuffer,
	vm_page_reservation* reservation, size_t reservePages);

static void add_to_iovec(generic_io_vec* vecs, uint32 &index, uint32 max,
	generic_addr_t address, generic_size_t size);


static struct cache_module_info* sCacheModule;


static const uint32 kZeroVecCount = 32;
static const size_t kZeroVecSize = kZeroVecCount * B_PAGE_SIZE;
static phys_addr_t sZeroPage;
static generic_io_vec sZeroVecs[kZeroVecCount];


//	#pragma mark -


PrecacheIO::PrecacheIO(file_cache_ref* ref, off_t offset, generic_size_t size)
	:
	fRef(ref),
	fCache(ref->cache),
	fPages(NULL),
	fVecs(NULL),
	fOffset(offset),
	fVecCount(0),
	fSize(size)
{
	fPageCount = (size + B_PAGE_SIZE - 1) / B_PAGE_SIZE;
	fCache->AcquireRefLocked();
}


PrecacheIO::~PrecacheIO()
{
	delete[] fPages;
	delete[] fVecs;
	fCache->ReleaseRefLocked();
}


status_t
PrecacheIO::Prepare(vm_page_reservation* reservation)
{
	if (fPageCount == 0)
		return B_BAD_VALUE;

	fPages = new(std::nothrow) vm_page*[fPageCount];
	if (fPages == NULL)
		return B_NO_MEMORY;

	fVecs = new(std::nothrow) generic_io_vec[fPageCount];
	if (fVecs == NULL)
		return B_NO_MEMORY;

	// allocate pages for the cache and mark them busy
	uint32 i = 0;
	for (generic_size_t pos = 0; pos < fSize; pos += B_PAGE_SIZE) {
		vm_page* page = vm_page_allocate_page(reservation,
			PAGE_STATE_CACHED | VM_PAGE_ALLOC_BUSY);

		fCache->InsertPage(page, fOffset + pos);

		add_to_iovec(fVecs, fVecCount, fPageCount,
			page->physical_page_number * B_PAGE_SIZE, B_PAGE_SIZE);
		fPages[i++] = page;
	}

#if DEBUG_PAGE_ACCESS
	fAllocatingThread = find_thread(NULL);
#endif

	return B_OK;
}


void
PrecacheIO::ReadAsync()
{
	// This object is going to be deleted after the I/O request has been
	// fulfilled
	vfs_asynchronous_read_pages(fRef->vnode, NULL, fOffset, fVecs, fVecCount,
		fSize, B_PHYSICAL_IO_REQUEST, this);
}


void
PrecacheIO::IOFinished(status_t status, bool partialTransfer,
	generic_size_t bytesTransferred)
{
	AutoLocker<VMCache> locker(fCache);

	// Make successfully loaded pages accessible again (partially
	// transferred pages are considered failed)
	phys_size_t pagesTransferred
		= (bytesTransferred + B_PAGE_SIZE - 1) / B_PAGE_SIZE;

	if (fOffset + (off_t)bytesTransferred > fCache->virtual_end)
		bytesTransferred = fCache->virtual_end - fOffset;

	for (uint32 i = 0; i < pagesTransferred; i++) {
		if (i == pagesTransferred - 1
			&& (bytesTransferred % B_PAGE_SIZE) != 0) {
			// clear partial page
			size_t bytesTouched = bytesTransferred % B_PAGE_SIZE;
			vm_memset_physical(
				((phys_addr_t)fPages[i]->physical_page_number << PAGE_SHIFT)
					+ bytesTouched,
				0, B_PAGE_SIZE - bytesTouched);
		}

		DEBUG_PAGE_ACCESS_TRANSFER(fPages[i], fAllocatingThread);

		fCache->MarkPageUnbusy(fPages[i]);

		DEBUG_PAGE_ACCESS_END(fPages[i]);
	}

	// Free pages after failed I/O
	for (uint32 i = pagesTransferred; i < fPageCount; i++) {
		DEBUG_PAGE_ACCESS_TRANSFER(fPages[i], fAllocatingThread);
		fCache->NotifyPageEvents(fPages[i], PAGE_EVENT_NOT_BUSY);
		fCache->RemovePage(fPages[i]);
		vm_page_set_state(fPages[i], PAGE_STATE_FREE);
	}

	delete this;
}


//	#pragma mark -


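// Appends a physical range to the given vec array, merging it with the
// previous vec if the two are physically contiguous. For example, two
// adjacent pages at 0x1000 and 0x2000 end up as a single vec
// { base = 0x1000, length = 0x2000 } instead of two one-page vecs, which
// keeps the vec count per request low.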
static void
add_to_iovec(generic_io_vec* vecs, uint32 &index, uint32 max,
	generic_addr_t address, generic_size_t size)
{
	if (index > 0 && vecs[index - 1].base + vecs[index - 1].length == address) {
		// the iovec can be combined with the previous one
		vecs[index - 1].length += size;
		return;
	}

	if (index == max)
		panic("no more space for iovecs!");

	// we need to start a new iovec
	vecs[index].base = address;
	vecs[index].length = size;
	index++;
}


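// Sequential access detection: push_access() below records the end offset of
// each request and zeroes the previous slot if the request did not start
// exactly where the last one ended. access_is_sequential() looks at the slot
// that will be overwritten next -- the oldest of the LAST_ACCESSES entries --
// so it only reports sequential access once there have been at least
// LAST_ACCESSES back-to-back requests without a gap.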
static inline bool
access_is_sequential(file_cache_ref* ref)
{
	return ref->last_access[ref->last_access_index] != 0;
}


static inline void
push_access(file_cache_ref* ref, off_t offset, generic_size_t bytes,
	bool isWrite)
{
	TRACE(("%p: push %lld, %ld, %s\n", ref, offset, bytes,
		isWrite ? "write" : "read"));

	int32 index = ref->last_access_index;
	int32 previous = index - 1;
	if (previous < 0)
		previous = LAST_ACCESSES - 1;

	if (offset != ref->LastAccess(previous, isWrite))
		ref->last_access[previous] = 0;

	ref->SetLastAccess(index, offset + bytes, isWrite);

	if (++index >= LAST_ACCESSES)
		index = 0;
	ref->last_access_index = index;
}


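// Reserves pages for a cache operation. When memory is tight and the cache is
// neither mapped nor shared with consumers, reserve_pages() first tries to
// relieve the pressure itself: for sequential writes it writes back the range
// between the two most recent accesses, for sequential reads it frees clean
// cached pages. Only then does it fall back to the generic page reservation.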
static void
reserve_pages(file_cache_ref* ref, vm_page_reservation* reservation,
	size_t reservePages, bool isWrite)
{
	if (low_resource_state(B_KERNEL_RESOURCE_PAGES) != B_NO_LOW_RESOURCE) {
		VMCache* cache = ref->cache;
		cache->Lock();

		if (cache->consumers.IsEmpty() && cache->areas == NULL
			&& access_is_sequential(ref)) {
			// we are not mapped, and we're accessed sequentially

			if (isWrite) {
				// Just write some pages back, and actually wait until they
				// have been written back in order to relieve the page pressure
				// a bit.
				int32 index = ref->last_access_index;
				int32 previous = index - 1;
				if (previous < 0)
					previous = LAST_ACCESSES - 1;

				vm_page_write_modified_page_range(cache,
					ref->LastAccessPageOffset(previous, true),
					ref->LastAccessPageOffset(index, true));
			} else {
				// free some pages from our cache
				// TODO: start with oldest
				uint32 left = reservePages;
				vm_page* page;
				for (VMCachePagesTree::Iterator it = cache->pages.GetIterator();
						(page = it.Next()) != NULL && left > 0;) {
					if (page->State() == PAGE_STATE_CACHED && !page->busy) {
						DEBUG_PAGE_ACCESS_START(page);
						ASSERT(!page->IsMapped());
						ASSERT(!page->modified);
						cache->RemovePage(page);
						vm_page_set_state(page, PAGE_STATE_FREE);
						left--;
					}
				}
			}
		}
		cache->Unlock();
	}

	vm_page_reserve_pages(reservation, reservePages, VM_PRIORITY_USER);
}


static inline status_t
read_pages_and_clear_partial(file_cache_ref* ref, void* cookie, off_t offset,
	const generic_io_vec* vecs, size_t count, uint32 flags,
	generic_size_t* _numBytes)
{
	generic_size_t bytesUntouched = *_numBytes;

	status_t status = vfs_read_pages(ref->vnode, cookie, offset, vecs, count,
		flags, _numBytes);

	generic_size_t bytesEnd = *_numBytes;

	if (offset + (off_t)bytesEnd > ref->cache->virtual_end)
		bytesEnd = ref->cache->virtual_end - offset;

	if (status == B_OK && bytesEnd < bytesUntouched) {
		// Clear out any leftovers that were not touched by the above read.
		// We're doing this here so that not every file system/device has to
		// implement this.
		bytesUntouched -= bytesEnd;

		for (int32 i = count; i-- > 0 && bytesUntouched != 0; ) {
			generic_size_t length = min_c(bytesUntouched, vecs[i].length);
			vm_memset_physical(vecs[i].base + vecs[i].length - length, 0,
				length);

			bytesUntouched -= length;
		}
	}

	return status;
}


/*!	Reads the requested amount of data into the cache, and allocates
	pages needed to fulfill that request. This function is called by cache_io().
	It can only handle a limited amount of bytes per call (at most MAX_IO_VECS
	pages), and the caller must make sure the request fits.
	The cache_ref lock must be held when calling this function; it is
	temporarily unlocked while the actual I/O is performed.
*/
static status_t
read_into_cache(file_cache_ref* ref, void* cookie, off_t offset,
	int32 pageOffset, addr_t buffer, size_t bufferSize, bool useBuffer,
	vm_page_reservation* reservation, size_t reservePages)
{
	TRACE(("read_into_cache(offset = %lld, pageOffset = %ld, buffer = %#lx, "
		"bufferSize = %lu\n", offset, pageOffset, buffer, bufferSize));

	VMCache* cache = ref->cache;

	// TODO: We're using way too much stack! Rather allocate a sufficiently
	// large chunk on the heap.
	generic_io_vec vecs[MAX_IO_VECS];
	uint32 vecCount = 0;

	generic_size_t numBytes = PAGE_ALIGN(pageOffset + bufferSize);
	vm_page* pages[MAX_IO_VECS];
	int32 pageIndex = 0;

	// allocate pages for the cache and mark them busy
	for (generic_size_t pos = 0; pos < numBytes; pos += B_PAGE_SIZE) {
		vm_page* page = pages[pageIndex++] = vm_page_allocate_page(
			reservation, PAGE_STATE_CACHED | VM_PAGE_ALLOC_BUSY);

		cache->InsertPage(page, offset + pos);

		add_to_iovec(vecs, vecCount, MAX_IO_VECS,
			page->physical_page_number * B_PAGE_SIZE, B_PAGE_SIZE);
			// TODO: check if the array is large enough (currently panics)!
	}

	push_access(ref, offset, bufferSize, false);
	cache->Unlock();
	vm_page_unreserve_pages(reservation);

	// read file into reserved pages
	status_t status = read_pages_and_clear_partial(ref, cookie, offset, vecs,
		vecCount, B_PHYSICAL_IO_REQUEST, &numBytes);
	if (status != B_OK) {
		// reading failed, free allocated pages

		dprintf("file_cache: read pages failed: %s\n", strerror(status));

		cache->Lock();

		for (int32 i = 0; i < pageIndex; i++) {
			cache->NotifyPageEvents(pages[i], PAGE_EVENT_NOT_BUSY);
			cache->RemovePage(pages[i]);
			vm_page_set_state(pages[i], PAGE_STATE_FREE);
		}

		return status;
	}

	// copy the pages if needed and unmap them again

	for (int32 i = 0; i < pageIndex; i++) {
		if (useBuffer && bufferSize != 0) {
			size_t bytes = min_c(bufferSize, (size_t)B_PAGE_SIZE - pageOffset);

			vm_memcpy_from_physical((void*)buffer,
				pages[i]->physical_page_number * B_PAGE_SIZE + pageOffset,
				bytes, IS_USER_ADDRESS(buffer));

			buffer += bytes;
			bufferSize -= bytes;
			pageOffset = 0;
		}
	}

	reserve_pages(ref, reservation, reservePages, false);
	cache->Lock();

	// make the pages accessible in the cache
	for (int32 i = pageIndex; i-- > 0;) {
		DEBUG_PAGE_ACCESS_END(pages[i]);

		cache->MarkPageUnbusy(pages[i]);
	}

	return B_OK;
}
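
// Note that read_into_cache() gives up both the cache lock and the caller's
// page reservation for the duration of the actual read; the newly inserted
// pages are marked busy, so concurrent requests for the same range block in
// WaitForPageEvents() until the pages are marked unbusy again (or removed on
// error).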


static status_t
read_from_file(file_cache_ref* ref, void* cookie, off_t offset,
	int32 pageOffset, addr_t buffer, size_t bufferSize, bool useBuffer,
	vm_page_reservation* reservation, size_t reservePages)
{
	TRACE(("read_from_file(offset = %lld, pageOffset = %ld, buffer = %#lx, "
		"bufferSize = %lu\n", offset, pageOffset, buffer, bufferSize));

	if (!useBuffer)
		return B_OK;

	generic_io_vec vec;
	vec.base = buffer;
	vec.length = bufferSize;

	push_access(ref, offset, bufferSize, false);
	ref->cache->Unlock();
	vm_page_unreserve_pages(reservation);

	generic_size_t toRead = bufferSize;
	status_t status = vfs_read_pages(ref->vnode, cookie, offset + pageOffset,
		&vec, 1, 0, &toRead);

	if (status == B_OK)
		reserve_pages(ref, reservation, reservePages, false);

	ref->cache->Lock();

	return status;
}


/*!	Like read_into_cache(), but writes data into the cache.
	To preserve data consistency, it might also read pages into the cache
	if only a partial page gets written.
	The same restrictions apply.
*/
static status_t
write_to_cache(file_cache_ref* ref, void* cookie, off_t offset,
	int32 pageOffset, addr_t buffer, size_t bufferSize, bool useBuffer,
	vm_page_reservation* reservation, size_t reservePages)
{
	// TODO: We're using way too much stack! Rather allocate a sufficiently
	// large chunk on the heap.
	generic_io_vec vecs[MAX_IO_VECS];
	uint32 vecCount = 0;
	generic_size_t numBytes = PAGE_ALIGN(pageOffset + bufferSize);
	vm_page* pages[MAX_IO_VECS];
	int32 pageIndex = 0;
	status_t status = B_OK;

	// ToDo: this should be settable somewhere
	bool writeThrough = false;

	// allocate pages for the cache and mark them busy
	for (generic_size_t pos = 0; pos < numBytes; pos += B_PAGE_SIZE) {
		// TODO: if space is becoming tight, and this cache is already grown
		//	big - shouldn't we better steal the pages directly in that case?
		//	(a working set like approach for the file cache)
		// TODO: the pages we allocate here should have been reserved upfront
		//	in cache_io()
		vm_page* page = pages[pageIndex++] = vm_page_allocate_page(
			reservation,
			(writeThrough ? PAGE_STATE_CACHED : PAGE_STATE_MODIFIED)
				| VM_PAGE_ALLOC_BUSY);

		page->modified = !writeThrough;

		ref->cache->InsertPage(page, offset + pos);

		add_to_iovec(vecs, vecCount, MAX_IO_VECS,
			page->physical_page_number * B_PAGE_SIZE, B_PAGE_SIZE);
	}

	push_access(ref, offset, bufferSize, true);
	ref->cache->Unlock();
	vm_page_unreserve_pages(reservation);

	// copy contents (and read in partially written pages first)
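	// (If the write does not start or end on a page boundary, the first
	// and/or last page is read back from the file -- or zero-filled beyond
	// the end of the file -- below, so that the bytes around the written
	// range stay valid once the whole page is marked modified.)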

	if (pageOffset != 0) {
		// This is only a partial write, so we have to read the rest of the page
		// from the file to have consistent data in the cache
		generic_io_vec readVec = { vecs[0].base, B_PAGE_SIZE };
		generic_size_t bytesRead = B_PAGE_SIZE;

		status = vfs_read_pages(ref->vnode, cookie, offset, &readVec, 1,
			B_PHYSICAL_IO_REQUEST, &bytesRead);
		// ToDo: handle errors for real!
		if (status < B_OK)
			panic("1. vfs_read_pages() failed: %s!\n", strerror(status));
	}

	size_t lastPageOffset = (pageOffset + bufferSize) % B_PAGE_SIZE;
	if (lastPageOffset != 0) {
		// get the last page in the I/O vectors
		generic_addr_t last = vecs[vecCount - 1].base
			+ vecs[vecCount - 1].length - B_PAGE_SIZE;

		if ((off_t)(offset + pageOffset + bufferSize) == ref->cache->virtual_end) {
			// the space in the page after this write action needs to be cleaned
			vm_memset_physical(last + lastPageOffset, 0,
				B_PAGE_SIZE - lastPageOffset);
		} else {
			// the end of this write does not happen on a page boundary, so we
			// need to fetch the last page before we can update it
			generic_io_vec readVec = { last, B_PAGE_SIZE };
			generic_size_t bytesRead = B_PAGE_SIZE;

			status = vfs_read_pages(ref->vnode, cookie,
				PAGE_ALIGN(offset + pageOffset + bufferSize) - B_PAGE_SIZE,
				&readVec, 1, B_PHYSICAL_IO_REQUEST, &bytesRead);
			// ToDo: handle errors for real!
			if (status < B_OK)
				panic("vfs_read_pages() failed: %s!\n", strerror(status));

			if (bytesRead < B_PAGE_SIZE) {
				// the space beyond the file size needs to be cleaned
				vm_memset_physical(last + bytesRead, 0,
					B_PAGE_SIZE - bytesRead);
			}
		}
	}

	for (uint32 i = 0; i < vecCount; i++) {
		generic_addr_t base = vecs[i].base;
		generic_size_t bytes = min_c((generic_size_t)bufferSize,
			generic_size_t(vecs[i].length - pageOffset));

		if (useBuffer) {
			// copy data from user buffer
			vm_memcpy_to_physical(base + pageOffset, (void*)buffer, bytes,
				IS_USER_ADDRESS(buffer));
		} else {
			// clear buffer instead
			vm_memset_physical(base + pageOffset, 0, bytes);
		}

		bufferSize -= bytes;
		if (bufferSize == 0)
			break;

		buffer += bytes;
		pageOffset = 0;
	}

	if (writeThrough) {
		// write cached pages back to the file if we were asked to do that
		status_t status = vfs_write_pages(ref->vnode, cookie, offset, vecs,
			vecCount, B_PHYSICAL_IO_REQUEST, &numBytes);
		if (status < B_OK) {
			// ToDo: remove allocated pages, ...?
			panic("file_cache: remove allocated pages! write pages failed: %s\n",
				strerror(status));
		}
	}

	if (status == B_OK)
		reserve_pages(ref, reservation, reservePages, true);

	ref->cache->Lock();

	// make the pages accessible in the cache
	for (int32 i = pageIndex; i-- > 0;) {
		ref->cache->MarkPageUnbusy(pages[i]);

		DEBUG_PAGE_ACCESS_END(pages[i]);
	}

	return status;
}


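// Writes zeroes to the file without needing a cleared buffer from the caller:
// every vec in sZeroVecs points at the single sZeroPage, so up to kZeroVecSize
// bytes can be zeroed with one vfs_write_pages() call. Shorter requests use a
// reduced vec count (for whole pages) or a single partial-page vec.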
static status_t
write_zeros_to_file(struct vnode* vnode, void* cookie, off_t offset,
	size_t* _size)
{
	size_t size = *_size;
	status_t status = B_OK;
	while (size > 0) {
		generic_size_t length = min_c(size, kZeroVecSize);
		generic_io_vec* vecs = sZeroVecs;
		generic_io_vec vec;
		size_t count = kZeroVecCount;
		if (length != kZeroVecSize) {
			if (length > B_PAGE_SIZE) {
				length = ROUNDDOWN(length, B_PAGE_SIZE);
				count = length / B_PAGE_SIZE;
			} else {
				vec.base = sZeroPage;
				vec.length = length;
				vecs = &vec;
				count = 1;
			}
		}

		status = vfs_write_pages(vnode, cookie, offset,
			vecs, count, B_PHYSICAL_IO_REQUEST, &length);
		if (status != B_OK || length == 0)
			break;

		offset += length;
		size -= length;
	}

	*_size = *_size - size;
	return status;
}


static status_t
write_to_file(file_cache_ref* ref, void* cookie, off_t offset, int32 pageOffset,
	addr_t buffer, size_t bufferSize, bool useBuffer,
	vm_page_reservation* reservation, size_t reservePages)
{
	push_access(ref, offset, bufferSize, true);
	ref->cache->Unlock();
	vm_page_unreserve_pages(reservation);

	status_t status = B_OK;

	if (!useBuffer) {
		status = write_zeros_to_file(ref->vnode, cookie, offset + pageOffset,
			&bufferSize);
	} else {
		generic_io_vec vec;
		vec.base = buffer;
		vec.length = bufferSize;
		generic_size_t toWrite = bufferSize;
		status = vfs_write_pages(ref->vnode, cookie, offset + pageOffset,
			&vec, 1, 0, &toWrite);
	}

	if (status == B_OK)
		reserve_pages(ref, reservation, reservePages, true);

	ref->cache->Lock();

	return status;
}


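// Flushes the "gap" that cache_io() has accumulated so far: everything
// between lastOffset/lastBuffer and the current position is handed to the
// active read/write hook, and the last* bookkeeping is advanced to the
// current position on success. cache_io() calls this whenever it is about to
// give up the cache lock, so that no one else can interfere with the part of
// the request that has already been decided on.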
static inline status_t
satisfy_cache_io(file_cache_ref* ref, void* cookie, cache_func function,
	off_t offset, addr_t buffer, bool useBuffer, int32 &pageOffset,
	size_t bytesLeft, size_t &reservePages, off_t &lastOffset,
	addr_t &lastBuffer, int32 &lastPageOffset, size_t &lastLeft,
	size_t &lastReservedPages, vm_page_reservation* reservation)
{
	if (lastBuffer == buffer)
		return B_OK;

	size_t requestSize = buffer - lastBuffer;
	reservePages = min_c(MAX_IO_VECS, (lastLeft - requestSize
		+ lastPageOffset + B_PAGE_SIZE - 1) >> PAGE_SHIFT);

	status_t status = function(ref, cookie, lastOffset, lastPageOffset,
		lastBuffer, requestSize, useBuffer, reservation, reservePages);
	if (status == B_OK) {
		lastReservedPages = reservePages;
		lastBuffer = buffer;
		lastLeft = bytesLeft;
		lastOffset = offset;
		lastPageOffset = 0;
		pageOffset = 0;
	}
	return status;
}


static status_t
cache_io(void* _cacheRef, void* cookie, off_t offset, addr_t buffer,
	size_t* _size, bool doWrite)
{
	if (_cacheRef == NULL)
		panic("cache_io() called with NULL ref!\n");

	file_cache_ref* ref = (file_cache_ref*)_cacheRef;
	VMCache* cache = ref->cache;
	bool useBuffer = buffer != 0;

	TRACE(("cache_io(ref = %p, offset = %lld, buffer = %p, size = %lu, %s)\n",
		ref, offset, (void*)buffer, *_size, doWrite ? "write" : "read"));

	int32 pageOffset = offset & (B_PAGE_SIZE - 1);
	size_t size = *_size;
	offset -= pageOffset;

	// "offset" and "lastOffset" are always aligned to B_PAGE_SIZE,
	// the "last*" variables always point to the end of the last
	// satisfied request part
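	// Example: for a three page read whose middle page is already cached,
	// the first (missing) page only grows the unsatisfied gap, the cached
	// page triggers satisfy_cache_io() for that gap and is then copied
	// directly, and the remaining page is handled by the final call below
	// the loop.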

	const uint32 kMaxChunkSize = MAX_IO_VECS * B_PAGE_SIZE;
	size_t bytesLeft = size, lastLeft = size;
	int32 lastPageOffset = pageOffset;
	addr_t lastBuffer = buffer;
	off_t lastOffset = offset;
	size_t lastReservedPages = min_c(MAX_IO_VECS, (pageOffset + bytesLeft
		+ B_PAGE_SIZE - 1) >> PAGE_SHIFT);
	size_t reservePages = 0;
	size_t pagesProcessed = 0;
	cache_func function = NULL;

	vm_page_reservation reservation;
	reserve_pages(ref, &reservation, lastReservedPages, doWrite);

	AutoLocker<VMCache> locker(cache);

	while (bytesLeft > 0) {
		// Periodically reevaluate the low memory situation and select the
		// read/write hook accordingly
		if (pagesProcessed % 32 == 0) {
			if (size >= BYPASS_IO_SIZE
				&& low_resource_state(B_KERNEL_RESOURCE_PAGES)
					!= B_NO_LOW_RESOURCE) {
				// In low memory situations we bypass the cache beyond a
				// certain I/O size.
				function = doWrite ? write_to_file : read_from_file;
			} else
				function = doWrite ? write_to_cache : read_into_cache;
		}

		// check if this page is already in memory
		vm_page* page = cache->LookupPage(offset);
		if (page != NULL) {
			// The page may be busy - since we need to unlock the cache sometime
			// in the near future, we need to satisfy the request of the pages
			// we didn't get yet (to make sure no one else interferes in the
			// meantime).
			status_t status = satisfy_cache_io(ref, cookie, function, offset,
				buffer, useBuffer, pageOffset, bytesLeft, reservePages,
				lastOffset, lastBuffer, lastPageOffset, lastLeft,
				lastReservedPages, &reservation);
			if (status != B_OK)
				return status;

			// Since satisfy_cache_io() unlocks the cache, we need to look up
			// the page again.
			page = cache->LookupPage(offset);
			if (page != NULL && page->busy) {
				cache->WaitForPageEvents(page, PAGE_EVENT_NOT_BUSY, true);
				continue;
			}
		}

		size_t bytesInPage = min_c(size_t(B_PAGE_SIZE - pageOffset), bytesLeft);

		TRACE(("lookup page from offset %lld: %p, size = %lu, pageOffset "
			"= %lu\n", offset, page, bytesLeft, pageOffset));

		if (page != NULL) {
			if (doWrite || useBuffer) {
				// Since the following user_mem{cpy,set}() might cause a page
				// fault, which in turn might cause pages to be reserved, we
				// need to unlock the cache temporarily to avoid a potential
				// deadlock. To make sure that our page doesn't go away, we mark
				// it busy for the time.
				page->busy = true;
				locker.Unlock();

				// copy the contents of the page already in memory
				phys_addr_t pageAddress
					= (phys_addr_t)page->physical_page_number * B_PAGE_SIZE
						+ pageOffset;
				bool userBuffer = IS_USER_ADDRESS(buffer);
				if (doWrite) {
					if (useBuffer) {
						vm_memcpy_to_physical(pageAddress, (void*)buffer,
							bytesInPage, userBuffer);
					} else {
						vm_memset_physical(pageAddress, 0, bytesInPage);
					}
				} else if (useBuffer) {
					vm_memcpy_from_physical((void*)buffer, pageAddress,
						bytesInPage, userBuffer);
				}

				locker.Lock();

				if (doWrite) {
					DEBUG_PAGE_ACCESS_START(page);

					page->modified = true;

					if (page->State() != PAGE_STATE_MODIFIED)
						vm_page_set_state(page, PAGE_STATE_MODIFIED);

					DEBUG_PAGE_ACCESS_END(page);
				}

				cache->MarkPageUnbusy(page);
			}

			// If it is cached only, requeue the page, so the respective queue
			// roughly remains LRU first sorted.
			if (page->State() == PAGE_STATE_CACHED
					|| page->State() == PAGE_STATE_MODIFIED) {
				DEBUG_PAGE_ACCESS_START(page);
				vm_page_requeue(page, true);
				DEBUG_PAGE_ACCESS_END(page);
			}

			if (bytesLeft <= bytesInPage) {
				// we've read the last page, so we're done!
				locker.Unlock();
				vm_page_unreserve_pages(&reservation);
				return B_OK;
			}

			// prepare a potential gap request
			lastBuffer = buffer + bytesInPage;
			lastLeft = bytesLeft - bytesInPage;
			lastOffset = offset + B_PAGE_SIZE;
			lastPageOffset = 0;
		}

		if (bytesLeft <= bytesInPage)
			break;

		buffer += bytesInPage;
		bytesLeft -= bytesInPage;
		pageOffset = 0;
		offset += B_PAGE_SIZE;
		pagesProcessed++;

		if (buffer - lastBuffer + lastPageOffset >= kMaxChunkSize) {
			status_t status = satisfy_cache_io(ref, cookie, function, offset,
				buffer, useBuffer, pageOffset, bytesLeft, reservePages,
				lastOffset, lastBuffer, lastPageOffset, lastLeft,
				lastReservedPages, &reservation);
			if (status != B_OK)
				return status;
		}
	}

	// fill the last remaining bytes of the request (either write or read)

	return function(ref, cookie, lastOffset, lastPageOffset, lastBuffer,
		lastLeft, useBuffer, &reservation, 0);
}


static status_t
file_cache_control(const char* subsystem, uint32 function, void* buffer,
	size_t bufferSize)
{
	switch (function) {
		case CACHE_CLEAR:
			// ToDo: clear the cache
			dprintf("cache_control: clear cache!\n");
			return B_OK;

		case CACHE_SET_MODULE:
		{
			cache_module_info* module = sCacheModule;

			// unset previous module

			if (sCacheModule != NULL) {
				sCacheModule = NULL;
				snooze(100000);	// 0.1 secs
				put_module(module->info.name);
			}

			// get new module, if any

			if (buffer == NULL)
				return B_OK;

			char name[B_FILE_NAME_LENGTH];
			if (!IS_USER_ADDRESS(buffer)
				|| user_strlcpy(name, (char*)buffer,
						B_FILE_NAME_LENGTH) < B_OK)
				return B_BAD_ADDRESS;

			if (strncmp(name, CACHE_MODULES_NAME, strlen(CACHE_MODULES_NAME)))
				return B_BAD_VALUE;

			dprintf("cache_control: set module %s!\n", name);

			status_t status = get_module(name, (module_info**)&module);
			if (status == B_OK)
				sCacheModule = module;

			return status;
		}
	}

	return B_BAD_HANDLER;
}


//	#pragma mark - private kernel API


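// Reads parts of the given range into the file cache ahead of time. The range
// is walked page by page; consecutive pages that are not yet cached are
// batched into a single asynchronous PrecacheIO request, which is submitted
// whenever an already cached page (or the end of the range) is reached.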
extern "C" void
cache_prefetch_vnode(struct vnode* vnode, off_t offset, size_t size)
{
	if (size == 0)
		return;

	VMCache* cache;
	if (vfs_get_vnode_cache(vnode, &cache, false) != B_OK)
		return;
	if (cache->type != CACHE_TYPE_VNODE) {
		cache->ReleaseRef();
		return;
	}

	file_cache_ref* ref = ((VMVnodeCache*)cache)->FileCacheRef();
	off_t fileSize = cache->virtual_end;

	if ((off_t)(offset + size) > fileSize)
		size = fileSize - offset;

	// "offset" and "size" are always aligned to B_PAGE_SIZE,
	offset = ROUNDDOWN(offset, B_PAGE_SIZE);
	size = ROUNDUP(size, B_PAGE_SIZE);

	size_t reservePages = size / B_PAGE_SIZE;

	// Don't do anything if we don't have the resources left, or the cache
	// already contains more than 2/3 of its pages
	if (offset >= fileSize || vm_page_num_unused_pages() < 2 * reservePages
		|| 3 * cache->page_count > 2 * fileSize / B_PAGE_SIZE) {
		cache->ReleaseRef();
		return;
	}

	size_t bytesToRead = 0;
	off_t lastOffset = offset;

	vm_page_reservation reservation;
	vm_page_reserve_pages(&reservation, reservePages, VM_PRIORITY_USER);

	cache->Lock();

	while (true) {
		// check if this page is already in memory
		if (size > 0) {
			vm_page* page = cache->LookupPage(offset);

			offset += B_PAGE_SIZE;
			size -= B_PAGE_SIZE;

			if (page == NULL) {
				bytesToRead += B_PAGE_SIZE;
				continue;
			}
		}
		if (bytesToRead != 0) {
			// read the part before the current page (or the end of the request)
			PrecacheIO* io = new(std::nothrow) PrecacheIO(ref, lastOffset,
				bytesToRead);
			if (io == NULL || io->Prepare(&reservation) != B_OK) {
				delete io;
				break;
			}

			// we must not have the cache locked during I/O
			cache->Unlock();
			io->ReadAsync();
			cache->Lock();

			bytesToRead = 0;
		}

		if (size == 0) {
			// we have reached the end of the request
			break;
		}

		lastOffset = offset;
	}

	cache->ReleaseRefAndUnlock();
	vm_page_unreserve_pages(&reservation);
}


extern "C" void
cache_prefetch(dev_t mountID, ino_t vnodeID, off_t offset, size_t size)
{
	// ToDo: schedule prefetch

	TRACE(("cache_prefetch(vnode %ld:%lld)\n", mountID, vnodeID));

	// get the vnode for the object, this also grabs a ref to it
	struct vnode* vnode;
	if (vfs_get_vnode(mountID, vnodeID, true, &vnode) != B_OK)
		return;

	cache_prefetch_vnode(vnode, offset, size);
	vfs_put_vnode(vnode);
}


extern "C" void
cache_node_opened(struct vnode* vnode, int32 fdType, VMCache* cache,
	dev_t mountID, ino_t parentID, ino_t vnodeID, const char* name)
{
	if (sCacheModule == NULL || sCacheModule->node_opened == NULL)
		return;

	off_t size = -1;
	if (cache != NULL && cache->type == CACHE_TYPE_VNODE) {
		file_cache_ref* ref = ((VMVnodeCache*)cache)->FileCacheRef();
		if (ref != NULL)
			size = cache->virtual_end;
	}

	sCacheModule->node_opened(vnode, fdType, mountID, parentID, vnodeID, name,
		size);
}


extern "C" void
cache_node_closed(struct vnode* vnode, int32 fdType, VMCache* cache,
	dev_t mountID, ino_t vnodeID)
{
	if (sCacheModule == NULL || sCacheModule->node_closed == NULL)
		return;

	int32 accessType = 0;
	if (cache != NULL && cache->type == CACHE_TYPE_VNODE) {
		// ToDo: set accessType
	}

	sCacheModule->node_closed(vnode, fdType, mountID, vnodeID, accessType);
}


extern "C" void
cache_node_launched(size_t argCount, char*  const* args)
{
	if (sCacheModule == NULL || sCacheModule->node_launched == NULL)
		return;

	sCacheModule->node_launched(argCount, args);
}


extern "C" status_t
file_cache_init_post_boot_device(void)
{
	// ToDo: get cache module out of driver settings

	if (get_module("file_cache/launch_speedup/v1",
			(module_info**)&sCacheModule) == B_OK) {
		dprintf("** opened launch speedup: %" B_PRId64 "\n", system_time());
	}
	return B_OK;
}


extern "C" status_t
file_cache_init(void)
{
	// allocate a clean page we can use for writing zeroes
	vm_page_reservation reservation;
	vm_page_reserve_pages(&reservation, 1, VM_PRIORITY_SYSTEM);
	vm_page* page = vm_page_allocate_page(&reservation,
		PAGE_STATE_WIRED | VM_PAGE_ALLOC_CLEAR);
	vm_page_unreserve_pages(&reservation);

	sZeroPage = (phys_addr_t)page->physical_page_number * B_PAGE_SIZE;

	for (uint32 i = 0; i < kZeroVecCount; i++) {
		sZeroVecs[i].base = sZeroPage;
		sZeroVecs[i].length = B_PAGE_SIZE;
	}

	register_generic_syscall(CACHE_SYSCALLS, file_cache_control, 1, 0);
	return B_OK;
}


//	#pragma mark - public FS API


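// Typical use by a file system (a sketch only; the hook names are up to the
// caller): file_cache_create() when a vnode with file data is published,
// file_cache_read()/file_cache_write() from its read/write hooks,
// file_cache_set_size() when the file is resized, file_cache_sync() on fsync,
// and file_cache_delete() when the vnode is torn down.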
extern "C" void*
file_cache_create(dev_t mountID, ino_t vnodeID, off_t size)
{
	TRACE(("file_cache_create(mountID = %ld, vnodeID = %lld, size = %lld)\n",
		mountID, vnodeID, size));

	file_cache_ref* ref = new file_cache_ref;
	if (ref == NULL)
		return NULL;

	memset(ref->last_access, 0, sizeof(ref->last_access));
	ref->last_access_index = 0;
	ref->disabled_count = 0;

	// TODO: delay VMCache creation until data is
	//	requested/written for the first time? Listing lots of
	//	files in Tracker (and elsewhere) could be slowed down.
	//	Since the file_cache_ref itself doesn't have a lock,
	//	we would need to "rent" one during construction, possibly
	//	the vnode lock, maybe a dedicated one.
	//	As there shouldn't be too much contention, we could also
	//	use atomic_test_and_set(), and free the resources again
	//	when that fails...

	// Get the vnode for the object
	// (note, this does not grab a reference to the node)
	if (vfs_lookup_vnode(mountID, vnodeID, &ref->vnode) != B_OK)
		goto err1;

	// Gets (usually creates) the cache for the node
	if (vfs_get_vnode_cache(ref->vnode, &ref->cache, true) != B_OK)
		goto err1;

	ref->cache->virtual_end = size;
	((VMVnodeCache*)ref->cache)->SetFileCacheRef(ref);
	return ref;

err1:
	delete ref;
	return NULL;
}


extern "C" void
file_cache_delete(void* _cacheRef)
{
	file_cache_ref* ref = (file_cache_ref*)_cacheRef;

	if (ref == NULL)
		return;

	TRACE(("file_cache_delete(ref = %p)\n", ref));

	ref->cache->ReleaseRef();
	delete ref;
}


extern "C" void
file_cache_enable(void* _cacheRef)
{
	file_cache_ref* ref = (file_cache_ref*)_cacheRef;

	AutoLocker<VMCache> _(ref->cache);

	if (ref->disabled_count == 0) {
		panic("Unbalanced file_cache_enable()!");
		return;
	}

	ref->disabled_count--;
}


extern "C" status_t
file_cache_disable(void* _cacheRef)
{
	// TODO: This function only removes all pages from the cache and prevents
	// that the file cache functions add any new ones until re-enabled. The
	// VM (on page fault) can still add pages, if the file is mmap()ed. We
	// should mark the cache to prevent shared mappings of the file and fix
	// the page fault code to deal correctly with private mappings (i.e. only
	// insert pages in consumer caches).

	file_cache_ref* ref = (file_cache_ref*)_cacheRef;

	AutoLocker<VMCache> _(ref->cache);

	// If already disabled, there's nothing to do for us.
	if (ref->disabled_count > 0) {
		ref->disabled_count++;
		return B_OK;
	}

	// The file cache is not yet disabled. We need to evict all cached pages.
	status_t error = ref->cache->FlushAndRemoveAllPages();
	if (error != B_OK)
		return error;

	ref->disabled_count++;
	return B_OK;
}


extern "C" bool
file_cache_is_enabled(void* _cacheRef)
{
	file_cache_ref* ref = (file_cache_ref*)_cacheRef;
	AutoLocker<VMCache> _(ref->cache);

	return ref->disabled_count == 0;
}


extern "C" status_t
file_cache_set_size(void* _cacheRef, off_t newSize)
{
	file_cache_ref* ref = (file_cache_ref*)_cacheRef;

	TRACE(("file_cache_set_size(ref = %p, size = %lld)\n", ref, newSize));

	if (ref == NULL)
		return B_OK;

	VMCache* cache = ref->cache;
	AutoLocker<VMCache> _(cache);

	off_t oldSize = cache->virtual_end;
	status_t status = cache->Resize(newSize, VM_PRIORITY_USER);
		// Note, the priority doesn't really matter, since this cache doesn't
		// reserve any memory.
	if (status == B_OK && newSize < oldSize) {
		// We may have a new partial page at the end of the cache that must be
		// cleared.
		uint32 partialBytes = newSize % B_PAGE_SIZE;
		if (partialBytes != 0) {
			vm_page* page = cache->LookupPage(newSize - partialBytes);
			if (page != NULL) {
				vm_memset_physical(page->physical_page_number * B_PAGE_SIZE
					+ partialBytes, 0, B_PAGE_SIZE - partialBytes);
			}
		}
	}

	return status;
}


extern "C" status_t
file_cache_sync(void* _cacheRef)
{
	file_cache_ref* ref = (file_cache_ref*)_cacheRef;
	if (ref == NULL)
		return B_BAD_VALUE;

	return ref->cache->WriteModified();
}


extern "C" status_t
file_cache_read(void* _cacheRef, void* cookie, off_t offset, void* buffer,
	size_t* _size)
{
	file_cache_ref* ref = (file_cache_ref*)_cacheRef;

	TRACE(("file_cache_read(ref = %p, offset = %lld, buffer = %p, size = %lu)\n",
		ref, offset, buffer, *_size));

	// Bounds checking. We do this here so it applies to uncached I/O.
	if (offset < 0)
		return B_BAD_VALUE;
	const off_t fileSize = ref->cache->virtual_end;
	if (offset >= fileSize || *_size == 0) {
		*_size = 0;
		return B_OK;
	}
	if ((off_t)(offset + *_size) > fileSize)
		*_size = fileSize - offset;

	if (ref->disabled_count > 0) {
		// Caching is disabled -- read directly from the file.
		generic_io_vec vec;
		vec.base = (addr_t)buffer;
		generic_size_t size = vec.length = *_size;
		status_t error = vfs_read_pages(ref->vnode, cookie, offset, &vec, 1, 0,
			&size);
		*_size = size;
		return error;
	}

	return cache_io(ref, cookie, offset, (addr_t)buffer, _size, false);
}


extern "C" status_t
file_cache_write(void* _cacheRef, void* cookie, off_t offset,
	const void* buffer, size_t* _size)
{
	file_cache_ref* ref = (file_cache_ref*)_cacheRef;

	// We don't do bounds checking here, as we are relying on the
	// file system which called us to already have done that and made
	// adjustments as necessary, unlike in read().

	if (ref->disabled_count > 0) {
		// Caching is disabled -- write directly to the file.
		if (buffer != NULL) {
			generic_io_vec vec;
			vec.base = (addr_t)buffer;
			generic_size_t size = vec.length = *_size;

			status_t error = vfs_write_pages(ref->vnode, cookie, offset, &vec,
				1, 0, &size);
			*_size = size;
			return error;
		}
		return write_zeros_to_file(ref->vnode, cookie, offset, _size);
	}

	status_t status = cache_io(ref, cookie, offset,
		(addr_t)const_cast<void*>(buffer), _size, true);

	TRACE(("file_cache_write(ref = %p, offset = %lld, buffer = %p, size = %lu)"
		" = %ld\n", ref, offset, buffer, *_size, status));

	return status;
}
