/*
 * Copyright 2005-2013, Ingo Weinhold, ingo_weinhold@gmx.de.
 * Copyright 2002-2018, Axel Dörfler, axeld@pinc-software.de.
 * Distributed under the terms of the MIT License.
 *
 * Copyright 2001-2002, Travis Geiselbrecht. All rights reserved.
 * Distributed under the terms of the NewOS License.
 */


/*! Virtual File System and File System Interface Layer */


#include <ctype.h>
#include <fcntl.h>
#include <limits.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>
#include <sys/file.h>
#include <sys/ioctl.h>
#include <sys/resource.h>
#include <sys/stat.h>
#include <unistd.h>

#include <fs_attr.h>
#include <fs_info.h>
#include <fs_interface.h>
#include <fs_volume.h>
#include <NodeMonitor.h>
#include <OS.h>
#include <StorageDefs.h>

#include <AutoDeleter.h>
#include <AutoDeleterDrivers.h>
#include <block_cache.h>
#include <boot/kernel_args.h>
#include <debug_heap.h>
#include <disk_device_manager/KDiskDevice.h>
#include <disk_device_manager/KDiskDeviceManager.h>
#include <disk_device_manager/KDiskDeviceUtils.h>
#include <disk_device_manager/KDiskSystem.h>
#include <fd.h>
#include <file_cache.h>
#include <fs/node_monitor.h>
#include <KPath.h>
#include <lock.h>
#include <low_resource_manager.h>
#include <slab/Slab.h>
#include <StackOrHeapArray.h>
#include <syscalls.h>
#include <syscall_restart.h>
#include <tracing.h>
#include <util/atomic.h>
#include <util/AutoLock.h>
#include <util/ThreadAutoLock.h>
#include <util/DoublyLinkedList.h>
#include <vfs.h>
#include <vm/vm.h>
#include <vm/VMCache.h>
#include <wait_for_objects.h>

#include "EntryCache.h"
#include "fifo.h"
#include "IORequest.h"
#include "unused_vnodes.h"
#include "vfs_tracing.h"
#include "Vnode.h"
#include "../cache/vnode_store.h"


//#define TRACE_VFS
#ifdef TRACE_VFS
#	define TRACE(x) dprintf x
#	define FUNCTION(x) dprintf x
#else
#	define TRACE(x) ;
#	define FUNCTION(x) ;
#endif

#define ADD_DEBUGGER_COMMANDS


#define HAS_FS_CALL(vnode, op)			(vnode->ops->op != NULL)
#define HAS_FS_MOUNT_CALL(mount, op)	(mount->volume->ops->op != NULL)

#if KDEBUG
#	define FS_CALL(vnode, op, params...) \
		( HAS_FS_CALL(vnode, op) ? \
			vnode->ops->op(vnode->mount->volume, vnode, params) \
			: (panic("FS_CALL: vnode %p op " #op " is NULL", vnode), 0))
#	define FS_CALL_NO_PARAMS(vnode, op) \
		( HAS_FS_CALL(vnode, op) ? \
			vnode->ops->op(vnode->mount->volume, vnode) \
			: (panic("FS_CALL_NO_PARAMS: vnode %p op " #op " is NULL", vnode), 0))
#	define FS_MOUNT_CALL(mount, op, params...) \
		( HAS_FS_MOUNT_CALL(mount, op) ? \
			mount->volume->ops->op(mount->volume, params) \
			: (panic("FS_MOUNT_CALL: mount %p op " #op " is NULL", mount), 0))
#	define FS_MOUNT_CALL_NO_PARAMS(mount, op) \
		( HAS_FS_MOUNT_CALL(mount, op) ? \
			mount->volume->ops->op(mount->volume) \
			: (panic("FS_MOUNT_CALL_NO_PARAMS: mount %p op " #op " is NULL", mount), 0))
#else
#	define FS_CALL(vnode, op, params...) \
			vnode->ops->op(vnode->mount->volume, vnode, params)
#	define FS_CALL_NO_PARAMS(vnode, op) \
			vnode->ops->op(vnode->mount->volume, vnode)
#	define FS_MOUNT_CALL(mount, op, params...) \
			mount->volume->ops->op(mount->volume, params)
#	define FS_MOUNT_CALL_NO_PARAMS(mount, op) \
			mount->volume->ops->op(mount->volume)
#endif
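

// Usage sketch for the FS_CALL macros (illustrative only, not part of this
// file): callers check for the hook with HAS_FS_CALL() and then dispatch
// through FS_CALL(), which expands here to
// vnode->ops->read_stat(vnode->mount->volume, vnode, stat) -- with an
// additional NULL-hook panic under KDEBUG.
#if 0
static status_t
example_read_stat(struct vnode* vnode, struct stat* stat)
{
	if (!HAS_FS_CALL(vnode, read_stat))
		return B_UNSUPPORTED;
	return FS_CALL(vnode, read_stat, stat);
}
#endif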


const static size_t kMaxPathLength = 65536;
	// The absolute maximum path length (for getcwd()); this does not depend
	// on PATH_MAX.


typedef DoublyLinkedList<vnode> VnodeList;

/*!	\brief Structure to manage a mounted file system

	Note: The root_vnode and root_vnode->covers fields (what others?) are
	initialized in fs_mount() and not changed afterwards. That is, as soon
	as the mount is mounted and it is ensured that it won't be unmounted
	(e.g. by holding a reference to a vnode of that mount), (read) access
	to those fields is always safe, even without additional locking. Moreover,
	while mounted the mount holds a reference to the root_vnode->covers vnode,
	thus making the access path vnode->mount->root_vnode->covers->mount->...
	safe if a reference to vnode is held (note that for the root mount
	root_vnode->covers is NULL, though).
*/
struct fs_mount {
	fs_mount()
		:
		volume(NULL),
		device_name(NULL)
	{
		mutex_init(&lock, "mount lock");
	}

	~fs_mount()
	{
		mutex_destroy(&lock);
		free(device_name);

		while (volume) {
			fs_volume* superVolume = volume->super_volume;

			if (volume->file_system != NULL)
				put_module(volume->file_system->info.name);

			free(volume->file_system_name);
			free(volume);
			volume = superVolume;
		}
	}

	struct fs_mount* next;
	dev_t			id;
	fs_volume*		volume;
	char*			device_name;
	mutex			lock;	// guards the vnodes list
	struct vnode*	root_vnode;
	struct vnode*	covers_vnode;	// immutable
	KPartition*		partition;
	VnodeList		vnodes;
	EntryCache		entry_cache;
	bool			unmounting;
	bool			owns_file_device;
};


namespace {

struct advisory_lock : public DoublyLinkedListLinkImpl<advisory_lock> {
	list_link		link;
	void*			bound_to;
	team_id			team;
	pid_t			session;
	off_t			start;
	off_t			end;
	bool			shared;
};

typedef DoublyLinkedList<advisory_lock> LockList;

} // namespace


struct advisory_locking {
	sem_id			lock;
	sem_id			wait_sem;
	LockList		locks;

	advisory_locking()
		:
		lock(-1),
		wait_sem(-1)
	{
	}

	~advisory_locking()
	{
		if (lock >= 0)
			delete_sem(lock);
		if (wait_sem >= 0)
			delete_sem(wait_sem);
	}
};

/*!	\brief Guards sMountsTable.

	The holder is allowed read/write access to sMountsTable.
	Manipulation of the fs_mount structures themselves
	(and their destruction) requires different locks though.
*/
static rw_lock sMountLock = RW_LOCK_INITIALIZER("vfs_mount_lock");

/*!	\brief Guards mount/unmount operations.

	fs_mount() and fs_unmount() hold the lock for the duration of their
	operation. That is, locking it ensures that no FS is mounted/unmounted.
	In particular this means that
	- sMountsTable will not be modified,
	- the fields immutable after initialization of the fs_mount structures in
	  sMountsTable will not be modified.

	The thread trying to lock the lock must not hold sVnodeLock or
	sMountLock.
*/
static recursive_lock sMountOpLock;

/*!	\brief Guards sVnodeTable.

	The holder is allowed read/write access to sVnodeTable and to
	any unbusy vnode in that table, save for the immutable fields (device, id,
	private_node, mount) to which only read-only access is allowed.
	The mutable fields advisory_locking, mandatory_locked_by, and ref_count, as
	well as the busy, removed, unused flags, and the vnode's type may also be
	written when holding a read lock to sVnodeLock *and* having the vnode
	locked. Write access to covered_by and covers requires write-locking
	sVnodeLock.

	The thread trying to acquire the lock must not hold sMountLock.
	You must not hold this lock when calling create_sem(), as this might call
	vfs_free_unused_vnodes() and thus cause a deadlock.
*/
static rw_lock sVnodeLock = RW_LOCK_INITIALIZER("vfs_vnode_lock");

/*!	\brief Guards io_context::root.

	Must be held when setting or getting the io_context::root field.
	The only operation allowed while holding this lock besides getting or
	setting the field is inc_vnode_ref_count() on io_context::root.
*/
static mutex sIOContextRootLock = MUTEX_INITIALIZER("io_context::root lock");
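

// Lock-ordering sketch (illustrative only, not part of this file), following
// the rules documented above: sVnodeLock must be acquired before sMountLock,
// and sMountOpLock -- when needed at all -- before either of them.
#if 0
{
	RecursiveLocker opLocker(sMountOpLock);	// mount/unmount paths only
	ReadLocker vnodeLocker(sVnodeLock);
	ReadLocker mountLocker(sMountLock);
	// ... look up mounts/vnodes; do not call create_sem() while holding
	// sVnodeLock ...
}
#endif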


namespace {

struct vnode_hash_key {
	dev_t	device;
	ino_t	vnode;
};

struct VnodeHash {
	typedef vnode_hash_key	KeyType;
	typedef	struct vnode	ValueType;

#define VHASH(mountid, vnodeid) \
	(((uint32)((vnodeid) >> 32) + (uint32)(vnodeid)) ^ (uint32)(mountid))

	size_t HashKey(KeyType key) const
	{
		return VHASH(key.device, key.vnode);
	}

	size_t Hash(ValueType* vnode) const
	{
		return VHASH(vnode->device, vnode->id);
	}

#undef VHASH

	bool Compare(KeyType key, ValueType* vnode) const
	{
		return vnode->device == key.device && vnode->id == key.vnode;
	}

	ValueType*& GetLink(ValueType* value) const
	{
		return value->next;
	}
};

typedef BOpenHashTable<VnodeHash> VnodeTable;


struct MountHash {
	typedef dev_t			KeyType;
	typedef	struct fs_mount	ValueType;

	size_t HashKey(KeyType key) const
	{
		return key;
	}

	size_t Hash(ValueType* mount) const
	{
		return mount->id;
	}

	bool Compare(KeyType key, ValueType* mount) const
	{
		return mount->id == key;
	}

	ValueType*& GetLink(ValueType* value) const
	{
		return value->next;
	}
};

typedef BOpenHashTable<MountHash> MountTable;

} // namespace


object_cache* sPathNameCache;
object_cache* sVnodeCache;
object_cache* sFileDescriptorCache;

#define VNODE_HASH_TABLE_SIZE 1024
static VnodeTable* sVnodeTable;
static struct vnode* sRoot;

#define MOUNTS_HASH_TABLE_SIZE 16
static MountTable* sMountsTable;
static dev_t sNextMountID = 1;

#define MAX_TEMP_IO_VECS 8

// How long to wait for busy vnodes: up to BUSY_VNODE_RETRIES retries, with a
// BUSY_VNODE_DELAY microsecond snooze between them (10 s in total)
#define BUSY_VNODE_RETRIES 2000
#define BUSY_VNODE_DELAY 5000

mode_t __gUmask = 022;

/* function declarations */

static void free_unused_vnodes();

// file descriptor operation prototypes
static status_t file_read(struct file_descriptor* descriptor, off_t pos,
	void* buffer, size_t* _bytes);
static status_t file_write(struct file_descriptor* descriptor, off_t pos,
	const void* buffer, size_t* _bytes);
static ssize_t file_readv(struct file_descriptor* descriptor, off_t pos,
	const struct iovec *vecs, int count);
static ssize_t file_writev(struct file_descriptor* descriptor, off_t pos,
	const struct iovec *vecs, int count);
static off_t file_seek(struct file_descriptor* descriptor, off_t pos,
	int seekType);
static void file_free_fd(struct file_descriptor* descriptor);
static status_t file_close(struct file_descriptor* descriptor);
static status_t file_select(struct file_descriptor* descriptor, uint8 event,
	struct selectsync* sync);
static status_t file_deselect(struct file_descriptor* descriptor, uint8 event,
	struct selectsync* sync);
static status_t dir_read(struct io_context* context,
	struct file_descriptor* descriptor, struct dirent* buffer,
	size_t bufferSize, uint32* _count);
static status_t dir_read(struct io_context* ioContext, struct vnode* vnode,
	void* cookie, struct dirent* buffer, size_t bufferSize, uint32* _count);
static status_t dir_rewind(struct file_descriptor* descriptor);
static void dir_free_fd(struct file_descriptor* descriptor);
static status_t dir_close(struct file_descriptor* descriptor);
static status_t attr_dir_read(struct io_context* context,
	struct file_descriptor* descriptor, struct dirent* buffer,
	size_t bufferSize, uint32* _count);
static status_t attr_dir_rewind(struct file_descriptor* descriptor);
static void attr_dir_free_fd(struct file_descriptor* descriptor);
static status_t attr_dir_close(struct file_descriptor* descriptor);
static status_t attr_read(struct file_descriptor* descriptor, off_t pos,
	void* buffer, size_t* _bytes);
static status_t attr_write(struct file_descriptor* descriptor, off_t pos,
	const void* buffer, size_t* _bytes);
static off_t attr_seek(struct file_descriptor* descriptor, off_t pos,
	int seekType);
static void attr_free_fd(struct file_descriptor* descriptor);
static status_t attr_close(struct file_descriptor* descriptor);
static status_t attr_read_stat(struct file_descriptor* descriptor,
	struct stat* statData);
static status_t attr_write_stat(struct file_descriptor* descriptor,
	const struct stat* stat, int statMask);
static status_t index_dir_read(struct io_context* context,
	struct file_descriptor* descriptor, struct dirent* buffer,
	size_t bufferSize, uint32* _count);
static status_t index_dir_rewind(struct file_descriptor* descriptor);
static void index_dir_free_fd(struct file_descriptor* descriptor);
static status_t index_dir_close(struct file_descriptor* descriptor);
static status_t query_read(struct io_context* context,
	struct file_descriptor* descriptor, struct dirent* buffer,
	size_t bufferSize, uint32* _count);
static status_t query_rewind(struct file_descriptor* descriptor);
static void query_free_fd(struct file_descriptor* descriptor);
static status_t query_close(struct file_descriptor* descriptor);

static status_t common_ioctl(struct file_descriptor* descriptor, ulong op,
	void* buffer, size_t length);
static status_t common_read_stat(struct file_descriptor* descriptor,
	struct stat* statData);
static status_t common_write_stat(struct file_descriptor* descriptor,
	const struct stat* statData, int statMask);
static status_t common_path_read_stat(int fd, char* path, bool traverseLeafLink,
	struct stat* stat, bool kernel);

static status_t vnode_path_to_vnode(struct vnode* vnode, char* path,
	bool traverseLeafLink, bool kernel,
	VnodePutter& _vnode, ino_t* _parentID, char* leafName = NULL);
static status_t dir_vnode_to_path(struct vnode* vnode, char* buffer,
	size_t bufferSize, bool kernel);
static status_t fd_and_path_to_vnode(int fd, char* path, bool traverseLeafLink,
	VnodePutter& _vnode, ino_t* _parentID, bool kernel);
static void inc_vnode_ref_count(struct vnode* vnode);
static status_t dec_vnode_ref_count(struct vnode* vnode, bool alwaysFree,
	bool reenter);
static inline void put_vnode(struct vnode* vnode);
static status_t fs_unmount(char* path, dev_t mountID, uint32 flags,
	bool kernel);
static int open_vnode(struct vnode* vnode, int openMode, bool kernel);


static struct fd_ops sFileOps = {
	file_close,
	file_free_fd,
	file_read,
	file_write,
	file_readv,
	file_writev,
	file_seek,
	common_ioctl,
	NULL,		// set_flags()
	file_select,
	file_deselect,
	NULL,		// read_dir()
	NULL,		// rewind_dir()
	common_read_stat,
	common_write_stat,
};

static struct fd_ops sDirectoryOps = {
	dir_close,
	dir_free_fd,
	NULL, NULL,	// read(), write()
	NULL, NULL,	// readv(), writev()
	NULL,		// seek()
	common_ioctl,
	NULL,		// set_flags
	NULL,		// select()
	NULL,		// deselect()
	dir_read,
	dir_rewind,
	common_read_stat,
	common_write_stat,
};

static struct fd_ops sAttributeDirectoryOps = {
	attr_dir_close,
	attr_dir_free_fd,
	NULL, NULL,	// read(), write()
	NULL, NULL,	// readv(), writev()
	NULL,		// seek()
	common_ioctl,
	NULL,		// set_flags
	NULL,		// select()
	NULL,		// deselect()
	attr_dir_read,
	attr_dir_rewind,
	common_read_stat,
	common_write_stat,
};

static struct fd_ops sAttributeOps = {
	attr_close,
	attr_free_fd,
	attr_read,
	attr_write,
	NULL,		// readv()
	NULL,		// writev()
	attr_seek,
	common_ioctl,
	NULL,		// set_flags()
	NULL,		// select()
	NULL,		// deselect()
	NULL,		// read_dir()
	NULL,		// rewind_dir()
	attr_read_stat,
	attr_write_stat,
};

static struct fd_ops sIndexDirectoryOps = {
	index_dir_close,
	index_dir_free_fd,
	NULL, NULL,	// read(), write()
	NULL, NULL,	// readv(), writev()
	NULL,		// seek()
	NULL,		// ioctl()
	NULL,		// set_flags()
	NULL,		// select()
	NULL,		// deselect()
	index_dir_read,
	index_dir_rewind,
	NULL,		// read_stat()
	NULL,		// write_stat()
};

#if 0
static struct fd_ops sIndexOps = {
	NULL,		// dir_close()
	NULL,		// free_fd()
	NULL, NULL,	// read(), write()
	NULL, NULL,	// readv(), writev()
	NULL,		// seek()
	NULL,		// ioctl()
	NULL,		// set_flags
	NULL,		// select()
	NULL,		// deselect()
	NULL,		// dir_read()
	NULL,		// dir_rewind()
	index_read_stat,	// read_stat()
	NULL,		// write_stat()
};
#endif

static struct fd_ops sQueryOps = {
	query_close,
	query_free_fd,
	NULL, NULL,	// read(), write()
	NULL, NULL,	// readv(), writev()
	NULL,		// seek()
	NULL,		// ioctl()
	NULL,		// set_flags()
	NULL,		// select()
	NULL,		// deselect()
	query_read,
	query_rewind,
	NULL,		// read_stat()
	NULL,		// write_stat()
};


namespace {

class FDCloser {
public:
	FDCloser() : fFD(-1), fKernel(true) {}

	FDCloser(int fd, bool kernel) : fFD(fd), fKernel(kernel) {}

	~FDCloser()
	{
		Close();
	}

	void SetTo(int fd, bool kernel)
	{
		Close();
		fFD = fd;
		fKernel = kernel;
	}

	void Close()
	{
		if (fFD >= 0) {
			if (fKernel)
				_kern_close(fFD);
			else
				_user_close(fFD);
			fFD = -1;
		}
	}

	int Detach()
	{
		int fd = fFD;
		fFD = -1;
		return fd;
	}

private:
	int		fFD;
	bool	fKernel;
};

} // namespace
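

// Usage sketch for FDCloser (illustrative only, not part of this file;
// some_setup_step() is a hypothetical helper): the descriptor is closed
// automatically on error paths, while Detach() hands ownership back to the
// caller on success.
#if 0
{
	int fd = _kern_open(-1, "/some/path", O_RDONLY, 0);
	if (fd < 0)
		return fd;
	FDCloser fdCloser(fd, true);

	if (some_setup_step() != B_OK)
		return B_ERROR;
		// fdCloser's destructor closes fd here

	return fdCloser.Detach();
		// success: keep the descriptor open and pass it on
}
#endif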


#if VFS_PAGES_IO_TRACING

namespace VFSPagesIOTracing {

class PagesIOTraceEntry : public AbstractTraceEntry {
protected:
	PagesIOTraceEntry(struct vnode* vnode, void* cookie, off_t pos,
		const generic_io_vec* vecs, uint32 count, uint32 flags,
		generic_size_t bytesRequested, status_t status,
		generic_size_t bytesTransferred)
		:
		fVnode(vnode),
		fMountID(vnode->mount->id),
		fNodeID(vnode->id),
		fCookie(cookie),
		fPos(pos),
		fCount(count),
		fFlags(flags),
		fBytesRequested(bytesRequested),
		fStatus(status),
		fBytesTransferred(bytesTransferred)
	{
		fVecs = (generic_io_vec*)alloc_tracing_buffer_memcpy(vecs,
			sizeof(generic_io_vec) * count, false);
	}

	void AddDump(TraceOutput& out, const char* mode)
	{
		out.Print("vfs pages io %5s: vnode: %p (%" B_PRId32 ", %" B_PRId64 "), "
			"cookie: %p, pos: %" B_PRIdOFF ", size: %" B_PRIu64 ", vecs: {",
			mode, fVnode, fMountID, fNodeID, fCookie, fPos,
			(uint64)fBytesRequested);

		if (fVecs != NULL) {
			for (uint32 i = 0; i < fCount; i++) {
				if (i > 0)
					out.Print(", ");
				out.Print("(%" B_PRIx64 ", %" B_PRIu64 ")", (uint64)fVecs[i].base,
					(uint64)fVecs[i].length);
			}
		}

		out.Print("}, flags: %#" B_PRIx32 " -> status: %#" B_PRIx32 ", "
			"transferred: %" B_PRIu64, fFlags, fStatus,
			(uint64)fBytesTransferred);
	}

protected:
	struct vnode*	fVnode;
	dev_t			fMountID;
	ino_t			fNodeID;
	void*			fCookie;
	off_t			fPos;
	generic_io_vec*	fVecs;
	uint32			fCount;
	uint32			fFlags;
	generic_size_t	fBytesRequested;
	status_t		fStatus;
	generic_size_t	fBytesTransferred;
};


class ReadPages : public PagesIOTraceEntry {
public:
	ReadPages(struct vnode* vnode, void* cookie, off_t pos,
		const generic_io_vec* vecs, uint32 count, uint32 flags,
		generic_size_t bytesRequested, status_t status,
		generic_size_t bytesTransferred)
		:
		PagesIOTraceEntry(vnode, cookie, pos, vecs, count, flags,
			bytesRequested, status, bytesTransferred)
	{
		Initialized();
	}

	virtual void AddDump(TraceOutput& out)
	{
		PagesIOTraceEntry::AddDump(out, "read");
	}
};


class WritePages : public PagesIOTraceEntry {
public:
	WritePages(struct vnode* vnode, void* cookie, off_t pos,
		const generic_io_vec* vecs, uint32 count, uint32 flags,
		generic_size_t bytesRequested, status_t status,
		generic_size_t bytesTransferred)
		:
		PagesIOTraceEntry(vnode, cookie, pos, vecs, count, flags,
			bytesRequested, status, bytesTransferred)
	{
		Initialized();
	}

	virtual void AddDump(TraceOutput& out)
	{
		PagesIOTraceEntry::AddDump(out, "write");
	}
};

}	// namespace VFSPagesIOTracing

#	define TPIO(x) new(std::nothrow) VFSPagesIOTracing::x;
#else
#	define TPIO(x) ;
#endif	// VFS_PAGES_IO_TRACING


/*! Finds the mounted device (the fs_mount structure) with the given ID.
	Note: you must hold sMountLock when calling this function.
*/
static struct fs_mount*
find_mount(dev_t id)
{
	ASSERT_READ_LOCKED_RW_LOCK(&sMountLock);

	return sMountsTable->Lookup(id);
}


static status_t
get_mount(dev_t id, struct fs_mount** _mount)
{
	struct fs_mount* mount;

	ReadLocker nodeLocker(sVnodeLock);
	ReadLocker mountLocker(sMountLock);

	mount = find_mount(id);
	if (mount == NULL)
		return B_BAD_VALUE;

	struct vnode* rootNode = mount->root_vnode;
	if (mount->unmounting || rootNode == NULL || rootNode->IsBusy()
		|| rootNode->ref_count == 0) {
		// might have been called during a mount/unmount operation
		return B_BUSY;
	}

	inc_vnode_ref_count(rootNode);
	*_mount = mount;
	return B_OK;
}


static void
put_mount(struct fs_mount* mount)
{
	if (mount)
		put_vnode(mount->root_vnode);
}


/*!	Tries to open the specified file system module.
	Accepts a file system name of the form "bfs" or "file_systems/bfs/v1".
	Returns a pointer to the file system module interface, or NULL if it
	could not open the module.
*/
static file_system_module_info*
get_file_system(const char* fsName)
{
	char name[B_FILE_NAME_LENGTH];
	if (strncmp(fsName, "file_systems/", strlen("file_systems/"))) {
		// construct module name if we didn't get one
		// (we currently support only one API)
		snprintf(name, sizeof(name), "file_systems/%s/v1", fsName);
		fsName = NULL;
	}

	file_system_module_info* info;
	if (get_module(fsName ? fsName : name, (module_info**)&info) != B_OK)
		return NULL;

	return info;
}


/*!	Accepts a file system name of the form "bfs" or "file_systems/bfs/v1"
	and returns a compatible fs_info.fsh_name name ("bfs" in both cases).
	The name is allocated for you, and you have to free() it when you're
	done with it.
	Returns NULL if the required memory is not available.
*/
static char*
get_file_system_name(const char* fsName)
{
	const size_t length = strlen("file_systems/");

	if (strncmp(fsName, "file_systems/", length)) {
		// the name already seems to be the module's file name
		return strdup(fsName);
	}

	fsName += length;
	const char* end = strchr(fsName, '/');
	if (end == NULL) {
		// this doesn't seem to be a valid name, but well...
		return strdup(fsName);
	}

	// cut off the trailing /v1

	char* name = (char*)malloc(end + 1 - fsName);
	if (name == NULL)
		return NULL;

	strlcpy(name, fsName, end + 1 - fsName);
	return name;
}
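

// Worked example (illustrative only): both of the following calls return a
// malloc()ed "bfs" that the caller has to free().
#if 0
	char* name1 = get_file_system_name("bfs");
	char* name2 = get_file_system_name("file_systems/bfs/v1");
	// name1 and name2 both contain "bfs"
	free(name1);
	free(name2);
#endif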


/*!	Accepts a list of file system names separated by a colon, one for each
	layer and returns the file system name for the specified layer.
	The name is allocated for you, and you have to free() it when you're
	done with it.
	Returns NULL if the required memory is not available or if there is no
	name for the specified layer.
*/
static char*
get_file_system_name_for_layer(const char* fsNames, int32 layer)
{
	while (layer >= 0) {
		const char* end = strchr(fsNames, ':');
		if (end == NULL) {
			if (layer == 0)
				return strdup(fsNames);
			return NULL;
		}

		if (layer == 0) {
			size_t length = end - fsNames + 1;
			char* result = (char*)malloc(length);
			if (result == NULL)
				return NULL;
			strlcpy(result, fsNames, length);
			return result;
		}

		fsNames = end + 1;
		layer--;
	}

	return NULL;
}
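

// Worked example (illustrative only): for fsNames = "zfs:bfs", layer 0
// yields "zfs", layer 1 yields "bfs", and layer 2 yields NULL (each
// returned name is malloc()ed and has to be free()d by the caller).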


static void
add_vnode_to_mount_list(struct vnode* vnode, struct fs_mount* mount)
{
	MutexLocker _(mount->lock);
	mount->vnodes.Add(vnode);
}


static void
remove_vnode_from_mount_list(struct vnode* vnode, struct fs_mount* mount)
{
	MutexLocker _(mount->lock);
	mount->vnodes.Remove(vnode);
}


/*!	\brief Looks up a vnode by mount and node ID in the sVnodeTable.

	The caller must hold the sVnodeLock (read lock at least).

	\param mountID the mount ID.
	\param vnodeID the node ID.

	\return The vnode structure, if it was found in the hash table, \c NULL
			otherwise.
*/
static struct vnode*
lookup_vnode(dev_t mountID, ino_t vnodeID)
{
	ASSERT_READ_LOCKED_RW_LOCK(&sVnodeLock);

	struct vnode_hash_key key;

	key.device = mountID;
	key.vnode = vnodeID;

	return sVnodeTable->Lookup(key);
}


/*!	\brief Checks whether or not a busy vnode should be waited for (again).

	This will also wait for BUSY_VNODE_DELAY before returning, if one should
	still wait for the vnode to become unbusy.

	\return \c true if one should retry, \c false if not.
*/
static bool
retry_busy_vnode(int32& tries, dev_t mountID, ino_t vnodeID)
{
	if (--tries < 0) {
		// vnode doesn't seem to become unbusy
		dprintf("vnode %" B_PRIdDEV ":%" B_PRIdINO
			" is not becoming unbusy!\n", mountID, vnodeID);
		return false;
	}
	snooze(BUSY_VNODE_DELAY);
	return true;
}
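

// Usage sketch (illustrative only), mirroring the retry loop in get_vnode()
// below; locking is elided here -- sVnodeLock must be held for
// lookup_vnode() and dropped again before waiting.
#if 0
	int32 tries = BUSY_VNODE_RETRIES;
	while (true) {
		struct vnode* vnode = lookup_vnode(mountID, vnodeID);
		if (vnode == NULL || !vnode->IsBusy())
			break;
		if (!retry_busy_vnode(tries, mountID, vnodeID))
			return B_BUSY;
	}
#endif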


/*!	Creates a new vnode with the given mount and node ID.
	If the node already exists, it is returned instead and no new node is
	created. In either case -- though not if an error occurs -- the function
	write-locks \c sVnodeLock and keeps it locked for the caller when
	returning. On error the lock is not held on return.

	\param mountID The mount ID.
	\param vnodeID The vnode ID.
	\param _vnode Will be set to the new vnode on success.
	\param _nodeCreated Will be set to \c true when the returned vnode has
		been newly created, \c false when it already existed. Will not be
		changed on error.
	\return \c B_OK, when the vnode was successfully created and inserted or
		a node with the given ID was found, \c B_NO_MEMORY or
		\c B_ENTRY_NOT_FOUND on error.
*/
static status_t
create_new_vnode_and_lock(dev_t mountID, ino_t vnodeID, struct vnode*& _vnode,
	bool& _nodeCreated)
{
	FUNCTION(("create_new_vnode_and_lock()\n"));

	struct vnode* vnode = (struct vnode*)object_cache_alloc(sVnodeCache, 0);
	if (vnode == NULL)
		return B_NO_MEMORY;

	// initialize basic values
	memset(vnode, 0, sizeof(struct vnode));
	vnode->device = mountID;
	vnode->id = vnodeID;
	vnode->ref_count = 1;
	vnode->SetBusy(true);

	// look up the node -- it might have been added by someone else in the
	// meantime
	rw_lock_write_lock(&sVnodeLock);
	struct vnode* existingVnode = lookup_vnode(mountID, vnodeID);
	if (existingVnode != NULL) {
		object_cache_free(sVnodeCache, vnode, 0);
		_vnode = existingVnode;
		_nodeCreated = false;
		return B_OK;
	}

	// get the mount structure
	rw_lock_read_lock(&sMountLock);
	vnode->mount = find_mount(mountID);
	if (!vnode->mount || vnode->mount->unmounting) {
		rw_lock_read_unlock(&sMountLock);
		rw_lock_write_unlock(&sVnodeLock);
		object_cache_free(sVnodeCache, vnode, 0);
		return B_ENTRY_NOT_FOUND;
	}

	// add the vnode to the mount's node list and the hash table
	sVnodeTable->Insert(vnode);
	add_vnode_to_mount_list(vnode, vnode->mount);

	rw_lock_read_unlock(&sMountLock);

	_vnode = vnode;
	_nodeCreated = true;

	// keep the vnode lock locked
	return B_OK;
}
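

// Caller sketch (illustrative only), as in get_vnode() below: on success,
// create_new_vnode_and_lock() returns with sVnodeLock write-locked, so the
// caller has to unlock it again on every path.
#if 0
	struct vnode* vnode;
	bool nodeCreated;
	status_t status = create_new_vnode_and_lock(mountID, vnodeID, vnode,
		nodeCreated);
	if (status != B_OK)
		return status;

	// ... inspect the existing node or set up the new (busy) one ...

	rw_lock_write_unlock(&sVnodeLock);
#endif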


/*!	Frees the vnode and all resources it has acquired, and removes
	it from the vnode hash as well as from its mount structure.
	Will also make sure that any cache modifications are written back.
*/
static void
free_vnode(struct vnode* vnode, bool reenter)
{
	ASSERT_PRINT(vnode->ref_count == 0 && vnode->IsBusy(), "vnode: %p\n",
		vnode);
	ASSERT_PRINT(vnode->advisory_locking == NULL, "vnode: %p\n", vnode);

	// write back any changes in this vnode's cache -- but only
	// if the vnode won't be deleted, in which case the changes
	// will be discarded

	if (!vnode->IsRemoved() && HAS_FS_CALL(vnode, fsync))
		FS_CALL_NO_PARAMS(vnode, fsync);

	// Note: If this vnode has a cache attached, there will still be two
	// references to that cache at this point. The last one belongs to the vnode
	// itself (cf. vfs_get_vnode_cache()) and one belongs to the node's file
	// cache. Each but the last reference to a cache also includes a reference
	// to the vnode. The file cache, however, released its reference (cf.
	// file_cache_create()), so that this vnode's ref count has the chance to
	// ever drop to 0. Deleting the file cache now will cause the next to last
	// cache reference to be released, which will also release a (no longer
	// existing) vnode reference. To avoid problems, we set the vnode's ref
	// count, so that it will neither become negative nor 0.
	vnode->ref_count = 2;

	if (!vnode->IsUnpublished()) {
		if (vnode->IsRemoved())
			FS_CALL(vnode, remove_vnode, reenter);
		else
			FS_CALL(vnode, put_vnode, reenter);
	}

	// If the vnode has a VMCache attached, make sure that it won't try to get
	// another reference via VMVnodeCache::AcquireUnreferencedStoreRef(). As
	// long as the vnode is busy and in the hash, that won't happen, but as
	// soon as we've removed it from the hash, it could reload the vnode -- with
	// a new cache attached!
	if (vnode->cache != NULL && vnode->cache->type == CACHE_TYPE_VNODE)
		((VMVnodeCache*)vnode->cache)->VnodeDeleted();

	// The file system has removed the resources of the vnode now, so we can
	// make it available again (by removing the busy vnode from the hash).
	rw_lock_write_lock(&sVnodeLock);
	sVnodeTable->Remove(vnode);
	rw_lock_write_unlock(&sVnodeLock);

	// if we have a VMCache attached, remove it
	if (vnode->cache)
		vnode->cache->ReleaseRef();

	vnode->cache = NULL;

	remove_vnode_from_mount_list(vnode, vnode->mount);

	object_cache_free(sVnodeCache, vnode, 0);
}


/*!	\brief Decrements the reference counter of the given vnode and deletes it,
	if the counter dropped to 0.

	The caller must, of course, own a reference to the vnode to call this
	function.
	The caller must not hold the sVnodeLock or the sMountLock.

	\param vnode the vnode.
	\param alwaysFree don't move this vnode into the unused list, but really
		   delete it if possible.
	\param reenter \c true, if this function is called (indirectly) from within
		   a file system. This will be passed to file system hooks only.
	\return \c B_OK, if everything went fine, an error code otherwise.
*/
static status_t
dec_vnode_ref_count(struct vnode* vnode, bool alwaysFree, bool reenter)
{
	ReadLocker locker(sVnodeLock);
	AutoLocker<Vnode> nodeLocker(vnode);

	int32 oldRefCount = atomic_add(&vnode->ref_count, -1);

	ASSERT_PRINT(oldRefCount > 0, "vnode %p\n", vnode);

	TRACE(("dec_vnode_ref_count: vnode %p, ref now %" B_PRId32 "\n", vnode,
		vnode->ref_count));

	if (oldRefCount != 1)
		return B_OK;

	if (vnode->IsBusy())
		panic("dec_vnode_ref_count: called on busy vnode %p\n", vnode);

	bool freeNode = false;
	bool freeUnusedNodes = false;

	// Just insert the vnode into an unused list if we don't need
	// to delete it
	if (vnode->IsRemoved() || alwaysFree) {
		vnode_to_be_freed(vnode);
		vnode->SetBusy(true);
		freeNode = true;
	} else
		freeUnusedNodes = vnode_unused(vnode);

	nodeLocker.Unlock();
	locker.Unlock();

	if (freeNode)
		free_vnode(vnode, reenter);
	else if (freeUnusedNodes)
		free_unused_vnodes();

	return B_OK;
}


/*!	\brief Increments the reference counter of the given vnode.

	The caller must make sure that the node isn't deleted while this function
	is called. This can be done either:
	- by ensuring that a reference to the node exists and remains in existence,
	  or
	- by holding the vnode's lock (which also requires read locking sVnodeLock)
	  or by holding sVnodeLock write locked.

	In the second case the caller is responsible for dealing with the ref count
	0 -> 1 transition. That is 1. this function must not be invoked when the
	node is busy in the first place and 2. vnode_used() must be called for the
	node.

	\param vnode the vnode.
*/
static void
inc_vnode_ref_count(struct vnode* vnode)
{
	atomic_add(&vnode->ref_count, 1);
	TRACE(("inc_vnode_ref_count: vnode %p, ref now %" B_PRId32 "\n", vnode,
		vnode->ref_count));
}


static bool
is_special_node_type(int type)
{
	// at the moment only FIFOs are supported
	return S_ISFIFO(type);
}


static status_t
create_special_sub_node(struct vnode* vnode, uint32 flags)
{
	if (S_ISFIFO(vnode->Type()))
		return create_fifo_vnode(vnode->mount->volume, vnode);

	return B_BAD_VALUE;
}

/*!	\brief Retrieves a vnode for a given mount ID, node ID pair.

	If the node is not yet in memory, it will be loaded.

	The caller must not hold the sVnodeLock or the sMountLock.

	\param mountID the mount ID.
	\param vnodeID the node ID.
	\param _vnode Pointer to a vnode* variable into which the pointer to the
		   retrieved vnode structure shall be written.
	\param canWait \c true, if the function is allowed to wait for a busy
		   vnode to become unbusy; otherwise \c B_BUSY is returned right away.
	\param reenter \c true, if this function is called (indirectly) from within
		   a file system.
	\return \c B_OK, if everything went fine, an error code otherwise.
*/
static status_t
get_vnode(dev_t mountID, ino_t vnodeID, struct vnode** _vnode, bool canWait,
	int reenter)
{
	FUNCTION(("get_vnode: mountid %" B_PRId32 " vnid 0x%" B_PRIx64 " %p\n",
		mountID, vnodeID, _vnode));

	rw_lock_read_lock(&sVnodeLock);

	int32 tries = BUSY_VNODE_RETRIES;
restart:
	struct vnode* vnode = lookup_vnode(mountID, vnodeID);
	AutoLocker<Vnode> nodeLocker(vnode);

	if (vnode && vnode->IsBusy()) {
		// vnodes in the Removed state (except ones still Unpublished)
		// which are also Busy will disappear soon, so we do not wait for them.
		const bool doNotWait = vnode->IsRemoved() && !vnode->IsUnpublished();

		nodeLocker.Unlock();
		rw_lock_read_unlock(&sVnodeLock);
		if (!canWait) {
			dprintf("vnode %" B_PRIdDEV ":%" B_PRIdINO " is busy!\n",
				mountID, vnodeID);
			return B_BUSY;
		}
		if (doNotWait || !retry_busy_vnode(tries, mountID, vnodeID))
			return B_BUSY;

		rw_lock_read_lock(&sVnodeLock);
		goto restart;
	}

	TRACE(("get_vnode: tried to lookup vnode, got %p\n", vnode));

	status_t status;

	if (vnode) {
		if (vnode->ref_count == 0) {
			// this vnode has been unused before
			vnode_used(vnode);
		}
		inc_vnode_ref_count(vnode);

		nodeLocker.Unlock();
		rw_lock_read_unlock(&sVnodeLock);
	} else {
		// we need to create a new vnode and read it in
		rw_lock_read_unlock(&sVnodeLock);
			// unlock -- create_new_vnode_and_lock() write-locks on success
		bool nodeCreated;
		status = create_new_vnode_and_lock(mountID, vnodeID, vnode,
			nodeCreated);
		if (status != B_OK)
			return status;

		if (!nodeCreated) {
			rw_lock_read_lock(&sVnodeLock);
			rw_lock_write_unlock(&sVnodeLock);
			goto restart;
		}

		rw_lock_write_unlock(&sVnodeLock);

		int type;
		uint32 flags;
		status = FS_MOUNT_CALL(vnode->mount, get_vnode, vnodeID, vnode, &type,
			&flags, reenter);
		if (status == B_OK && vnode->private_node == NULL)
			status = B_BAD_VALUE;

		bool gotNode = status == B_OK;
		bool publishSpecialSubNode = false;
		if (gotNode) {
			vnode->SetType(type);
			publishSpecialSubNode = is_special_node_type(type)
				&& (flags & B_VNODE_DONT_CREATE_SPECIAL_SUB_NODE) == 0;
		}

		if (gotNode && publishSpecialSubNode)
			status = create_special_sub_node(vnode, flags);

		if (status != B_OK) {
			if (gotNode)
				FS_CALL(vnode, put_vnode, reenter);

			rw_lock_write_lock(&sVnodeLock);
			sVnodeTable->Remove(vnode);
			remove_vnode_from_mount_list(vnode, vnode->mount);
			rw_lock_write_unlock(&sVnodeLock);

			object_cache_free(sVnodeCache, vnode, 0);
			return status;
		}

		rw_lock_read_lock(&sVnodeLock);
		vnode->Lock();

		vnode->SetRemoved((flags & B_VNODE_PUBLISH_REMOVED) != 0);
		vnode->SetBusy(false);

		vnode->Unlock();
		rw_lock_read_unlock(&sVnodeLock);
	}

	TRACE(("get_vnode: returning %p\n", vnode));

	*_vnode = vnode;
	return B_OK;
}
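

// Usage sketch (illustrative only): every successful get_vnode() has to be
// balanced with a put_vnode() (see below) once the caller is done with the
// node.
#if 0
	struct vnode* vnode;
	status_t status = get_vnode(mountID, vnodeID, &vnode, true, false);
	if (status != B_OK)
		return status;

	// ... use vnode ...

	put_vnode(vnode);
#endif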


/*!	\brief Decrements the reference counter of the given vnode and deletes it,
	if the counter dropped to 0.

	The caller must, of course, own a reference to the vnode to call this
	function.
	The caller must not hold the sVnodeLock or the sMountLock.

	\param vnode the vnode.
*/
static inline void
put_vnode(struct vnode* vnode)
{
	dec_vnode_ref_count(vnode, false, false);
}


static void
free_unused_vnodes(int32 level)
{
	unused_vnodes_check_started();

	if (level == B_NO_LOW_RESOURCE) {
		unused_vnodes_check_done();
		return;
	}

	flush_hot_vnodes();

	// determine how many nodes to free
	uint32 count = 1;
	{
		MutexLocker unusedVnodesLocker(sUnusedVnodesLock);

		switch (level) {
			case B_LOW_RESOURCE_NOTE:
				count = sUnusedVnodes / 100;
				break;
			case B_LOW_RESOURCE_WARNING:
				count = sUnusedVnodes / 10;
				break;
			case B_LOW_RESOURCE_CRITICAL:
				count = sUnusedVnodes;
				break;
		}

		if (count > sUnusedVnodes)
			count = sUnusedVnodes;
	}

	// Write back the modified pages of some unused vnodes and free them.

	for (uint32 i = 0; i < count; i++) {
		ReadLocker vnodesReadLocker(sVnodeLock);

		// get the first node
		MutexLocker unusedVnodesLocker(sUnusedVnodesLock);
		struct vnode* vnode = (struct vnode*)list_get_first_item(
			&sUnusedVnodeList);
		unusedVnodesLocker.Unlock();

		if (vnode == NULL)
			break;

		// lock the node
		AutoLocker<Vnode> nodeLocker(vnode);

		// Check whether the node is still unused -- since we only append to the
		// tail of the unused queue, the vnode should still be at its head.
		// Alternatively we could check its ref count for 0 and its busy flag,
		// but if the node is no longer at the head of the queue, it means it
		// has been touched in the meantime, i.e. it is no longer the least
		// recently used unused vnode and we rather don't free it.
		unusedVnodesLocker.Lock();
		if (vnode != list_get_first_item(&sUnusedVnodeList))
			continue;
		unusedVnodesLocker.Unlock();

		ASSERT(!vnode->IsBusy());

		// grab a reference
		inc_vnode_ref_count(vnode);
		vnode_used(vnode);

		// write back changes and free the node
		nodeLocker.Unlock();
		vnodesReadLocker.Unlock();

		if (vnode->cache != NULL)
			vnode->cache->WriteModified();

		dec_vnode_ref_count(vnode, true, false);
			// this should free the vnode when it's still unused
	}

	unused_vnodes_check_done();
}


/*!	Gets the vnode the given vnode is covering.

	The caller must have \c sVnodeLock read-locked at least.

	The function returns a reference to the retrieved vnode (if any); the
	caller is responsible for releasing it.

	\param vnode The vnode whose covered node shall be returned.
	\return The covered vnode, or \c NULL if the given vnode doesn't cover any
		vnode.
*/
static inline Vnode*
get_covered_vnode_locked(Vnode* vnode)
{
	if (Vnode* coveredNode = vnode->covers) {
		while (coveredNode->covers != NULL)
			coveredNode = coveredNode->covers;

		inc_vnode_ref_count(coveredNode);
		return coveredNode;
	}

	return NULL;
}


/*!	Gets the vnode the given vnode is covering.

	The caller must not hold \c sVnodeLock. Note that this implies a race
	condition, since the situation can change at any time.

	The function returns a reference to the retrieved vnode (if any); the
	caller is responsible for releasing it.

	\param vnode The vnode whose covered node shall be returned.
	\return The covered vnode, or \c NULL if the given vnode doesn't cover any
		vnode.
*/
static inline Vnode*
get_covered_vnode(Vnode* vnode)
{
	if (!vnode->IsCovering())
		return NULL;

	ReadLocker vnodeReadLocker(sVnodeLock);
	return get_covered_vnode_locked(vnode);
}


/*!	Gets the vnode the given vnode is covered by.

	The caller must have \c sVnodeLock read-locked at least.

	The function returns a reference to the retrieved vnode (if any); the
	caller is responsible for releasing it.

	\param vnode The vnode whose covering node shall be returned.
	\return The covering vnode, or \c NULL if the given vnode isn't covered by
		any vnode.
*/
static Vnode*
get_covering_vnode_locked(Vnode* vnode)
{
	if (Vnode* coveringNode = vnode->covered_by) {
		while (coveringNode->covered_by != NULL)
			coveringNode = coveringNode->covered_by;

		inc_vnode_ref_count(coveringNode);
		return coveringNode;
	}

	return NULL;
}


/*!	Gets the vnode the given vnode is covered by.

	The caller must not hold \c sVnodeLock. Note that this implies a race
	condition, since the situation can change at any time.

	The function returns a reference to the retrieved vnode (if any); the
	caller is responsible for releasing it.

	\param vnode The vnode whose covering node shall be returned.
	\return The covering vnode, or \c NULL if the given vnode isn't covered by
		any vnode.
*/
static inline Vnode*
get_covering_vnode(Vnode* vnode)
{
	if (!vnode->IsCovered())
		return NULL;

	ReadLocker vnodeReadLocker(sVnodeLock);
	return get_covering_vnode_locked(vnode);
}


static void
free_unused_vnodes()
{
	free_unused_vnodes(
		low_resource_state(B_KERNEL_RESOURCE_PAGES | B_KERNEL_RESOURCE_MEMORY
			| B_KERNEL_RESOURCE_ADDRESS_SPACE));
}


static void
vnode_low_resource_handler(void* /*data*/, uint32 resources, int32 level)
{
	TRACE(("vnode_low_resource_handler(level = %" B_PRId32 ")\n", level));

	free_unused_vnodes(level);
}


static inline void
put_advisory_locking(struct advisory_locking* locking)
{
	release_sem(locking->lock);
}


/*!	Returns the advisory_locking object of the \a vnode in case it
	has one, and locks it.
	You have to call put_advisory_locking() when you're done with
	it.
	Note: you must not have the vnode mutex locked when calling
	this function.
*/
static struct advisory_locking*
get_advisory_locking(struct vnode* vnode)
{
	rw_lock_read_lock(&sVnodeLock);
	vnode->Lock();

	struct advisory_locking* locking = vnode->advisory_locking;
	sem_id lock = locking != NULL ? locking->lock : B_ERROR;

	vnode->Unlock();
	rw_lock_read_unlock(&sVnodeLock);

	if (lock >= 0)
		lock = acquire_sem(lock);
	if (lock < 0) {
		// This means the locking has been deleted in the meantime
		// or had never existed in the first place - otherwise, we
		// would get the lock at some point.
		return NULL;
	}

	return locking;
}
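

// Usage sketch (illustrative only): pair every successful
// get_advisory_locking() with a put_advisory_locking() to release the
// object's lock again.
#if 0
	struct advisory_locking* locking = get_advisory_locking(vnode);
	if (locking != NULL) {
		// ... inspect or manipulate locking->locks ...
		put_advisory_locking(locking);
	}
#endif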


/*!	Creates a locked advisory_locking object, and attaches it to the
	given \a vnode.
	Returns B_OK in case of success - even if the vnode got such an
	object from someone else in the meantime, you'll still get this
	one locked then.
*/
static status_t
create_advisory_locking(struct vnode* vnode)
{
	if (vnode == NULL)
		return B_FILE_ERROR;

	ObjectDeleter<advisory_locking> lockingDeleter;
	struct advisory_locking* locking = NULL;

	while (get_advisory_locking(vnode) == NULL) {
		// no locking object set on the vnode yet, create one
		if (locking == NULL) {
			locking = new(std::nothrow) advisory_locking;
			if (locking == NULL)
				return B_NO_MEMORY;
			lockingDeleter.SetTo(locking);

			locking->wait_sem = create_sem(0, "advisory lock");
			if (locking->wait_sem < 0)
				return locking->wait_sem;

			locking->lock = create_sem(0, "advisory locking");
			if (locking->lock < 0)
				return locking->lock;
		}

		// set our newly created locking object
		ReadLocker _(sVnodeLock);
		AutoLocker<Vnode> nodeLocker(vnode);
		if (vnode->advisory_locking == NULL) {
			vnode->advisory_locking = locking;
			lockingDeleter.Detach();
			return B_OK;
		}
	}

	// The vnode already had a locking object. That's just as well.

	return B_OK;
}


/*! Returns \c true when either \a flock is \c NULL or the \a flock intersects
	with the advisory_lock \a lock.
*/
static bool
advisory_lock_intersects(struct advisory_lock* lock, struct flock* flock)
{
	if (flock == NULL)
		return true;

	return lock->start <= flock->l_start - 1 + flock->l_len
		&& lock->end >= flock->l_start;
}
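

// Worked example (illustrative only): an advisory_lock covering offsets
// [100, 199] (start = 100, end = 199) intersects an flock with
// l_start = 150 and l_len = 100, i.e. [150, 249]:
// 100 <= 150 - 1 + 100 (= 249) and 199 >= 150 both hold.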


/*!	Tests whether acquiring a lock would block.
*/
static status_t
test_advisory_lock(struct vnode* vnode, struct flock* flock)
{
	flock->l_type = F_UNLCK;

	struct advisory_locking* locking = get_advisory_locking(vnode);
	if (locking == NULL)
		return B_OK;

	team_id team = team_get_current_team_id();

	LockList::Iterator iterator = locking->locks.GetIterator();
	while (iterator.HasNext()) {
		struct advisory_lock* lock = iterator.Next();

		if (lock->team != team && advisory_lock_intersects(lock, flock)) {
			// locks do overlap
			if (flock->l_type != F_RDLCK || !lock->shared) {
				// collision
				flock->l_type = lock->shared ? F_RDLCK : F_WRLCK;
				flock->l_whence = SEEK_SET;
				flock->l_start = lock->start;
				flock->l_len = lock->end - lock->start + 1;
				flock->l_pid = lock->team;
				break;
			}
		}
	}

	put_advisory_locking(locking);
	return B_OK;
}


/*!	Removes the specified lock, or all locks of the calling team
	if \a flock is NULL.
*/
static status_t
release_advisory_lock(struct vnode* vnode, struct io_context* context,
	struct file_descriptor* descriptor, struct flock* flock)
{
	FUNCTION(("release_advisory_lock(vnode = %p, flock = %p)\n", vnode, flock));

	struct advisory_locking* locking = get_advisory_locking(vnode);
	if (locking == NULL)
		return B_OK;

	// find matching lock entries

	LockList::Iterator iterator = locking->locks.GetIterator();
	while (iterator.HasNext()) {
		struct advisory_lock* lock = iterator.Next();
		bool removeLock = false;

		if (descriptor != NULL && lock->bound_to == descriptor) {
			// Remove flock() locks
			removeLock = true;
		} else if (lock->bound_to == context
				&& advisory_lock_intersects(lock, flock)) {
			// Remove POSIX locks
			bool endsBeyond = false;
			bool startsBefore = false;
			if (flock != NULL) {
				startsBefore = lock->start < flock->l_start;
				endsBeyond = lock->end > flock->l_start - 1 + flock->l_len;
			}

			if (!startsBefore && !endsBeyond) {
				// lock is completely contained in flock
				removeLock = true;
			} else if (startsBefore && !endsBeyond) {
				// cut the end of the lock
				lock->end = flock->l_start - 1;
			} else if (!startsBefore && endsBeyond) {
				// cut the start of the lock
				lock->start = flock->l_start + flock->l_len;
			} else {
				// divide the lock into two locks
				struct advisory_lock* secondLock
					= new(std::nothrow) advisory_lock;
				if (secondLock == NULL) {
					// TODO: we should probably revert the locks we already
					// changed... (ie. allocate upfront)
					put_advisory_locking(locking);
					return B_NO_MEMORY;
				}

				const off_t originalEnd = lock->end;
				lock->end = flock->l_start - 1;

				secondLock->bound_to = context;
				secondLock->team = lock->team;
				secondLock->session = lock->session;
				// values must already be normalized when getting here
				secondLock->start = flock->l_start + flock->l_len;
				secondLock->end = originalEnd;
				secondLock->shared = lock->shared;

				locking->locks.Add(secondLock);
			}
		}

		if (removeLock) {
			// this lock is no longer used
			iterator.Remove();
			delete lock;
		}
	}

	bool removeLocking = locking->locks.IsEmpty();
	release_sem_etc(locking->wait_sem, 1, B_RELEASE_ALL);

	put_advisory_locking(locking);

	if (removeLocking) {
		// We can remove the whole advisory locking structure; it's no
		// longer used
		locking = get_advisory_locking(vnode);
		if (locking != NULL) {
			ReadLocker locker(sVnodeLock);
			AutoLocker<Vnode> nodeLocker(vnode);

			// the locking could have been changed in the meantime
			if (locking->locks.IsEmpty()) {
				vnode->advisory_locking = NULL;
				nodeLocker.Unlock();
				locker.Unlock();

				// we've detached the locking from the vnode, so we can
				// safely delete it
				delete locking;
			} else {
				// the locking is in use again
				nodeLocker.Unlock();
				locker.Unlock();
				release_sem_etc(locking->lock, 1, B_DO_NOT_RESCHEDULE);
			}
		}
	}

	return B_OK;
}


/*!	Acquires an advisory lock for the \a vnode. If \a wait is \c true, it
	will wait for the lock to become available, if there are any collisions
	(if \a wait is \c false, it returns \c B_WOULD_BLOCK for flock() style
	locks and \c B_PERMISSION_DENIED for POSIX locks instead).

	If \a descriptor is NULL, POSIX semantics are used for this lock. Otherwise,
	BSD flock() semantics are used, that is, all children can unlock the file
	in question (we even allow parents to remove the lock, though, which seems
	to be in line with what the BSDs do).
*/
static status_t
acquire_advisory_lock(struct vnode* vnode, io_context* context,
	struct file_descriptor* descriptor, struct flock* flock, bool wait)
{
	FUNCTION(("acquire_advisory_lock(vnode = %p, flock = %p, wait = %s)\n",
		vnode, flock, wait ? "yes" : "no"));

	bool shared = flock->l_type == F_RDLCK;
	void* boundTo = descriptor != NULL ? (void*)descriptor : (void*)context;
	status_t status = B_OK;

	// TODO: do deadlock detection!

	struct advisory_locking* locking;

	while (true) {
		// if this vnode has an advisory_locking structure attached,
		// lock that one and search for any colliding file lock
		status = create_advisory_locking(vnode);
		if (status != B_OK)
			return status;

		locking = vnode->advisory_locking;
		team_id team = team_get_current_team_id();
		sem_id waitForLock = -1;

		// test for collisions
		LockList::Iterator iterator = locking->locks.GetIterator();
		while (iterator.HasNext()) {
			struct advisory_lock* lock = iterator.Next();

			// TODO: locks from the same team might be joinable!
			if ((lock->team != team || lock->bound_to != boundTo)
					&& advisory_lock_intersects(lock, flock)) {
				// locks do overlap
				if (!shared || !lock->shared) {
					// we need to wait
					waitForLock = locking->wait_sem;
					break;
				}
			}
		}

		if (waitForLock < 0)
			break;

		// We need to wait. Do that or fail now, if we've been asked not to.

		if (!wait) {
			put_advisory_locking(locking);
			return descriptor != NULL ? B_WOULD_BLOCK : B_PERMISSION_DENIED;
		}

		status = switch_sem_etc(locking->lock, waitForLock, 1,
			B_CAN_INTERRUPT, 0);
		if (status != B_OK && status != B_BAD_SEM_ID)
			return status;

		// We have been notified, but we need to re-lock the locking object. So
		// go another round...
	}

	// install new lock

	struct advisory_lock* lock = new(std::nothrow) advisory_lock;
	if (lock == NULL) {
		put_advisory_locking(locking);
		return B_NO_MEMORY;
	}

	lock->bound_to = boundTo;
	lock->team = team_get_current_team_id();
	lock->session = thread_get_current_thread()->team->session_id;
	// values must already be normalized when getting here
	lock->start = flock->l_start;
	lock->end = flock->l_start - 1 + flock->l_len;
	lock->shared = shared;

	locking->locks.Add(lock);
	put_advisory_locking(locking);

	return status;
}


/*!	Normalizes the \a flock structure to make it easier to compare the
	structure with others. The l_start and l_len fields are set to absolute
	values according to the l_whence field.
*/
static status_t
normalize_flock(struct file_descriptor* descriptor, struct flock* flock)
{
	switch (flock->l_whence) {
		case SEEK_SET:
			break;
		case SEEK_CUR:
			flock->l_start += descriptor->pos;
			break;
		case SEEK_END:
		{
			struct vnode* vnode = descriptor->u.vnode;
			struct stat stat;
			status_t status;

			if (!HAS_FS_CALL(vnode, read_stat))
				return B_UNSUPPORTED;

			status = FS_CALL(vnode, read_stat, &stat);
			if (status != B_OK)
				return status;

			flock->l_start += stat.st_size;
			break;
		}
		default:
			return B_BAD_VALUE;
	}

	if (flock->l_start < 0)
		flock->l_start = 0;
	if (flock->l_len == 0)
		flock->l_len = OFF_MAX;

	// don't let the offset and length overflow
	if (flock->l_start > 0 && OFF_MAX - flock->l_start < flock->l_len)
		flock->l_len = OFF_MAX - flock->l_start;

	if (flock->l_len < 0) {
		// a negative length reverses the region
		flock->l_start += flock->l_len;
		flock->l_len = -flock->l_len;
	}

	return B_OK;
}
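

// Worked example (illustrative only): with l_whence = SEEK_CUR,
// descriptor->pos = 1000, l_start = -200 and l_len = 0, normalization
// yields l_start = 800 and l_len = OFF_MAX - 800, i.e. an "until EOF"
// region starting at offset 800. With SEEK_SET, l_start = 500 and
// l_len = -100, the negative length reverses the region to l_start = 400,
// l_len = 100, i.e. offsets [400, 499].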


static void
replace_vnode_if_disconnected(struct fs_mount* mount,
	struct vnode* vnodeToDisconnect, struct vnode*& vnode,
	struct vnode* fallBack, bool lockRootLock)
{
	struct vnode* givenVnode = vnode;
	bool vnodeReplaced = false;

	ReadLocker vnodeReadLocker(sVnodeLock);

	if (lockRootLock)
		mutex_lock(&sIOContextRootLock);

	while (vnode != NULL && vnode->mount == mount
		&& (vnodeToDisconnect == NULL || vnodeToDisconnect == vnode)) {
		if (vnode->covers != NULL) {
			// redirect the vnode to the covered vnode
			vnode = vnode->covers;
		} else
			vnode = fallBack;

		vnodeReplaced = true;
	}

	// If we've replaced the node, grab a reference for the new one.
	if (vnodeReplaced && vnode != NULL)
		inc_vnode_ref_count(vnode);

	if (lockRootLock)
		mutex_unlock(&sIOContextRootLock);

	vnodeReadLocker.Unlock();

	if (vnodeReplaced)
		put_vnode(givenVnode);
}


/*!	Disconnects all file descriptors that are associated with the
	\a vnodeToDisconnect, or if this is NULL, all vnodes of the specified
	\a mount object.

	Note: after you've called this function, there might still be ongoing
	accesses -- they won't be interrupted if they were already in progress.
	However, any subsequent access will fail.

	This is not a cheap function and should be used with care and rarely.
	TODO: there is currently no means to stop a blocking read/write!
*/
static void
disconnect_mount_or_vnode_fds(struct fs_mount* mount,
	struct vnode* vnodeToDisconnect)
{
	// iterate over all teams and peek into their file descriptors
	TeamListIterator teamIterator;
	while (Team* team = teamIterator.Next()) {
		BReference<Team> teamReference(team, true);
		TeamLocker teamLocker(team);

		// lock the I/O context
		io_context* context = team->io_context;
		if (context == NULL)
			continue;
		MutexLocker contextLocker(context->io_mutex);

		teamLocker.Unlock();

		replace_vnode_if_disconnected(mount, vnodeToDisconnect, context->root,
			sRoot, true);
		replace_vnode_if_disconnected(mount, vnodeToDisconnect, context->cwd,
			sRoot, false);

		for (uint32 i = 0; i < context->table_size; i++) {
			struct file_descriptor* descriptor = context->fds[i];
			if (descriptor == NULL || (descriptor->open_mode & O_DISCONNECTED) != 0)
				continue;

			inc_fd_ref_count(descriptor);

			// if this descriptor points at this mount, we
			// need to disconnect it to be able to unmount
			struct vnode* vnode = fd_vnode(descriptor);
			if (vnodeToDisconnect != NULL) {
				if (vnode == vnodeToDisconnect)
					disconnect_fd(descriptor);
			} else if ((vnode != NULL && vnode->mount == mount)
				|| (vnode == NULL && descriptor->u.mount == mount))
				disconnect_fd(descriptor);

			put_fd(descriptor);
		}
	}
}
1963
1964
1965/*!	\brief Gets the root node of the current IO context.
1966	If \a kernel is \c true, the kernel IO context will be used.
1967	The caller obtains a reference to the returned node.
1968*/
1969struct vnode*
1970get_root_vnode(bool kernel)
1971{
1972	if (!kernel) {
1973		// Get current working directory from io context
1974		struct io_context* context = get_current_io_context(kernel);
1975
1976		mutex_lock(&sIOContextRootLock);
1977
1978		struct vnode* root = context->root;
1979		if (root != NULL)
1980			inc_vnode_ref_count(root);
1981
1982		mutex_unlock(&sIOContextRootLock);
1983
1984		if (root != NULL)
1985			return root;
1986
1987		// That should never happen.
1988		dprintf("get_root_vnode(): IO context for team %" B_PRId32 " doesn't "
1989			"have a root\n", team_get_current_team_id());
1990	}
1991
1992	inc_vnode_ref_count(sRoot);
1993	return sRoot;
1994}
1995
1996
1997/*!	\brief Gets the directory path and leaf name for a given path.
1998
	The supplied \a path is transformed to refer to the directory part of
	the entry identified by the original path, and the leaf name of the
	original entry is written into the buffer \a filename.
2002	Neither the returned path nor the leaf name can be expected to be
2003	canonical.
2004
2005	\param path The path to be analyzed. Must be able to store at least one
2006		   additional character.
2007	\param filename The buffer into which the leaf name will be written.
2008		   Must be of size B_FILE_NAME_LENGTH at least.
2009	\return \c B_OK, if everything went fine, \c B_NAME_TOO_LONG, if the leaf
2010		   name is longer than \c B_FILE_NAME_LENGTH, or \c B_ENTRY_NOT_FOUND,
2011		   if the given path name is empty.
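
	A few sketches of the transformation (hypothetical buffers with room
	for at least one extra character):

	\code
	// "a/b/foo" -> path: "a/b/.", filename: "foo"
	// "foo"     -> path: ".",     filename: "foo"
	// "a/b/"    -> path: "a/.",   filename: "b"
	// "/"       -> path: "/",     filename: "."
	\endcode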
2012*/
2013static status_t
2014get_dir_path_and_leaf(char* path, char* filename)
2015{
2016	if (*path == '\0')
2017		return B_ENTRY_NOT_FOUND;
2018
2019	char* last = strrchr(path, '/');
		// '/' is not allowed in file names!
2021
2022	FUNCTION(("get_dir_path_and_leaf(path = %s)\n", path));
2023
2024	if (last == NULL) {
		// this path is a single segment with no '/' in it, e.g. "foo"
2027		if (strlcpy(filename, path, B_FILE_NAME_LENGTH) >= B_FILE_NAME_LENGTH)
2028			return B_NAME_TOO_LONG;
2029
2030		strcpy(path, ".");
2031	} else {
2032		last++;
2033		if (last[0] == '\0') {
2034			// special case: the path ends in one or more '/' - remove them
2035			while (*--last == '/' && last != path);
2036			last[1] = '\0';
2037
2038			if (last == path && last[0] == '/') {
2039				// This path points to the root of the file system
2040				strcpy(filename, ".");
2041				return B_OK;
2042			}
2043			for (; last != path && *(last - 1) != '/'; last--);
2044				// rewind to the start of the leaf before the '/'
2045		}
2046
2047		// normal leaf: replace the leaf portion of the path with a '.'
2048		if (strlcpy(filename, last, B_FILE_NAME_LENGTH) >= B_FILE_NAME_LENGTH)
2049			return B_NAME_TOO_LONG;
2050
2051		last[0] = '.';
2052		last[1] = '\0';
2053	}
2054	return B_OK;
2055}
2056
2057
2058static status_t
2059entry_ref_to_vnode(dev_t mountID, ino_t directoryID, const char* name,
2060	bool traverse, bool kernel, VnodePutter& _vnode)
2061{
2062	char clonedName[B_FILE_NAME_LENGTH + 1];
2063	if (strlcpy(clonedName, name, B_FILE_NAME_LENGTH) >= B_FILE_NAME_LENGTH)
2064		return B_NAME_TOO_LONG;
2065
2066	// get the directory vnode and let vnode_path_to_vnode() do the rest
2067	struct vnode* directory;
2068
2069	status_t status = get_vnode(mountID, directoryID, &directory, true, false);
2070	if (status < 0)
2071		return status;
2072
2073	return vnode_path_to_vnode(directory, clonedName, traverse, kernel,
2074		_vnode, NULL);
2075}
2076
2077
2078/*!	Looks up the entry with name \a name in the directory represented by \a dir
2079	and returns the respective vnode.
2080	On success a reference to the vnode is acquired for the caller.
2081*/
2082static status_t
2083lookup_dir_entry(struct vnode* dir, const char* name, struct vnode** _vnode)
2084{
2085	ino_t id;
2086	bool missing;
2087
2088	if (dir->mount->entry_cache.Lookup(dir->id, name, id, missing)) {
2089		return missing ? B_ENTRY_NOT_FOUND
2090			: get_vnode(dir->device, id, _vnode, true, false);
2091	}
2092
2093	status_t status = FS_CALL(dir, lookup, name, &id);
2094	if (status != B_OK)
2095		return status;
2096
	// The lookup() hook calls get_vnode() or publish_vnode(), so we already
	// have a reference and just need to look the node up.
2099	rw_lock_read_lock(&sVnodeLock);
2100	*_vnode = lookup_vnode(dir->device, id);
2101	rw_lock_read_unlock(&sVnodeLock);
2102
2103	if (*_vnode == NULL) {
2104		panic("lookup_dir_entry(): could not lookup vnode (mountid 0x%" B_PRIx32
2105			" vnid 0x%" B_PRIx64 ")\n", dir->device, id);
2106		return B_ENTRY_NOT_FOUND;
2107	}
2108
2109//	ktrace_printf("lookup_dir_entry(): dir: %p (%ld, %lld), name: \"%s\" -> "
2110//		"%p (%ld, %lld)", dir, dir->mount->id, dir->id, name, *_vnode,
2111//		(*_vnode)->mount->id, (*_vnode)->id);
2112
2113	return B_OK;
2114}
2115
2116
/*!	Returns the vnode for the relative \a path starting at the specified \a vnode.

	\param[in,out] path The relative path being searched. Must not be NULL.
	If the function returns successfully, \a path contains the name of the last path
	component. This function clobbers the buffer pointed to by \a path only
	if it contains more than one component.

	Note that this function consumes a reference to the starting \a vnode,
	whether it succeeds or fails!

	\param[out] _vnode If the function returns \c B_OK: set to the found node.
	\param[out] _vnode If the function fails and \a leafName is not NULL: set to
		the last existing directory in the path. The caller is responsible for
		releasing it via put_vnode().
	\param[out] _vnode If the function fails and \a leafName is NULL: not used.
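
	A minimal usage sketch, via the \c kernel convenience overload below
	(hypothetical caller; \c dir is a directory vnode we own a reference to,
	which the call consumes):

	\code
	char path[] = "sub/dir/file";
	VnodePutter vnode;
	status_t status = vnode_path_to_vnode(dir, path, true, false, vnode, NULL);
	// on success, vnode holds a reference to the node for "file"
	\endcode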
2135*/
2136static status_t
2137vnode_path_to_vnode(struct vnode* start, char* path, bool traverseLeafLink,
2138	int count, struct io_context* ioContext, VnodePutter& _vnode,
2139	ino_t* _parentID, char* leafName)
2140{
	FUNCTION(("vnode_path_to_vnode(vnode = %p, path = %s)\n", start, path));
2142	ASSERT(!_vnode.IsSet());
2143
2144	VnodePutter vnode(start);
2145
2146	if (path == NULL)
2147		return B_BAD_VALUE;
2148	if (*path == '\0')
2149		return B_ENTRY_NOT_FOUND;
2150
2151	status_t status = B_OK;
2152	ino_t lastParentID = vnode->id;
2153	while (true) {
2154		char* nextPath;
2155
		TRACE(("vnode_path_to_vnode: top of loop. path = %p '%s'\n", path,
			path));
2158
2159		// done?
2160		if (path[0] == '\0')
2161			break;
2162
2163		// walk to find the next path component ("path" will point to a single
2164		// path component), and filter out multiple slashes
2165		for (nextPath = path + 1; *nextPath != '\0' && *nextPath != '/';
2166				nextPath++);
2167
2168		bool directoryFound = false;
2169		if (*nextPath == '/') {
2170			directoryFound = true;
2171			*nextPath = '\0';
2172			do
2173				nextPath++;
2174			while (*nextPath == '/');
2175		}
2176
		// If the path component is "..", and we are at a covering vnode,
		// move to the covered vnode, so that the ".." is passed to the
		// underlying file system.
		// Also prevent escaping the root of the IO context.
2180		if (strcmp("..", path) == 0) {
2181			if (vnode.Get() == ioContext->root) {
2182				// Attempted prison break! Keep it contained.
2183				path = nextPath;
2184				continue;
2185			}
2186
2187			if (Vnode* coveredVnode = get_covered_vnode(vnode.Get()))
2188				vnode.SetTo(coveredVnode);
2189		}
2190
2191		// check if vnode is really a directory
2192		if (status == B_OK && !S_ISDIR(vnode->Type()))
2193			status = B_NOT_A_DIRECTORY;
2194
2195		// Check if we have the right to search the current directory vnode.
2196		// If a file system doesn't have the access() function, we assume that
2197		// searching a directory is always allowed
2198		if (status == B_OK && HAS_FS_CALL(vnode, access))
2199			status = FS_CALL(vnode.Get(), access, X_OK);
2200
2201		// Tell the filesystem to get the vnode of this path component (if we
2202		// got the permission from the call above)
2203		VnodePutter nextVnode;
2204		if (status == B_OK) {
2205			struct vnode* temp = NULL;
2206			status = lookup_dir_entry(vnode.Get(), path, &temp);
2207			nextVnode.SetTo(temp);
2208		}
2209
2210		if (status != B_OK) {
2211			if (leafName != NULL) {
2212				strlcpy(leafName, path, B_FILE_NAME_LENGTH);
2213				_vnode.SetTo(vnode.Detach());
2214			}
2215			return status;
2216		}
2217
2218		// If the new node is a symbolic link, resolve it (if we've been told
2219		// to do it)
2220		if (S_ISLNK(nextVnode->Type())
2221			&& (traverseLeafLink || directoryFound)) {
2222			size_t bufferSize;
2223			char* buffer;
2224
2225			TRACE(("traverse link\n"));
2226
2227			if (count + 1 > B_MAX_SYMLINKS)
2228				return B_LINK_LIMIT;
2229
2230			bufferSize = B_PATH_NAME_LENGTH;
2231			buffer = (char*)object_cache_alloc(sPathNameCache, 0);
2232			if (buffer == NULL)
2233				return B_NO_MEMORY;
2234
2235			if (HAS_FS_CALL(nextVnode, read_symlink)) {
2236				bufferSize--;
2237				status = FS_CALL(nextVnode.Get(), read_symlink, buffer, &bufferSize);
2238				// null-terminate
2239				if (status >= 0 && bufferSize < B_PATH_NAME_LENGTH)
2240					buffer[bufferSize] = '\0';
2241			} else
2242				status = B_BAD_VALUE;
2243
			if (status != B_OK) {
				// the buffer was allocated from sPathNameCache, so it must
				// not be released with free()
				object_cache_free(sPathNameCache, buffer, 0);
2246				return status;
2247			}
2248			nextVnode.Unset();
2249
2250			// Check if we start from the root directory or the current
2251			// directory ("vnode" still points to that one).
2252			// Cut off all leading slashes if it's the root directory
2253			path = buffer;
2254			bool absoluteSymlink = false;
2255			if (path[0] == '/') {
2256				// we don't need the old directory anymore
2257				vnode.Unset();
2258
2259				while (*++path == '/')
2260					;
2261
2262				mutex_lock(&sIOContextRootLock);
2263				vnode.SetTo(ioContext->root);
2264				inc_vnode_ref_count(vnode.Get());
2265				mutex_unlock(&sIOContextRootLock);
2266
2267				absoluteSymlink = true;
2268			}
2269
2270			inc_vnode_ref_count(vnode.Get());
				// balance the next recursion - it will decrement the
				// ref_count of the vnode, whether it succeeds or not
2273
2274			if (absoluteSymlink && *path == '\0') {
2275				// symlink was just "/"
2276				nextVnode.SetTo(vnode.Get());
2277			} else {
2278				status = vnode_path_to_vnode(vnode.Get(), path, true, count + 1,
2279					ioContext, nextVnode, &lastParentID, leafName);
2280			}
2281
2282			object_cache_free(sPathNameCache, buffer, 0);
2283
2284			if (status != B_OK) {
2285				if (leafName != NULL)
2286					_vnode.SetTo(nextVnode.Detach());
2287				return status;
2288			}
2289		} else
2290			lastParentID = vnode->id;
2291
2292		// decrease the ref count on the old dir we just looked up into
2293		vnode.Unset();
2294
2295		path = nextPath;
2296		vnode.SetTo(nextVnode.Detach());
2297
2298		// see if we hit a covered node
2299		if (Vnode* coveringNode = get_covering_vnode(vnode.Get()))
2300			vnode.SetTo(coveringNode);
2301	}
2302
2303	_vnode.SetTo(vnode.Detach());
2304	if (_parentID)
2305		*_parentID = lastParentID;
2306
2307	return B_OK;
2308}
2309
2310
2311static status_t
2312vnode_path_to_vnode(struct vnode* vnode, char* path, bool traverseLeafLink,
2313	bool kernel, VnodePutter& _vnode, ino_t* _parentID, char* leafName)
2314{
2315	return vnode_path_to_vnode(vnode, path, traverseLeafLink, 0,
2316		get_current_io_context(kernel), _vnode, _parentID, leafName);
2317}
2318
2319
2320static status_t
2321path_to_vnode(char* path, bool traverseLink, VnodePutter& _vnode,
2322	ino_t* _parentID, bool kernel)
2323{
2324	struct vnode* start = NULL;
2325
2326	FUNCTION(("path_to_vnode(path = \"%s\")\n", path));
2327
2328	if (!path)
2329		return B_BAD_VALUE;
2330
2331	if (*path == '\0')
2332		return B_ENTRY_NOT_FOUND;
2333
2334	// figure out if we need to start at root or at cwd
2335	if (*path == '/') {
2336		if (sRoot == NULL) {
2337			// we're a bit early, aren't we?
2338			return B_ERROR;
2339		}
2340
2341		while (*++path == '/')
2342			;
2343		start = get_root_vnode(kernel);
2344
2345		if (*path == '\0') {
2346			_vnode.SetTo(start);
2347			return B_OK;
2348		}
2349
2350	} else {
2351		struct io_context* context = get_current_io_context(kernel);
2352
2353		mutex_lock(&context->io_mutex);
2354		start = context->cwd;
2355		if (start != NULL)
2356			inc_vnode_ref_count(start);
2357		mutex_unlock(&context->io_mutex);
2358
2359		if (start == NULL)
2360			return B_ERROR;
2361	}
2362
2363	return vnode_path_to_vnode(start, path, traverseLink, kernel, _vnode,
2364		_parentID);
2365}
2366
2367
/*! Returns the vnode for the next-to-last segment of the path, and returns
	the last portion in \a filename.
2370	The path buffer must be able to store at least one additional character.
2371*/
2372static status_t
2373path_to_dir_vnode(char* path, VnodePutter& _vnode, char* filename,
2374	bool kernel)
2375{
2376	status_t status = get_dir_path_and_leaf(path, filename);
2377	if (status != B_OK)
2378		return status;
2379
2380	return path_to_vnode(path, true, _vnode, NULL, kernel);
2381}
2382
2383
2384/*!	\brief Retrieves the directory vnode and the leaf name of an entry referred
2385		   to by a FD + path pair.
2386
2387	\a path must be given in either case. \a fd might be omitted, in which
2388	case \a path is either an absolute path or one relative to the current
	directory. If both are supplied and \a path is relative, it is reckoned
	off of the directory referred to by \a fd. If \a path is absolute, \a fd
	is ignored.
2392
2393	The caller has the responsibility to call put_vnode() on the returned
2394	directory vnode.
2395
2396	\param fd The FD. May be < 0.
2397	\param path The absolute or relative path. Must not be \c NULL. The buffer
2398	       is modified by this function. It must have at least room for a
2399	       string one character longer than the path it contains.
2400	\param _vnode A pointer to a variable the directory vnode shall be written
2401		   into.
2402	\param filename A buffer of size B_FILE_NAME_LENGTH or larger into which
2403		   the leaf name of the specified entry will be written.
2404	\param kernel \c true, if invoked from inside the kernel, \c false if
2405		   invoked from userland.
2406	\return \c B_OK, if everything went fine, another error code otherwise.
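
	A sketch (hypothetical FD and buffers):

	\code
	char path[B_PATH_NAME_LENGTH] = "sub/dir/entry";
	char filename[B_FILE_NAME_LENGTH];
	VnodePutter dir;
	status_t status = fd_and_path_to_dir_vnode(fd, path, dir, filename, false);
	// on success, dir references the vnode of "sub/dir" (relative to the
	// FD's directory) and filename contains "entry"
	\endcode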
2407*/
2408static status_t
2409fd_and_path_to_dir_vnode(int fd, char* path, VnodePutter& _vnode,
2410	char* filename, bool kernel)
2411{
2412	if (!path)
2413		return B_BAD_VALUE;
2414	if (*path == '\0')
2415		return B_ENTRY_NOT_FOUND;
2416	if (fd < 0)
2417		return path_to_dir_vnode(path, _vnode, filename, kernel);
2418
2419	status_t status = get_dir_path_and_leaf(path, filename);
2420	if (status != B_OK)
2421		return status;
2422
2423	return fd_and_path_to_vnode(fd, path, true, _vnode, NULL, kernel);
2424}
2425
2426
2427/*!	\brief Retrieves the directory vnode and the leaf name of an entry referred
2428		   to by a vnode + path pair.
2429
2430	\a path must be given in either case. \a vnode might be omitted, in which
2431	case \a path is either an absolute path or one relative to the current
	directory. If both are supplied and \a path is relative, it is reckoned
	off of the directory referred to by \a vnode. If \a path is absolute,
	\a vnode is ignored.
2435
2436	The caller has the responsibility to call put_vnode() on the returned
2437	directory vnode.
2438
	Note, this function consumes a reference to the starting \a vnode,
	whether it succeeds or fails.
2441
2442	\param vnode The vnode. May be \c NULL.
2443	\param path The absolute or relative path. Must not be \c NULL. The buffer
2444	       is modified by this function. It must have at least room for a
2445	       string one character longer than the path it contains.
2446	\param _vnode A pointer to a variable the directory vnode shall be written
2447		   into.
2448	\param filename A buffer of size B_FILE_NAME_LENGTH or larger into which
2449		   the leaf name of the specified entry will be written.
2450	\param kernel \c true, if invoked from inside the kernel, \c false if
2451		   invoked from userland.
2452	\return \c B_OK, if everything went fine, another error code otherwise.
2453*/
2454static status_t
2455vnode_and_path_to_dir_vnode(struct vnode* vnode, char* path,
2456	VnodePutter& _vnode, char* filename, bool kernel)
2457{
2458	VnodePutter vnodePutter(vnode);
2459
2460	if (!path)
2461		return B_BAD_VALUE;
2462	if (*path == '\0')
2463		return B_ENTRY_NOT_FOUND;
2464	if (vnode == NULL || path[0] == '/')
2465		return path_to_dir_vnode(path, _vnode, filename, kernel);
2466
2467	status_t status = get_dir_path_and_leaf(path, filename);
2468	if (status != B_OK)
2469		return status;
2470
2471	vnodePutter.Detach();
2472	return vnode_path_to_vnode(vnode, path, true, kernel, _vnode, NULL);
2473}
2474
2475
2476/*! Returns a vnode's name in the d_name field of a supplied dirent buffer.
2477*/
2478static status_t
2479get_vnode_name(struct vnode* vnode, struct vnode* parent, struct dirent* buffer,
2480	size_t bufferSize, struct io_context* ioContext)
2481{
2482	if (bufferSize < sizeof(struct dirent))
2483		return B_BAD_VALUE;
2484
2485	// See if the vnode is covering another vnode and move to the covered
2486	// vnode so we get the underlying file system
2487	VnodePutter vnodePutter;
2488	if (Vnode* coveredVnode = get_covered_vnode(vnode)) {
2489		vnode = coveredVnode;
2490		vnodePutter.SetTo(vnode);
2491	}
2492
2493	if (HAS_FS_CALL(vnode, get_vnode_name)) {
2494		// The FS supports getting the name of a vnode.
2495		if (FS_CALL(vnode, get_vnode_name, buffer->d_name,
2496			(char*)buffer + bufferSize - buffer->d_name) == B_OK)
2497			return B_OK;
2498	}
2499
	// The FS doesn't support getting the name of a vnode. So we search the
	// parent directory for the vnode, if the caller provided one.
2502
2503	if (parent == NULL || !HAS_FS_CALL(parent, read_dir))
2504		return B_UNSUPPORTED;
2505
2506	void* cookie;
2507
2508	status_t status = FS_CALL(parent, open_dir, &cookie);
2509	if (status >= B_OK) {
2510		while (true) {
2511			uint32 num = 1;
			// We use the FS hook directly instead of dir_read(), since we
			// don't want the entries to be fixed up. We have already resolved
			// vnode to the covered node.
2515			status = FS_CALL(parent, read_dir, cookie, buffer, bufferSize,
2516				&num);
2517			if (status != B_OK)
2518				break;
2519			if (num == 0) {
2520				status = B_ENTRY_NOT_FOUND;
2521				break;
2522			}
2523
2524			if (vnode->id == buffer->d_ino) {
2525				// found correct entry!
2526				break;
2527			}
2528		}
2529
2530		FS_CALL(parent, close_dir, cookie);
2531		FS_CALL(parent, free_dir_cookie, cookie);
2532	}
2533	return status;
2534}
2535
2536
2537static status_t
2538get_vnode_name(struct vnode* vnode, struct vnode* parent, char* name,
2539	size_t nameSize, bool kernel)
2540{
2541	char buffer[offsetof(struct dirent, d_name) + B_FILE_NAME_LENGTH + 1];
2542	struct dirent* dirent = (struct dirent*)buffer;
2543
2544	status_t status = get_vnode_name(vnode, parent, dirent, sizeof(buffer),
2545		get_current_io_context(kernel));
2546	if (status != B_OK)
2547		return status;
2548
2549	if (strlcpy(name, dirent->d_name, nameSize) >= nameSize)
2550		return B_BUFFER_OVERFLOW;
2551
2552	return B_OK;
2553}
2554
2555
2556/*!	Gets the full path to a given directory vnode.
2557	It uses the fs_get_vnode_name() call to get the name of a vnode; if a
2558	file system doesn't support this call, it will fall back to iterating
2559	through the parent directory to get the name of the child.
2560
	To protect against loops, it supports a maximum tree depth
	of 256 levels.

	Note that the path might not be correct by the time this function
	returns! It doesn't use any locking to ensure that the returned path
	is still correct, as paths aren't safe anyway: the path to a file can
	change at any time.

	It might be a good idea, though, to check in the calling function
	whether the returned path exists (it's not done here for efficiency
	reasons).
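
	A minimal usage sketch (hypothetical caller that already owns a
	reference to \c vnode):

	\code
	char buffer[B_PATH_NAME_LENGTH];
	status_t status = dir_vnode_to_path(vnode, buffer, sizeof(buffer), true);
	if (status == B_OK)
		dprintf("path: %s\n", buffer);
	\endcode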
2570*/
2571static status_t
2572dir_vnode_to_path(struct vnode* vnode, char* buffer, size_t bufferSize,
2573	bool kernel)
2574{
2575	FUNCTION(("dir_vnode_to_path(%p, %p, %lu)\n", vnode, buffer, bufferSize));
2576
2577	if (vnode == NULL || buffer == NULL || bufferSize == 0)
2578		return B_BAD_VALUE;
2579
2580	if (!S_ISDIR(vnode->Type()))
2581		return B_NOT_A_DIRECTORY;
2582
2583	char* path = buffer;
2584	int32 insert = bufferSize;
2585	int32 maxLevel = 256;
2586	int32 length;
2587	status_t status = B_OK;
2588	struct io_context* ioContext = get_current_io_context(kernel);
2589
2590	// we don't use get_vnode() here because this call is more
2591	// efficient and does all we need from get_vnode()
2592	inc_vnode_ref_count(vnode);
2593
2594	path[--insert] = '\0';
2595		// the path is filled right to left
2596
2597	while (true) {
2598		// If the node is the context's root, bail out. Otherwise resolve mount
2599		// points.
2600		if (vnode == ioContext->root)
2601			break;
2602
2603		if (Vnode* coveredVnode = get_covered_vnode(vnode)) {
2604			put_vnode(vnode);
2605			vnode = coveredVnode;
2606		}
2607
2608		// lookup the parent vnode
2609		struct vnode* parentVnode;
2610		status = lookup_dir_entry(vnode, "..", &parentVnode);
2611		if (status != B_OK)
2612			goto out;
2613
2614		if (parentVnode == vnode) {
2615			// The caller apparently got their hands on a node outside of their
2616			// context's root. Now we've hit the global root.
2617			put_vnode(parentVnode);
2618			break;
2619		}
2620
2621		// get the node's name
2622		char nameBuffer[offsetof(struct dirent, d_name) + B_FILE_NAME_LENGTH + 1];
2623			// also used for fs_read_dir()
2624		char* name = &((struct dirent*)nameBuffer)->d_name[0];
2625		status = get_vnode_name(vnode, parentVnode, (struct dirent*)nameBuffer,
2626			sizeof(nameBuffer), ioContext);
2627
2628		// release the current vnode, we only need its parent from now on
2629		put_vnode(vnode);
2630		vnode = parentVnode;
2631
2632		if (status != B_OK)
2633			goto out;
2634
		// TODO: add an explicit check for loops after about 10 levels to do
		// real loop detection

		// don't go deeper than 'maxLevel' to prevent loops
2639		if (maxLevel-- < 0) {
2640			status = B_LINK_LIMIT;
2641			goto out;
2642		}
2643
2644		// add the name in front of the current path
2645		name[B_FILE_NAME_LENGTH - 1] = '\0';
2646		length = strlen(name);
2647		insert -= length;
2648		if (insert <= 0) {
2649			status = B_RESULT_NOT_REPRESENTABLE;
2650			goto out;
2651		}
2652		memcpy(path + insert, name, length);
2653		path[--insert] = '/';
2654	}
2655
2656	// the root dir will result in an empty path: fix it
2657	if (path[insert] == '\0')
2658		path[--insert] = '/';
2659
2660	TRACE(("  path is: %s\n", path + insert));
2661
2662	// move the path to the start of the buffer
2663	length = bufferSize - insert;
2664	memmove(buffer, path + insert, length);
2665
2666out:
2667	put_vnode(vnode);
2668	return status;
2669}
2670
2671
2672/*!	Checks the length of every path component, and adds a '.'
2673	if the path ends in a slash.
2674	The given path buffer must be able to store at least one
2675	additional character.
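
	A sketch with a hypothetical buffer:

	\code
	char path[B_PATH_NAME_LENGTH] = "/foo/bar/";
	// check_path(path) rewrites the buffer to "/foo/bar/." and returns
	// B_OK; a component longer than B_FILE_NAME_LENGTH would yield
	// B_NAME_TOO_LONG instead.
	\endcode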
2676*/
2677static status_t
2678check_path(char* to)
2679{
2680	int32 length = 0;
2681
2682	// check length of every path component
2683
2684	while (*to) {
2685		char* begin;
2686		if (*to == '/')
2687			to++, length++;
2688
2689		begin = to;
2690		while (*to != '/' && *to)
2691			to++, length++;
2692
2693		if (to - begin > B_FILE_NAME_LENGTH)
2694			return B_NAME_TOO_LONG;
2695	}
2696
2697	if (length == 0)
2698		return B_ENTRY_NOT_FOUND;
2699
2700	// complete path if there is a slash at the end
2701
2702	if (*(to - 1) == '/') {
2703		if (length > B_PATH_NAME_LENGTH - 2)
2704			return B_NAME_TOO_LONG;
2705
2706		to[0] = '.';
2707		to[1] = '\0';
2708	}
2709
2710	return B_OK;
2711}
2712
2713
2714static struct file_descriptor*
2715get_fd_and_vnode(int fd, struct vnode** _vnode, bool kernel)
2716{
2717	struct file_descriptor* descriptor
2718		= get_fd(get_current_io_context(kernel), fd);
2719	if (descriptor == NULL)
2720		return NULL;
2721
2722	struct vnode* vnode = fd_vnode(descriptor);
2723	if (vnode == NULL) {
2724		put_fd(descriptor);
2725		return NULL;
2726	}
2727
	// TODO: when we can close a file descriptor at any point, investigate
	//	whether this is still valid to do (accessing the vnode without a
	//	reference or locking)
2731	*_vnode = vnode;
2732	return descriptor;
2733}
2734
2735
2736static struct vnode*
2737get_vnode_from_fd(int fd, bool kernel)
2738{
2739	struct file_descriptor* descriptor;
2740	struct vnode* vnode;
2741
2742	descriptor = get_fd(get_current_io_context(kernel), fd);
2743	if (descriptor == NULL)
2744		return NULL;
2745
2746	vnode = fd_vnode(descriptor);
2747	if (vnode != NULL)
2748		inc_vnode_ref_count(vnode);
2749
2750	put_fd(descriptor);
2751	return vnode;
2752}
2753
2754
2755/*!	Gets the vnode from an FD + path combination. If \a fd is lower than zero,
2756	only the path will be considered. In this case, the \a path must not be
2757	NULL.
2758	If \a fd is a valid file descriptor, \a path may be NULL for directories,
2759	and should be NULL for files.
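
	A sketch of the supported combinations (hypothetical values):

	\code
	// fd < 0,  path "/a/b" -> resolved from the root
	// fd < 0,  path "a/b"  -> resolved from the current directory
	// fd >= 0, path "a/b"  -> resolved relative to the FD's vnode
	// fd >= 0, path NULL   -> the FD's own vnode is returned
	\endcode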
2760*/
2761static status_t
2762fd_and_path_to_vnode(int fd, char* path, bool traverseLeafLink,
2763	VnodePutter& _vnode, ino_t* _parentID, bool kernel)
2764{
2765	if (fd < 0 && !path)
2766		return B_BAD_VALUE;
2767
2768	if (path != NULL && *path == '\0')
2769		return B_ENTRY_NOT_FOUND;
2770
2771	if (fd < 0 || (path != NULL && path[0] == '/')) {
2772		// no FD or absolute path
2773		return path_to_vnode(path, traverseLeafLink, _vnode, _parentID, kernel);
2774	}
2775
2776	// FD only, or FD + relative path
2777	struct vnode* vnode = get_vnode_from_fd(fd, kernel);
2778	if (vnode == NULL)
2779		return B_FILE_ERROR;
2780
2781	if (path != NULL) {
2782		return vnode_path_to_vnode(vnode, path, traverseLeafLink, kernel,
2783			_vnode, _parentID);
2784	}
2785
2786	// there is no relative path to take into account
2787
2788	_vnode.SetTo(vnode);
2789	if (_parentID)
2790		*_parentID = -1;
2791
2792	return B_OK;
2793}
2794
2795
2796struct vnode*
2797fd_vnode(struct file_descriptor* descriptor)
2798{
2799	if (descriptor->ops == &sFileOps
2800			|| descriptor->ops == &sDirectoryOps
2801			|| descriptor->ops == &sAttributeOps
2802			|| descriptor->ops == &sAttributeDirectoryOps)
2803		return descriptor->u.vnode;
2804
2805	return NULL;
2806}
2807
2808
2809bool
2810fd_is_file(struct file_descriptor* descriptor)
2811{
2812	return descriptor->ops == &sFileOps;
2813}
2814
2815
2816static int
2817get_new_fd(struct fd_ops* ops, struct fs_mount* mount, struct vnode* vnode,
2818	void* cookie, int openMode, bool kernel)
2819{
2820	struct file_descriptor* descriptor;
2821	int fd;
2822
	// If the vnode is locked, we don't allow creating a new file or
	// directory file_descriptor for it
2825	if (vnode && vnode->mandatory_locked_by != NULL
2826		&& (ops == &sFileOps || ops == &sDirectoryOps))
2827		return B_BUSY;
2828
2829	if ((openMode & O_RDWR) != 0 && (openMode & O_WRONLY) != 0)
2830		return B_BAD_VALUE;
2831
2832	descriptor = alloc_fd();
2833	if (!descriptor)
2834		return B_NO_MEMORY;
2835
2836	if (vnode)
2837		descriptor->u.vnode = vnode;
2838	else
2839		descriptor->u.mount = mount;
2840	descriptor->cookie = cookie;
2841
2842	descriptor->ops = ops;
2843	descriptor->open_mode = openMode;
2844
2845	if (descriptor->ops->fd_seek != NULL) {
2846		// some kinds of files are not seekable
2847		switch (vnode->Type() & S_IFMT) {
2848			case S_IFIFO:
2849			case S_IFSOCK:
2850				ASSERT(descriptor->pos == -1);
2851				break;
2852
			// The Open Group Base Specs don't single out any file types
			// besides pipes, FIFOs, and sockets, so we allow seeking all
			// others.
2855			default:
2856				descriptor->pos = 0;
2857				break;
2858		}
2859	}
2860
2861	io_context* context = get_current_io_context(kernel);
2862	fd = new_fd(context, descriptor);
2863	if (fd < 0) {
2864		descriptor->ops = NULL;
2865		put_fd(descriptor);
2866		return B_NO_MORE_FDS;
2867	}
2868
2869	mutex_lock(&context->io_mutex);
2870	fd_set_close_on_exec(context, fd, (openMode & O_CLOEXEC) != 0);
2871	mutex_unlock(&context->io_mutex);
2872
2873	return fd;
2874}
2875
2876
2877/*!	In-place normalizes \a path. It's otherwise semantically equivalent to
2878	vfs_normalize_path(). See there for more documentation.
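
	A sketch with a hypothetical path (assuming all components exist and
	no symlinks are involved):

	\code
	char path[B_PATH_NAME_LENGTH] = "/boot/home/../home/./Desktop";
	// normalize_path(path, sizeof(path), true, true) rewrites the buffer
	// to "/boot/home/Desktop"
	\endcode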
2879*/
2880static status_t
2881normalize_path(char* path, size_t pathSize, bool traverseLink, bool kernel)
2882{
2883	VnodePutter dir;
2884	status_t error;
2885
2886	for (int i = 0; i < B_MAX_SYMLINKS; i++) {
2887		// get dir vnode + leaf name
2888		char leaf[B_FILE_NAME_LENGTH];
2889		error = vnode_and_path_to_dir_vnode(dir.Detach(), path, dir, leaf, kernel);
2890		if (error != B_OK)
2891			return error;
2892		strcpy(path, leaf);
2893
2894		// get file vnode, if we shall resolve links
2895		bool fileExists = false;
2896		VnodePutter fileVnode;
2897		if (traverseLink) {
2898			inc_vnode_ref_count(dir.Get());
2899			if (vnode_path_to_vnode(dir.Get(), path, false, kernel, fileVnode,
2900					NULL) == B_OK) {
2901				fileExists = true;
2902			}
2903		}
2904
2905		if (!fileExists || !traverseLink || !S_ISLNK(fileVnode->Type())) {
2906			// we're done -- construct the path
2907			bool hasLeaf = true;
2908			if (strcmp(leaf, ".") == 0 || strcmp(leaf, "..") == 0) {
2909				// special cases "." and ".." -- get the dir, forget the leaf
2910				error = vnode_path_to_vnode(dir.Detach(), leaf, false, kernel,
2911					dir, NULL);
2912				if (error != B_OK)
2913					return error;
2914				hasLeaf = false;
2915			}
2916
2917			// get the directory path
2918			error = dir_vnode_to_path(dir.Get(), path, B_PATH_NAME_LENGTH, kernel);
2919			if (error != B_OK)
2920				return error;
2921
2922			// append the leaf name
2923			if (hasLeaf) {
2924				// insert a directory separator if this is not the file system
2925				// root
2926				if ((strcmp(path, "/") != 0
2927					&& strlcat(path, "/", pathSize) >= pathSize)
2928					|| strlcat(path, leaf, pathSize) >= pathSize) {
2929					return B_NAME_TOO_LONG;
2930				}
2931			}
2932
2933			return B_OK;
2934		}
2935
2936		// read link
2937		if (HAS_FS_CALL(fileVnode, read_symlink)) {
2938			size_t bufferSize = B_PATH_NAME_LENGTH - 1;
2939			error = FS_CALL(fileVnode.Get(), read_symlink, path, &bufferSize);
2940			if (error != B_OK)
2941				return error;
2942			if (bufferSize < B_PATH_NAME_LENGTH)
2943				path[bufferSize] = '\0';
2944		} else
2945			return B_BAD_VALUE;
2946	}
2947
2948	return B_LINK_LIMIT;
2949}
2950
2951
2952static status_t
2953resolve_covered_parent(struct vnode* parent, dev_t* _device, ino_t* _node,
2954	struct io_context* ioContext)
2955{
2956	// Make sure the IO context root is not bypassed.
2957	if (parent == ioContext->root) {
2958		*_device = parent->device;
2959		*_node = parent->id;
2960		return B_OK;
2961	}
2962
2963	inc_vnode_ref_count(parent);
2964		// vnode_path_to_vnode() puts the node
2965
2966	// ".." is guaranteed not to be clobbered by this call
2967	VnodePutter vnode;
2968	status_t status = vnode_path_to_vnode(parent, (char*)"..", false,
2969		ioContext, vnode, NULL);
2970	if (status == B_OK) {
2971		*_device = vnode->device;
2972		*_node = vnode->id;
2973	}
2974
2975	return status;
2976}
2977
2978
2979#ifdef ADD_DEBUGGER_COMMANDS
2980
2981
2982static void
2983_dump_advisory_locking(advisory_locking* locking)
2984{
2985	if (locking == NULL)
2986		return;
2987
	kprintf("   lock:        %" B_PRId32 "\n", locking->lock);
	kprintf("   wait_sem:    %" B_PRId32 "\n", locking->wait_sem);
2990
2991	int32 index = 0;
2992	LockList::Iterator iterator = locking->locks.GetIterator();
2993	while (iterator.HasNext()) {
2994		struct advisory_lock* lock = iterator.Next();
2995
2996		kprintf("   [%2" B_PRId32 "] team:   %" B_PRId32 "\n", index++, lock->team);
2997		kprintf("        start:  %" B_PRIdOFF "\n", lock->start);
2998		kprintf("        end:    %" B_PRIdOFF "\n", lock->end);
2999		kprintf("        shared? %s\n", lock->shared ? "yes" : "no");
3000	}
3001}
3002
3003
3004static void
3005_dump_mount(struct fs_mount* mount)
3006{
3007	kprintf("MOUNT: %p\n", mount);
3008	kprintf(" id:            %" B_PRIdDEV "\n", mount->id);
3009	kprintf(" device_name:   %s\n", mount->device_name);
3010	kprintf(" root_vnode:    %p\n", mount->root_vnode);
3011	kprintf(" covers:        %p\n", mount->root_vnode->covers);
3012	kprintf(" partition:     %p\n", mount->partition);
3013	kprintf(" lock:          %p\n", &mount->lock);
3014	kprintf(" flags:        %s%s\n", mount->unmounting ? " unmounting" : "",
3015		mount->owns_file_device ? " owns_file_device" : "");
3016
3017	fs_volume* volume = mount->volume;
3018	while (volume != NULL) {
3019		kprintf(" volume %p:\n", volume);
3020		kprintf("  layer:            %" B_PRId32 "\n", volume->layer);
3021		kprintf("  private_volume:   %p\n", volume->private_volume);
3022		kprintf("  ops:              %p\n", volume->ops);
3023		kprintf("  file_system:      %p\n", volume->file_system);
3024		kprintf("  file_system_name: %s\n", volume->file_system_name);
3025		volume = volume->super_volume;
3026	}
3027
3028	set_debug_variable("_volume", (addr_t)mount->volume->private_volume);
3029	set_debug_variable("_root", (addr_t)mount->root_vnode);
3030	set_debug_variable("_covers", (addr_t)mount->root_vnode->covers);
3031	set_debug_variable("_partition", (addr_t)mount->partition);
3032}
3033
3034
3035static bool
3036debug_prepend_vnode_name_to_path(char* buffer, size_t& bufferSize,
3037	const char* name)
3038{
3039	bool insertSlash = buffer[bufferSize] != '\0';
3040	size_t nameLength = strlen(name);
3041
3042	if (bufferSize < nameLength + (insertSlash ? 1 : 0))
3043		return false;
3044
3045	if (insertSlash)
3046		buffer[--bufferSize] = '/';
3047
3048	bufferSize -= nameLength;
3049	memcpy(buffer + bufferSize, name, nameLength);
3050
3051	return true;
3052}
3053
3054
3055static bool
3056debug_prepend_vnode_id_to_path(char* buffer, size_t& bufferSize, dev_t devID,
3057	ino_t nodeID)
3058{
3059	if (bufferSize == 0)
3060		return false;
3061
3062	bool insertSlash = buffer[bufferSize] != '\0';
3063	if (insertSlash)
3064		buffer[--bufferSize] = '/';
3065
3066	size_t size = snprintf(buffer, bufferSize,
3067		"<%" B_PRIdDEV ",%" B_PRIdINO ">", devID, nodeID);
3068	if (size > bufferSize) {
3069		if (insertSlash)
3070			bufferSize++;
3071		return false;
3072	}
3073
3074	if (size < bufferSize)
3075		memmove(buffer + bufferSize - size, buffer, size);
3076
3077	bufferSize -= size;
3078	return true;
3079}
3080
3081
3082static char*
3083debug_resolve_vnode_path(struct vnode* vnode, char* buffer, size_t bufferSize,
3084	bool& _truncated)
3085{
3086	// null-terminate the path
3087	buffer[--bufferSize] = '\0';
3088
3089	while (true) {
3090		while (vnode->covers != NULL)
3091			vnode = vnode->covers;
3092
3093		if (vnode == sRoot) {
3094			_truncated = bufferSize == 0;
3095			if (!_truncated)
3096				buffer[--bufferSize] = '/';
3097			return buffer + bufferSize;
3098		}
3099
3100		// resolve the name
3101		ino_t dirID;
3102		const char* name = vnode->mount->entry_cache.DebugReverseLookup(
3103			vnode->id, dirID);
3104		if (name == NULL) {
3105			// Failed to resolve the name -- prepend "<dev,node>/".
3106			_truncated = !debug_prepend_vnode_id_to_path(buffer, bufferSize,
3107				vnode->mount->id, vnode->id);
3108			return buffer + bufferSize;
3109		}
3110
3111		// prepend the name
3112		if (!debug_prepend_vnode_name_to_path(buffer, bufferSize, name)) {
3113			_truncated = true;
3114			return buffer + bufferSize;
3115		}
3116
3117		// resolve the directory node
3118		struct vnode* nextVnode = lookup_vnode(vnode->mount->id, dirID);
3119		if (nextVnode == NULL) {
3120			_truncated = !debug_prepend_vnode_id_to_path(buffer, bufferSize,
3121				vnode->mount->id, dirID);
3122			return buffer + bufferSize;
3123		}
3124
3125		vnode = nextVnode;
3126	}
3127}
3128
3129
3130static void
3131_dump_vnode(struct vnode* vnode, bool printPath)
3132{
3133	kprintf("VNODE: %p\n", vnode);
3134	kprintf(" device:        %" B_PRIdDEV "\n", vnode->device);
3135	kprintf(" id:            %" B_PRIdINO "\n", vnode->id);
3136	kprintf(" ref_count:     %" B_PRId32 "\n", vnode->ref_count);
3137	kprintf(" private_node:  %p\n", vnode->private_node);
3138	kprintf(" mount:         %p\n", vnode->mount);
3139	kprintf(" covered_by:    %p\n", vnode->covered_by);
3140	kprintf(" covers:        %p\n", vnode->covers);
3141	kprintf(" cache:         %p\n", vnode->cache);
3142	kprintf(" type:          %#" B_PRIx32 "\n", vnode->Type());
3143	kprintf(" flags:         %s%s%s\n", vnode->IsRemoved() ? "r" : "-",
3144		vnode->IsBusy() ? "b" : "-", vnode->IsUnpublished() ? "u" : "-");
3145	kprintf(" advisory_lock: %p\n", vnode->advisory_locking);
3146
3147	_dump_advisory_locking(vnode->advisory_locking);
3148
3149	if (printPath) {
3150		void* buffer = debug_malloc(B_PATH_NAME_LENGTH);
3151		if (buffer != NULL) {
3152			bool truncated;
3153			char* path = debug_resolve_vnode_path(vnode, (char*)buffer,
3154				B_PATH_NAME_LENGTH, truncated);
3155			if (path != NULL) {
3156				kprintf(" path:          ");
3157				if (truncated)
3158					kputs("<truncated>/");
3159				kputs(path);
3160				kputs("\n");
3161			} else
3162				kprintf("Failed to resolve vnode path.\n");
3163
3164			debug_free(buffer);
3165		} else
3166			kprintf("Failed to allocate memory for constructing the path.\n");
3167	}
3168
3169	set_debug_variable("_node", (addr_t)vnode->private_node);
3170	set_debug_variable("_mount", (addr_t)vnode->mount);
3171	set_debug_variable("_covered_by", (addr_t)vnode->covered_by);
3172	set_debug_variable("_covers", (addr_t)vnode->covers);
3173	set_debug_variable("_adv_lock", (addr_t)vnode->advisory_locking);
3174}
3175
3176
3177static int
3178dump_mount(int argc, char** argv)
3179{
3180	if (argc != 2 || !strcmp(argv[1], "--help")) {
3181		kprintf("usage: %s [id|address]\n", argv[0]);
3182		return 0;
3183	}
3184
3185	ulong val = parse_expression(argv[1]);
3186	uint32 id = val;
3187
3188	struct fs_mount* mount = sMountsTable->Lookup(id);
3189	if (mount == NULL) {
3190		if (IS_USER_ADDRESS(id)) {
3191			kprintf("fs_mount not found\n");
3192			return 0;
3193		}
3194		mount = (fs_mount*)val;
3195	}
3196
3197	_dump_mount(mount);
3198	return 0;
3199}
3200
3201
3202static int
3203dump_mounts(int argc, char** argv)
3204{
3205	if (argc != 1) {
3206		kprintf("usage: %s\n", argv[0]);
3207		return 0;
3208	}
3209
3210	kprintf("%-*s    id %-*s   %-*s   %-*s   fs_name\n",
3211		B_PRINTF_POINTER_WIDTH, "address", B_PRINTF_POINTER_WIDTH, "root",
3212		B_PRINTF_POINTER_WIDTH, "covers", B_PRINTF_POINTER_WIDTH, "cookie");
3213
3214	struct fs_mount* mount;
3215
3216	MountTable::Iterator iterator(sMountsTable);
3217	while (iterator.HasNext()) {
3218		mount = iterator.Next();
3219		kprintf("%p%4" B_PRIdDEV " %p %p %p %s\n", mount, mount->id, mount->root_vnode,
3220			mount->root_vnode->covers, mount->volume->private_volume,
3221			mount->volume->file_system_name);
3222
3223		fs_volume* volume = mount->volume;
3224		while (volume->super_volume != NULL) {
3225			volume = volume->super_volume;
3226			kprintf("                                     %p %s\n",
3227				volume->private_volume, volume->file_system_name);
3228		}
3229	}
3230
3231	return 0;
3232}
3233
3234
3235static int
3236dump_vnode(int argc, char** argv)
3237{
3238	bool printPath = false;
3239	int argi = 1;
3240	if (argc >= 2 && strcmp(argv[argi], "-p") == 0) {
3241		printPath = true;
3242		argi++;
3243	}
3244
3245	if (argi >= argc || argi + 2 < argc || strcmp(argv[argi], "--help") == 0) {
3246		print_debugger_command_usage(argv[0]);
3247		return 0;
3248	}
3249
3250	struct vnode* vnode = NULL;
3251
3252	if (argi + 1 == argc) {
3253		vnode = (struct vnode*)parse_expression(argv[argi]);
3254		if (IS_USER_ADDRESS(vnode)) {
3255			kprintf("invalid vnode address\n");
3256			return 0;
3257		}
3258		_dump_vnode(vnode, printPath);
3259		return 0;
3260	}
3261
3262	dev_t device = parse_expression(argv[argi]);
3263	ino_t id = parse_expression(argv[argi + 1]);
3264
3265	VnodeTable::Iterator iterator(sVnodeTable);
3266	while (iterator.HasNext()) {
3267		vnode = iterator.Next();
3268		if (vnode->id != id || vnode->device != device)
3269			continue;
3270
3271		_dump_vnode(vnode, printPath);
3272	}
3273
3274	return 0;
3275}
3276
3277
3278static int
3279dump_vnodes(int argc, char** argv)
3280{
3281	if (argc != 2 || !strcmp(argv[1], "--help")) {
3282		kprintf("usage: %s [device]\n", argv[0]);
3283		return 0;
3284	}
3285
3286	// restrict dumped nodes to a certain device if requested
3287	dev_t device = parse_expression(argv[1]);
3288
3289	struct vnode* vnode;
3290
3291	kprintf("%-*s   dev     inode  ref %-*s   %-*s   %-*s   flags\n",
3292		B_PRINTF_POINTER_WIDTH, "address", B_PRINTF_POINTER_WIDTH, "cache",
3293		B_PRINTF_POINTER_WIDTH, "fs-node", B_PRINTF_POINTER_WIDTH, "locking");
3294
3295	VnodeTable::Iterator iterator(sVnodeTable);
3296	while (iterator.HasNext()) {
3297		vnode = iterator.Next();
3298		if (vnode->device != device)
3299			continue;
3300
3301		kprintf("%p%4" B_PRIdDEV "%10" B_PRIdINO "%5" B_PRId32 " %p %p %p %s%s%s\n",
3302			vnode, vnode->device, vnode->id, vnode->ref_count, vnode->cache,
3303			vnode->private_node, vnode->advisory_locking,
3304			vnode->IsRemoved() ? "r" : "-", vnode->IsBusy() ? "b" : "-",
3305			vnode->IsUnpublished() ? "u" : "-");
3306	}
3307
3308	return 0;
3309}
3310
3311
3312static int
3313dump_vnode_caches(int argc, char** argv)
3314{
3315	struct vnode* vnode;
3316
	if (argc > 2 || (argc == 2 && strcmp(argv[1], "--help") == 0)) {
3318		kprintf("usage: %s [device]\n", argv[0]);
3319		return 0;
3320	}
3321
3322	// restrict dumped nodes to a certain device if requested
3323	dev_t device = -1;
3324	if (argc > 1)
3325		device = parse_expression(argv[1]);
3326
3327	kprintf("%-*s   dev     inode %-*s       size   pages\n",
3328		B_PRINTF_POINTER_WIDTH, "address", B_PRINTF_POINTER_WIDTH, "cache");
3329
3330	VnodeTable::Iterator iterator(sVnodeTable);
3331	while (iterator.HasNext()) {
3332		vnode = iterator.Next();
3333		if (vnode->cache == NULL)
3334			continue;
3335		if (device != -1 && vnode->device != device)
3336			continue;
3337
3338		kprintf("%p%4" B_PRIdDEV "%10" B_PRIdINO " %p %8" B_PRIdOFF "%8" B_PRId32 "\n",
3339			vnode, vnode->device, vnode->id, vnode->cache,
3340			(vnode->cache->virtual_end + B_PAGE_SIZE - 1) / B_PAGE_SIZE,
3341			vnode->cache->page_count);
3342	}
3343
3344	return 0;
3345}
3346
3347
3348int
3349dump_io_context(int argc, char** argv)
3350{
	if (argc > 2 || (argc == 2 && strcmp(argv[1], "--help") == 0)) {
3352		kprintf("usage: %s [team-id|address]\n", argv[0]);
3353		return 0;
3354	}
3355
3356	struct io_context* context = NULL;
3357
3358	if (argc > 1) {
3359		ulong num = parse_expression(argv[1]);
3360		if (IS_KERNEL_ADDRESS(num))
3361			context = (struct io_context*)num;
3362		else {
3363			Team* team = team_get_team_struct_locked(num);
3364			if (team == NULL) {
3365				kprintf("could not find team with ID %lu\n", num);
3366				return 0;
3367			}
3368			context = (struct io_context*)team->io_context;
3369		}
3370	} else
3371		context = get_current_io_context(true);
3372
3373	kprintf("I/O CONTEXT: %p\n", context);
3374	kprintf(" root vnode:\t%p\n", context->root);
3375	kprintf(" cwd vnode:\t%p\n", context->cwd);
3376	kprintf(" used fds:\t%" B_PRIu32 "\n", context->num_used_fds);
3377	kprintf(" max fds:\t%" B_PRIu32 "\n", context->table_size);
3378
3379	if (context->num_used_fds) {
3380		kprintf("   no.    %*s  ref  open  mode         pos    %*s\n",
3381			B_PRINTF_POINTER_WIDTH, "ops", B_PRINTF_POINTER_WIDTH, "cookie");
3382	}
3383
3384	for (uint32 i = 0; i < context->table_size; i++) {
3385		struct file_descriptor* fd = context->fds[i];
3386		if (fd == NULL)
3387			continue;
3388
3389		kprintf("  %3" B_PRIu32 ":  %p  %3" B_PRId32 "  %4"
3390			B_PRIu32 "  %4" B_PRIx32 "  %10" B_PRIdOFF "  %p  %s %p\n", i,
3391			fd->ops, fd->ref_count, fd->open_count, fd->open_mode,
3392			fd->pos, fd->cookie,
3393			(fd_vnode(fd) != NULL) ? "vnode" : "mount",
3394			fd->u.vnode);
3395	}
3396
3397	kprintf(" used monitors:\t%" B_PRIu32 "\n", context->num_monitors);
3398	kprintf(" max monitors:\t%" B_PRIu32 "\n", context->max_monitors);
3399
3400	set_debug_variable("_cwd", (addr_t)context->cwd);
3401
3402	return 0;
3403}
3404
3405
3406int
3407dump_vnode_usage(int argc, char** argv)
3408{
3409	if (argc != 1) {
3410		kprintf("usage: %s\n", argv[0]);
3411		return 0;
3412	}
3413
3414	kprintf("Unused vnodes: %" B_PRIu32 " (max unused %" B_PRIu32 ")\n",
3415		sUnusedVnodes, kMaxUnusedVnodes);
3416
3417	uint32 count = sVnodeTable->CountElements();
3418
3419	kprintf("%" B_PRIu32 " vnodes total (%" B_PRIu32 " in use).\n", count,
3420		count - sUnusedVnodes);
3421	return 0;
3422}
3423
3424#endif	// ADD_DEBUGGER_COMMANDS
3425
3426
3427/*!	Clears memory specified by an iovec array.
3428*/
3429static void
3430zero_iovecs(const iovec* vecs, size_t vecCount, size_t bytes)
3431{
3432	for (size_t i = 0; i < vecCount && bytes > 0; i++) {
3433		size_t length = std::min(vecs[i].iov_len, bytes);
3434		memset(vecs[i].iov_base, 0, length);
3435		bytes -= length;
3436	}
3437}
3438
3439
3440/*!	Does the dirty work of combining the file_io_vecs with the iovecs
3441	and calls the file system hooks to read/write the request to disk.
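
	A sketch with hypothetical numbers - a 16K request spread over two 8K
	iovecs, backed by three on-disk extents:

	\code
	fileVecs = { { offset: 1024K, length: 4K },
	             { offset: 2048K, length: 8K },
	             { offset: -1,    length: 4K } }  // sparse: read as zeros
	\endcode

	The iovecs are split at the extent boundaries, and one or more
	read_pages() or write_pages() calls are issued per extent; a sparse
	extent (offset -1) is zeroed instead of being read.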
3442*/
3443static status_t
3444common_file_io_vec_pages(struct vnode* vnode, void* cookie,
3445	const file_io_vec* fileVecs, size_t fileVecCount, const iovec* vecs,
3446	size_t vecCount, uint32* _vecIndex, size_t* _vecOffset, size_t* _numBytes,
3447	bool doWrite)
3448{
3449	if (fileVecCount == 0) {
3450		// There are no file vecs at this offset, so we're obviously trying
3451		// to access the file outside of its bounds
3452		return B_BAD_VALUE;
3453	}
3454
3455	size_t numBytes = *_numBytes;
3456	uint32 fileVecIndex;
3457	size_t vecOffset = *_vecOffset;
3458	uint32 vecIndex = *_vecIndex;
3459	status_t status;
3460	size_t size;
3461
3462	if (!doWrite && vecOffset == 0) {
		// the first file_io_vec can be read directly from the device
		// TODO: we could also write directly
3466
3467		if (fileVecs[0].length < (off_t)numBytes)
3468			size = fileVecs[0].length;
3469		else
3470			size = numBytes;
3471
3472		if (fileVecs[0].offset >= 0) {
3473			status = FS_CALL(vnode, read_pages, cookie, fileVecs[0].offset,
3474				&vecs[vecIndex], vecCount - vecIndex, &size);
3475		} else {
3476			// sparse read
3477			zero_iovecs(&vecs[vecIndex], vecCount - vecIndex, size);
3478			status = B_OK;
3479		}
3480		if (status != B_OK)
3481			return status;
3482
3483		ASSERT((off_t)size <= fileVecs[0].length);
3484
3485		// If the file portion was contiguous, we're already done now
3486		if (size == numBytes)
3487			return B_OK;
3488
3489		// if we reached the end of the file, we can return as well
3490		if ((off_t)size != fileVecs[0].length) {
3491			*_numBytes = size;
3492			return B_OK;
3493		}
3494
3495		fileVecIndex = 1;
3496
3497		// first, find out where we have to continue in our iovecs
3498		for (; vecIndex < vecCount; vecIndex++) {
3499			if (size < vecs[vecIndex].iov_len)
3500				break;
3501
3502			size -= vecs[vecIndex].iov_len;
3503		}
3504
3505		vecOffset = size;
3506	} else {
3507		fileVecIndex = 0;
3508		size = 0;
3509	}
3510
3511	// Too bad, let's process the rest of the file_io_vecs
3512
3513	size_t totalSize = size;
3514	size_t bytesLeft = numBytes - size;
3515
3516	for (; fileVecIndex < fileVecCount; fileVecIndex++) {
3517		const file_io_vec &fileVec = fileVecs[fileVecIndex];
3518		off_t fileOffset = fileVec.offset;
3519		off_t fileLeft = min_c(fileVec.length, (off_t)bytesLeft);
3520
3521		TRACE(("FILE VEC [%" B_PRIu32 "] length %" B_PRIdOFF "\n", fileVecIndex,
3522			fileLeft));
3523
3524		// process the complete fileVec
3525		while (fileLeft > 0) {
3526			iovec tempVecs[MAX_TEMP_IO_VECS];
3527			uint32 tempCount = 0;
3528
3529			// size tracks how much of what is left of the current fileVec
3530			// (fileLeft) has been assigned to tempVecs
3531			size = 0;
3532
3533			// assign what is left of the current fileVec to the tempVecs
3534			for (size = 0; (off_t)size < fileLeft && vecIndex < vecCount
3535					&& tempCount < MAX_TEMP_IO_VECS;) {
3536				// try to satisfy one iovec per iteration (or as much as
3537				// possible)
3538
3539				// bytes left of the current iovec
3540				size_t vecLeft = vecs[vecIndex].iov_len - vecOffset;
3541				if (vecLeft == 0) {
3542					vecOffset = 0;
3543					vecIndex++;
3544					continue;
3545				}
3546
3547				TRACE(("fill vec %" B_PRIu32 ", offset = %lu, size = %lu\n",
3548					vecIndex, vecOffset, size));
3549
3550				// actually available bytes
3551				size_t tempVecSize = min_c(vecLeft, fileLeft - size);
3552
3553				tempVecs[tempCount].iov_base
3554					= (void*)((addr_t)vecs[vecIndex].iov_base + vecOffset);
3555				tempVecs[tempCount].iov_len = tempVecSize;
3556				tempCount++;
3557
3558				size += tempVecSize;
3559				vecOffset += tempVecSize;
3560			}
3561
3562			size_t bytes = size;
3563
3564			if (fileOffset == -1) {
3565				if (doWrite) {
3566					panic("sparse write attempt: vnode %p", vnode);
3567					status = B_IO_ERROR;
3568				} else {
3569					// sparse read
3570					zero_iovecs(tempVecs, tempCount, bytes);
3571					status = B_OK;
3572				}
3573			} else if (doWrite) {
3574				status = FS_CALL(vnode, write_pages, cookie, fileOffset,
3575					tempVecs, tempCount, &bytes);
3576			} else {
3577				status = FS_CALL(vnode, read_pages, cookie, fileOffset,
3578					tempVecs, tempCount, &bytes);
3579			}
3580			if (status != B_OK)
3581				return status;
3582
3583			totalSize += bytes;
3584			bytesLeft -= size;
3585			if (fileOffset >= 0)
3586				fileOffset += size;
3587			fileLeft -= size;
3588			//dprintf("-> file left = %Lu\n", fileLeft);
3589
3590			if (size != bytes || vecIndex >= vecCount) {
3591				// there are no more bytes or iovecs, let's bail out
3592				*_numBytes = totalSize;
3593				return B_OK;
3594			}
3595		}
3596	}
3597
3598	*_vecIndex = vecIndex;
3599	*_vecOffset = vecOffset;
3600	*_numBytes = totalSize;
3601	return B_OK;
3602}
3603
3604
3605static bool
3606is_user_in_group(gid_t gid)
3607{
3608	if (gid == getegid())
3609		return true;
3610
3611	gid_t groups[NGROUPS_MAX];
3612	int groupCount = getgroups(NGROUPS_MAX, groups);
3613	for (int i = 0; i < groupCount; i++) {
3614		if (gid == groups[i])
3615			return true;
3616	}
3617
3618	return false;
3619}
3620
3621
3622static status_t
3623free_io_context(io_context* context)
3624{
3625	uint32 i;
3626
3627	TIOC(FreeIOContext(context));
3628
3629	if (context->root)
3630		put_vnode(context->root);
3631
3632	if (context->cwd)
3633		put_vnode(context->cwd);
3634
3635	mutex_lock(&context->io_mutex);
3636
3637	for (i = 0; i < context->table_size; i++) {
3638		if (struct file_descriptor* descriptor = context->fds[i]) {
3639			close_fd(context, descriptor);
3640			put_fd(descriptor);
3641		}
3642	}
3643
3644	mutex_destroy(&context->io_mutex);
3645
3646	remove_node_monitors(context);
3647	free(context->fds);
3648	free(context);
3649
3650	return B_OK;
3651}
3652
3653
3654static status_t
3655resize_monitor_table(struct io_context* context, const int newSize)
3656{
	status_t status = B_OK;
3658
3659	if (newSize <= 0 || newSize > MAX_NODE_MONITORS)
3660		return B_BAD_VALUE;
3661
3662	mutex_lock(&context->io_mutex);
3663
3664	if ((size_t)newSize < context->num_monitors) {
3665		status = B_BUSY;
3666		goto out;
3667	}
3668	context->max_monitors = newSize;
3669
3670out:
3671	mutex_unlock(&context->io_mutex);
3672	return status;
3673}
3674
3675
3676//	#pragma mark - public API for file systems
3677
3678
3679extern "C" status_t
3680new_vnode(fs_volume* volume, ino_t vnodeID, void* privateNode,
3681	fs_vnode_ops* ops)
3682{
3683	FUNCTION(("new_vnode(volume = %p (%" B_PRId32 "), vnodeID = %" B_PRId64
3684		", node = %p)\n", volume, volume->id, vnodeID, privateNode));
3685
3686	if (privateNode == NULL)
3687		return B_BAD_VALUE;
3688
3689	int32 tries = BUSY_VNODE_RETRIES;
3690restart:
3691	// create the node
3692	bool nodeCreated;
3693	struct vnode* vnode;
3694	status_t status = create_new_vnode_and_lock(volume->id, vnodeID, vnode,
3695		nodeCreated);
3696	if (status != B_OK)
3697		return status;
3698
3699	WriteLocker nodeLocker(sVnodeLock, true);
3700		// create_new_vnode_and_lock() has locked for us
3701
3702	if (!nodeCreated && vnode->IsBusy()) {
3703		nodeLocker.Unlock();
3704		if (!retry_busy_vnode(tries, volume->id, vnodeID))
3705			return B_BUSY;
3706		goto restart;
3707	}
3708
3709	// file system integrity check:
3710	// test if the vnode already exists and bail out if this is the case!
3711	if (!nodeCreated) {
3712		panic("vnode %" B_PRIdDEV ":%" B_PRIdINO " already exists (node = %p, "
3713			"vnode->node = %p)!", volume->id, vnodeID, privateNode,
3714			vnode->private_node);
3715		return B_ERROR;
3716	}
3717
3718	vnode->private_node = privateNode;
3719	vnode->ops = ops;
3720	vnode->SetUnpublished(true);
3721
3722	TRACE(("returns: %s\n", strerror(status)));
3723
3724	return status;
3725}
3726
3727
3728extern "C" status_t
3729publish_vnode(fs_volume* volume, ino_t vnodeID, void* privateNode,
3730	fs_vnode_ops* ops, int type, uint32 flags)
3731{
3732	FUNCTION(("publish_vnode()\n"));
3733
3734	int32 tries = BUSY_VNODE_RETRIES;
3735restart:
3736	WriteLocker locker(sVnodeLock);
3737
3738	struct vnode* vnode = lookup_vnode(volume->id, vnodeID);
3739
3740	bool nodeCreated = false;
3741	if (vnode == NULL) {
3742		if (privateNode == NULL)
3743			return B_BAD_VALUE;
3744
3745		// create the node
3746		locker.Unlock();
3747			// create_new_vnode_and_lock() will re-lock for us on success
3748		status_t status = create_new_vnode_and_lock(volume->id, vnodeID, vnode,
3749			nodeCreated);
3750		if (status != B_OK)
3751			return status;
3752
3753		locker.SetTo(sVnodeLock, true);
3754	}
3755
3756	if (nodeCreated) {
3757		vnode->private_node = privateNode;
3758		vnode->ops = ops;
3759		vnode->SetUnpublished(true);
3760	} else if (vnode->IsBusy() && vnode->IsUnpublished()
3761		&& vnode->private_node == privateNode && vnode->ops == ops) {
3762		// already known, but not published
3763	} else if (vnode->IsBusy()) {
3764		locker.Unlock();
3765		if (!retry_busy_vnode(tries, volume->id, vnodeID))
3766			return B_BUSY;
3767		goto restart;
3768	} else
3769		return B_BAD_VALUE;
3770
3771	bool publishSpecialSubNode = false;
3772
3773	vnode->SetType(type);
3774	vnode->SetRemoved((flags & B_VNODE_PUBLISH_REMOVED) != 0);
3775	publishSpecialSubNode = is_special_node_type(type)
3776		&& (flags & B_VNODE_DONT_CREATE_SPECIAL_SUB_NODE) == 0;
3777
3778	status_t status = B_OK;
3779
3780	// create sub vnodes, if necessary
3781	if (volume->sub_volume != NULL || publishSpecialSubNode) {
3782		locker.Unlock();
3783
3784		fs_volume* subVolume = volume;
3785		if (volume->sub_volume != NULL) {
3786			while (status == B_OK && subVolume->sub_volume != NULL) {
3787				subVolume = subVolume->sub_volume;
3788				status = subVolume->ops->create_sub_vnode(subVolume, vnodeID,
3789					vnode);
3790			}
3791		}
3792
3793		if (status == B_OK && publishSpecialSubNode)
3794			status = create_special_sub_node(vnode, flags);
3795
3796		if (status != B_OK) {
3797			// error -- clean up the created sub vnodes
3798			while (subVolume->super_volume != volume) {
3799				subVolume = subVolume->super_volume;
3800				subVolume->ops->delete_sub_vnode(subVolume, vnode);
3801			}
3802		}
3803
3804		if (status == B_OK) {
3805			ReadLocker vnodesReadLocker(sVnodeLock);
3806			AutoLocker<Vnode> nodeLocker(vnode);
3807			vnode->SetBusy(false);
3808			vnode->SetUnpublished(false);
3809		} else {
3810			locker.Lock();
3811			sVnodeTable->Remove(vnode);
3812			remove_vnode_from_mount_list(vnode, vnode->mount);
3813			object_cache_free(sVnodeCache, vnode, 0);
3814		}
3815	} else {
3816		// we still hold the write lock -- mark the node unbusy and published
3817		vnode->SetBusy(false);
3818		vnode->SetUnpublished(false);
3819	}
3820
3821	TRACE(("returns: %s\n", strerror(status)));
3822
3823	return status;
3824}
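

// Example (illustrative sketch only, not compiled): a file system hook that
// creates a node would typically pair new_vnode() with publish_vnode() as
// below; `myfs_inode` and `gMyFSVnodeOps` are hypothetical names.
#if 0
static status_t
myfs_publish_new_node(fs_volume* volume, myfs_inode* inode)
{
	// register the node with the VFS -- it starts out busy and unpublished
	status_t status = new_vnode(volume, inode->id, inode, &gMyFSVnodeOps);
	if (status != B_OK)
		return status;

	// make it visible; on success this also clears the busy flag
	return publish_vnode(volume, inode->id, inode, &gMyFSVnodeOps,
		inode->mode & S_IFMT, 0);
}
#endif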
3825
3826
3827extern "C" status_t
3828get_vnode(fs_volume* volume, ino_t vnodeID, void** _privateNode)
3829{
3830	struct vnode* vnode;
3831
3832	if (volume == NULL)
3833		return B_BAD_VALUE;
3834
3835	status_t status = get_vnode(volume->id, vnodeID, &vnode, true, true);
3836	if (status != B_OK)
3837		return status;
3838
3839	// If this is a layered FS, we need to get the node cookie for the requested
3840	// layer.
3841	if (HAS_FS_CALL(vnode, get_super_vnode)) {
3842		fs_vnode resolvedNode;
3843		status_t status = FS_CALL(vnode, get_super_vnode, volume,
3844			&resolvedNode);
3845		if (status != B_OK) {
3846			panic("get_vnode(): Failed to get super node for vnode %p, "
3847				"volume: %p", vnode, volume);
3848			put_vnode(vnode);
3849			return status;
3850		}
3851
3852		if (_privateNode != NULL)
3853			*_privateNode = resolvedNode.private_node;
3854	} else if (_privateNode != NULL)
3855		*_privateNode = vnode->private_node;
3856
3857	return B_OK;
3858}
3859
3860
3861extern "C" status_t
3862acquire_vnode(fs_volume* volume, ino_t vnodeID)
3863{
3864	ReadLocker nodeLocker(sVnodeLock);
3865
3866	struct vnode* vnode = lookup_vnode(volume->id, vnodeID);
3867	if (vnode == NULL)
3868		return B_BAD_VALUE;
3869
3870	inc_vnode_ref_count(vnode);
3871	return B_OK;
3872}
3873
3874
3875extern "C" status_t
3876put_vnode(fs_volume* volume, ino_t vnodeID)
3877{
3878	struct vnode* vnode;
3879
3880	rw_lock_read_lock(&sVnodeLock);
3881	vnode = lookup_vnode(volume->id, vnodeID);
3882	rw_lock_read_unlock(&sVnodeLock);
3883
3884	if (vnode == NULL)
3885		return B_BAD_VALUE;
3886
3887	dec_vnode_ref_count(vnode, false, true);
3888	return B_OK;
3889}
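

// Example (illustrative sketch only, not compiled): temporarily referencing
// another node of the same volume; `myfs_inode` and `parentID` are
// hypothetical.
#if 0
myfs_inode* parent;
status_t status = get_vnode(volume, parentID, (void**)&parent);
if (status == B_OK) {
	// ... use parent ...
	put_vnode(volume, parentID);
		// releases the reference acquired by get_vnode()
}
#endif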
3890
3891
3892extern "C" status_t
3893remove_vnode(fs_volume* volume, ino_t vnodeID)
3894{
3895	ReadLocker locker(sVnodeLock);
3896
3897	struct vnode* vnode = lookup_vnode(volume->id, vnodeID);
3898	if (vnode == NULL)
3899		return B_ENTRY_NOT_FOUND;
3900
3901	if (vnode->covered_by != NULL || vnode->covers != NULL) {
3902		// this vnode is in use
3903		return B_BUSY;
3904	}
3905
3906	vnode->Lock();
3907
3908	vnode->SetRemoved(true);
3909	bool removeUnpublished = false;
3910
3911	if (vnode->IsUnpublished()) {
3912		// prepare the vnode for deletion
3913		removeUnpublished = true;
3914		vnode->SetBusy(true);
3915	}
3916
3917	vnode->Unlock();
3918	locker.Unlock();
3919
3920	if (removeUnpublished) {
3921		// If the vnode hasn't been published yet, we delete it here
3922		atomic_add(&vnode->ref_count, -1);
3923		free_vnode(vnode, true);
3924	}
3925
3926	return B_OK;
3927}
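

// Example (illustrative sketch only, not compiled): an unlink hook marking a
// node for deletion once its last reference is gone, and rolling the mark
// back on a (hypothetical) error.
#if 0
status_t status = remove_vnode(volume, nodeID);
if (status == B_OK && updatingTheDirectoryFailed) {
	// keep the node after all
	unremove_vnode(volume, nodeID);
}
#endif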
3928
3929
3930extern "C" status_t
3931unremove_vnode(fs_volume* volume, ino_t vnodeID)
3932{
3933	struct vnode* vnode;
3934
3935	rw_lock_read_lock(&sVnodeLock);
3936
3937	vnode = lookup_vnode(volume->id, vnodeID);
3938	if (vnode) {
3939		AutoLocker<Vnode> nodeLocker(vnode);
3940		vnode->SetRemoved(false);
3941	}
3942
3943	rw_lock_read_unlock(&sVnodeLock);
3944	return B_OK;
3945}
3946
3947
3948extern "C" status_t
3949get_vnode_removed(fs_volume* volume, ino_t vnodeID, bool* _removed)
3950{
3951	ReadLocker _(sVnodeLock);
3952
3953	if (struct vnode* vnode = lookup_vnode(volume->id, vnodeID)) {
3954		if (_removed != NULL)
3955			*_removed = vnode->IsRemoved();
3956		return B_OK;
3957	}
3958
3959	return B_BAD_VALUE;
3960}
3961
3962
3963extern "C" fs_volume*
3964volume_for_vnode(fs_vnode* _vnode)
3965{
3966	if (_vnode == NULL)
3967		return NULL;
3968
3969	struct vnode* vnode = static_cast<struct vnode*>(_vnode);
3970	return vnode->mount->volume;
3971}
3972
3973
3974extern "C" status_t
3975check_access_permissions(int accessMode, mode_t mode, gid_t nodeGroupID,
3976	uid_t nodeUserID)
3977{
3978	// get node permissions
3979	int userPermissions = (mode & S_IRWXU) >> 6;
3980	int groupPermissions = (mode & S_IRWXG) >> 3;
3981	int otherPermissions = mode & S_IRWXO;
3982
3983	// get the node permissions for this uid/gid
3984	int permissions = 0;
3985	uid_t uid = geteuid();
3986
3987	if (uid == 0) {
3988		// user is root
		// root always has read/write permission, but at least one of the
		// X bits must be set for execute permission
3991		permissions = userPermissions | groupPermissions | otherPermissions
3992			| S_IROTH | S_IWOTH;
3993		if (S_ISDIR(mode))
3994			permissions |= S_IXOTH;
3995	} else if (uid == nodeUserID) {
3996		// user is node owner
3997		permissions = userPermissions;
3998	} else if (is_user_in_group(nodeGroupID)) {
3999		// user is in owning group
4000		permissions = groupPermissions;
4001	} else {
4002		// user is one of the others
4003		permissions = otherPermissions;
4004	}
4005
4006	return (accessMode & ~permissions) == 0 ? B_OK : B_PERMISSION_DENIED;
4007}
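

// Worked example: for a node with mode 0640 owned by the caller (euid ==
// nodeUserID), the effective permissions are (0640 & S_IRWXU) >> 6 == 6.
// A request for R_OK | W_OK (== 6) succeeds, since (6 & ~6) == 0, while
// X_OK (== 1) yields B_PERMISSION_DENIED, since (1 & ~6) != 0.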
4008
4009
4010#if 0
4011extern "C" status_t
4012read_pages(int fd, off_t pos, const iovec* vecs, size_t count,
4013	size_t* _numBytes)
4014{
4015	struct file_descriptor* descriptor;
4016	struct vnode* vnode;
4017
4018	descriptor = get_fd_and_vnode(fd, &vnode, true);
4019	if (descriptor == NULL)
4020		return B_FILE_ERROR;
4021
4022	status_t status = vfs_read_pages(vnode, descriptor->cookie, pos, vecs,
4023		count, 0, _numBytes);
4024
4025	put_fd(descriptor);
4026	return status;
4027}
4028
4029
4030extern "C" status_t
4031write_pages(int fd, off_t pos, const iovec* vecs, size_t count,
4032	size_t* _numBytes)
4033{
4034	struct file_descriptor* descriptor;
4035	struct vnode* vnode;
4036
4037	descriptor = get_fd_and_vnode(fd, &vnode, true);
4038	if (descriptor == NULL)
4039		return B_FILE_ERROR;
4040
4041	status_t status = vfs_write_pages(vnode, descriptor->cookie, pos, vecs,
4042		count, 0, _numBytes);
4043
4044	put_fd(descriptor);
4045	return status;
4046}
4047#endif
4048
4049
4050extern "C" status_t
4051read_file_io_vec_pages(int fd, const file_io_vec* fileVecs, size_t fileVecCount,
4052	const iovec* vecs, size_t vecCount, uint32* _vecIndex, size_t* _vecOffset,
4053	size_t* _bytes)
4054{
4055	struct vnode* vnode;
4056	FileDescriptorPutter descriptor(get_fd_and_vnode(fd, &vnode, true));
4057	if (!descriptor.IsSet())
4058		return B_FILE_ERROR;
4059
4060	status_t status = common_file_io_vec_pages(vnode, descriptor->cookie,
4061		fileVecs, fileVecCount, vecs, vecCount, _vecIndex, _vecOffset, _bytes,
4062		false);
4063
4064	return status;
4065}
4066
4067
4068extern "C" status_t
4069write_file_io_vec_pages(int fd, const file_io_vec* fileVecs, size_t fileVecCount,
4070	const iovec* vecs, size_t vecCount, uint32* _vecIndex, size_t* _vecOffset,
4071	size_t* _bytes)
4072{
4073	struct vnode* vnode;
4074	FileDescriptorPutter descriptor(get_fd_and_vnode(fd, &vnode, true));
4075	if (!descriptor.IsSet())
4076		return B_FILE_ERROR;
4077
4078	status_t status = common_file_io_vec_pages(vnode, descriptor->cookie,
4079		fileVecs, fileVecCount, vecs, vecCount, _vecIndex, _vecOffset, _bytes,
4080		true);
4081
4082	return status;
4083}
4084
4085
4086extern "C" status_t
4087entry_cache_add(dev_t mountID, ino_t dirID, const char* name, ino_t nodeID)
4088{
4089	// lookup mount -- the caller is required to make sure that the mount
4090	// won't go away
4091	ReadLocker locker(sMountLock);
4092	struct fs_mount* mount = find_mount(mountID);
4093	if (mount == NULL)
4094		return B_BAD_VALUE;
4095	locker.Unlock();
4096
4097	return mount->entry_cache.Add(dirID, name, nodeID, false);
4098}
4099
4100
4101extern "C" status_t
4102entry_cache_add_missing(dev_t mountID, ino_t dirID, const char* name)
4103{
4104	// lookup mount -- the caller is required to make sure that the mount
4105	// won't go away
4106	ReadLocker locker(sMountLock);
4107	struct fs_mount* mount = find_mount(mountID);
4108	if (mount == NULL)
4109		return B_BAD_VALUE;
4110	locker.Unlock();
4111
4112	return mount->entry_cache.Add(dirID, name, -1, true);
4113}
4114
4115
4116extern "C" status_t
4117entry_cache_remove(dev_t mountID, ino_t dirID, const char* name)
4118{
4119	// lookup mount -- the caller is required to make sure that the mount
4120	// won't go away
4121	ReadLocker locker(sMountLock);
4122	struct fs_mount* mount = find_mount(mountID);
4123	if (mount == NULL)
4124		return B_BAD_VALUE;
4125	locker.Unlock();
4126
4127	return mount->entry_cache.Remove(dirID, name);
4128}
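

// Example (illustrative sketch only, not compiled): how a file system might
// keep the entry cache coherent; `dirID`, `name` and `newNodeID` are
// hypothetical.
#if 0
// after successfully creating "name" in directory dirID:
entry_cache_add(volume->id, dirID, name, newNodeID);

// after a lookup of "name" failed with B_ENTRY_NOT_FOUND:
entry_cache_add_missing(volume->id, dirID, name);

// after removing the entry "name":
entry_cache_remove(volume->id, dirID, name);
#endif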
4129
4130
4131//	#pragma mark - private VFS API
4132//	Functions the VFS exports for other parts of the kernel
4133
4134
4135/*! Acquires another reference to the vnode that has to be released
4136	by calling vfs_put_vnode().
4137*/
4138void
4139vfs_acquire_vnode(struct vnode* vnode)
4140{
4141	inc_vnode_ref_count(vnode);
4142}
4143
4144
4145/*! This is currently called from file_cache_create() only.
4146	It's probably a temporary solution as long as devfs requires that
4147	fs_read_pages()/fs_write_pages() are called with the standard
4148	open cookie and not with a device cookie.
4149	If that's done differently, remove this call; it has no other
4150	purpose.
4151*/
4152extern "C" status_t
4153vfs_get_cookie_from_fd(int fd, void** _cookie)
4154{
4155	struct file_descriptor* descriptor;
4156
4157	descriptor = get_fd(get_current_io_context(true), fd);
4158	if (descriptor == NULL)
4159		return B_FILE_ERROR;
4160
4161	*_cookie = descriptor->cookie;
4162	return B_OK;
4163}
4164
4165
4166extern "C" status_t
4167vfs_get_vnode_from_fd(int fd, bool kernel, struct vnode** vnode)
4168{
4169	*vnode = get_vnode_from_fd(fd, kernel);
4170
4171	if (*vnode == NULL)
4172		return B_FILE_ERROR;
4173
	return B_OK;
4175}
4176
4177
4178extern "C" status_t
4179vfs_get_vnode_from_path(const char* path, bool kernel, struct vnode** _vnode)
4180{
4181	TRACE(("vfs_get_vnode_from_path: entry. path = '%s', kernel %d\n",
4182		path, kernel));
4183
4184	KPath pathBuffer;
4185	if (pathBuffer.InitCheck() != B_OK)
4186		return B_NO_MEMORY;
4187
4188	char* buffer = pathBuffer.LockBuffer();
4189	strlcpy(buffer, path, pathBuffer.BufferSize());
4190
4191	VnodePutter vnode;
4192	status_t status = path_to_vnode(buffer, true, vnode, NULL, kernel);
4193	if (status != B_OK)
4194		return status;
4195
4196	*_vnode = vnode.Detach();
4197	return B_OK;
4198}
4199
4200
4201extern "C" status_t
4202vfs_get_vnode(dev_t mountID, ino_t vnodeID, bool canWait, struct vnode** _vnode)
4203{
4204	struct vnode* vnode = NULL;
4205
4206	status_t status = get_vnode(mountID, vnodeID, &vnode, canWait, false);
4207	if (status != B_OK)
4208		return status;
4209
4210	*_vnode = vnode;
4211	return B_OK;
4212}
4213
4214
4215extern "C" status_t
4216vfs_entry_ref_to_vnode(dev_t mountID, ino_t directoryID,
4217	const char* name, struct vnode** _vnode)
4218{
4219	VnodePutter vnode;
4220	status_t status = entry_ref_to_vnode(mountID, directoryID, name, false, true, vnode);
4221	*_vnode = vnode.Detach();
4222	return status;
4223}
4224
4225
4226extern "C" void
4227vfs_vnode_to_node_ref(struct vnode* vnode, dev_t* _mountID, ino_t* _vnodeID)
4228{
4229	*_mountID = vnode->device;
4230	*_vnodeID = vnode->id;
4231}
4232
4233
4234/*!
4235	Helper function abstracting the process of "converting" a given
4236	vnode-pointer to a fs_vnode-pointer.
4237	Currently only used in bindfs.
4238*/
4239extern "C" fs_vnode*
4240vfs_fsnode_for_vnode(struct vnode* vnode)
4241{
4242	return vnode;
4243}
4244
4245
4246/*!
4247	Calls fs_open() on the given vnode and returns a new
4248	file descriptor for it
4249*/
4250int
4251vfs_open_vnode(struct vnode* vnode, int openMode, bool kernel)
4252{
4253	return open_vnode(vnode, openMode, kernel);
4254}
4255
4256
4257/*!	Looks up a vnode with the given mount and vnode ID.
4258	Must only be used with "in-use" vnodes as it doesn't grab a reference
4259	to the node.
	It's currently only used by file_cache_create().
4261*/
4262extern "C" status_t
4263vfs_lookup_vnode(dev_t mountID, ino_t vnodeID, struct vnode** _vnode)
4264{
4265	rw_lock_read_lock(&sVnodeLock);
4266	struct vnode* vnode = lookup_vnode(mountID, vnodeID);
4267	rw_lock_read_unlock(&sVnodeLock);
4268
4269	if (vnode == NULL)
4270		return B_ERROR;
4271
4272	*_vnode = vnode;
4273	return B_OK;
4274}
4275
4276
4277extern "C" status_t
4278vfs_get_fs_node_from_path(fs_volume* volume, const char* path,
4279	bool traverseLeafLink, bool kernel, void** _node)
4280{
4281	TRACE(("vfs_get_fs_node_from_path(volume = %p, path = \"%s\", kernel %d)\n",
4282		volume, path, kernel));
4283
4284	KPath pathBuffer;
4285	if (pathBuffer.InitCheck() != B_OK)
4286		return B_NO_MEMORY;
4287
4288	fs_mount* mount;
4289	status_t status = get_mount(volume->id, &mount);
4290	if (status != B_OK)
4291		return status;
4292
4293	char* buffer = pathBuffer.LockBuffer();
4294	strlcpy(buffer, path, pathBuffer.BufferSize());
4295
4296	VnodePutter vnode;
4297
4298	if (buffer[0] == '/')
4299		status = path_to_vnode(buffer, traverseLeafLink, vnode, NULL, kernel);
4300	else {
4301		inc_vnode_ref_count(mount->root_vnode);
4302			// vnode_path_to_vnode() releases a reference to the starting vnode
4303		status = vnode_path_to_vnode(mount->root_vnode, buffer, traverseLeafLink,
4304			kernel, vnode, NULL);
4305	}
4306
4307	put_mount(mount);
4308
4309	if (status != B_OK)
4310		return status;
4311
4312	if (vnode->device != volume->id) {
		// wrong mount ID -- must not gain access to foreign file system nodes
4314		return B_BAD_VALUE;
4315	}
4316
4317	// Use get_vnode() to resolve the cookie for the right layer.
4318	status = get_vnode(volume, vnode->id, _node);
4319
4320	return status;
4321}
4322
4323
4324status_t
4325vfs_read_stat(int fd, const char* path, bool traverseLeafLink,
4326	struct stat* stat, bool kernel)
4327{
4328	status_t status;
4329
4330	if (path != NULL) {
4331		// path given: get the stat of the node referred to by (fd, path)
4332		KPath pathBuffer(path);
4333		if (pathBuffer.InitCheck() != B_OK)
4334			return B_NO_MEMORY;
4335
4336		status = common_path_read_stat(fd, pathBuffer.LockBuffer(),
4337			traverseLeafLink, stat, kernel);
4338	} else {
4339		// no path given: get the FD and use the FD operation
4340		FileDescriptorPutter descriptor
4341			(get_fd(get_current_io_context(kernel), fd));
4342		if (!descriptor.IsSet())
4343			return B_FILE_ERROR;
4344
4345		if (descriptor->ops->fd_read_stat)
4346			status = descriptor->ops->fd_read_stat(descriptor.Get(), stat);
4347		else
4348			status = B_UNSUPPORTED;
4349	}
4350
4351	return status;
4352}
4353
4354
4355/*!	Finds the full path to the file that contains the module \a moduleName,
4356	puts it into \a pathBuffer, and returns B_OK for success.
4357	If \a pathBuffer was too small, it returns \c B_BUFFER_OVERFLOW,
	and \c B_ENTRY_NOT_FOUND if no file could be found.
	\a pathBuffer is clobbered in any case and must not be relied on if this
	function returns unsuccessfully.
4361	\a basePath and \a pathBuffer must not point to the same space.
4362*/
4363status_t
4364vfs_get_module_path(const char* basePath, const char* moduleName,
4365	char* pathBuffer, size_t bufferSize)
4366{
4367	status_t status;
4368	size_t length;
4369	char* path;
4370
4371	if (bufferSize == 0
4372		|| strlcpy(pathBuffer, basePath, bufferSize) >= bufferSize)
4373		return B_BUFFER_OVERFLOW;
4374
4375	VnodePutter dir;
4376	status = path_to_vnode(pathBuffer, true, dir, NULL, true);
4377	if (status != B_OK)
4378		return status;
4379
4380	// the path buffer had been clobbered by the above call
4381	length = strlcpy(pathBuffer, basePath, bufferSize);
4382	if (pathBuffer[length - 1] != '/')
4383		pathBuffer[length++] = '/';
4384
4385	path = pathBuffer + length;
4386	bufferSize -= length;
4387
4388	VnodePutter file;
4389	while (moduleName) {
4390		char* nextPath = strchr(moduleName, '/');
4391		if (nextPath == NULL)
4392			length = strlen(moduleName);
4393		else {
4394			length = nextPath - moduleName;
4395			nextPath++;
4396		}
4397
4398		if (length + 1 >= bufferSize)
4399			return B_BUFFER_OVERFLOW;
4400
4401		memcpy(path, moduleName, length);
4402		path[length] = '\0';
4403		moduleName = nextPath;
4404
4405		// vnode_path_to_vnode() assumes ownership of the passed dir
4406		status = vnode_path_to_vnode(dir.Detach(), path, true, true, file, NULL);
4407		if (status != B_OK)
4408			return status;
4409
4410		if (S_ISDIR(file->Type())) {
			// go to the next directory
4412			path[length] = '/';
4413			path[length + 1] = '\0';
4414			path += length + 1;
4415			bufferSize -= length + 1;
4416
4417			dir.SetTo(file.Detach());
4418		} else if (S_ISREG(file->Type())) {
4419			// it's a file so it should be what we've searched for
4420			return B_OK;
4421		} else {
4422			TRACE(("vfs_get_module_path(): something is strange here: "
4423				"0x%08" B_PRIx32 "...\n", file->Type()));
4424			return B_ERROR;
4425		}
4426	}
4427
4428	// if we got here, the moduleName just pointed to a directory, not to
4429	// a real module - what should we do in this case?
4430	return B_ENTRY_NOT_FOUND;
4431}
4432
4433
4434/*!	\brief Normalizes a given path.
4435
4436	The path must refer to an existing or non-existing entry in an existing
4437	directory, that is chopping off the leaf component the remaining path must
4438	refer to an existing directory.
4439
	The returned path will be canonical in that it will be absolute, will not
	contain any "." or ".." components or duplicate occurrences of '/'s,
	and none of the directory components will be symbolic links.

	Any two paths referring to the same entry will result in the same
4445	normalized path (well, that is pretty much the definition of `normalized',
4446	isn't it :-).
4447
4448	\param path The path to be normalized.
4449	\param buffer The buffer into which the normalized path will be written.
4450		   May be the same one as \a path.
4451	\param bufferSize The size of \a buffer.
4452	\param traverseLink If \c true, the function also resolves leaf symlinks.
4453	\param kernel \c true, if the IO context of the kernel shall be used,
4454		   otherwise that of the team this thread belongs to. Only relevant,
4455		   if the path is relative (to get the CWD).
4456	\return \c B_OK if everything went fine, another error code otherwise.
4457*/
4458status_t
4459vfs_normalize_path(const char* path, char* buffer, size_t bufferSize,
4460	bool traverseLink, bool kernel)
4461{
4462	if (!path || !buffer || bufferSize < 1)
4463		return B_BAD_VALUE;
4464
4465	if (path != buffer) {
4466		if (strlcpy(buffer, path, bufferSize) >= bufferSize)
4467			return B_BUFFER_OVERFLOW;
4468	}
4469
4470	return normalize_path(buffer, bufferSize, traverseLink, kernel);
4471}
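

// Example (illustrative sketch only, not compiled): normalizing a path in
// place; the result assumes no symlinks along the way.
#if 0
char buffer[B_PATH_NAME_LENGTH];
status_t status = vfs_normalize_path("/boot/./system//lib", buffer,
	sizeof(buffer), true, true);
	// on success, buffer contains "/boot/system/lib"
#endif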
4472
4473
4474/*!	\brief Gets the parent of the passed in node.
4475
4476	Gets the parent of the passed in node, and correctly resolves covered
4477	nodes.
4478*/
4479extern "C" status_t
4480vfs_resolve_parent(struct vnode* parent, dev_t* device, ino_t* node)
4481{
4482	return resolve_covered_parent(parent, device, node,
4483		get_current_io_context(true));
4484}
4485
4486
4487/*!	\brief Creates a special node in the file system.
4488
4489	The caller gets a reference to the newly created node (which is passed
4490	back through \a _createdVnode) and is responsible for releasing it.
4491
4492	\param path The path where to create the entry for the node. Can be \c NULL,
4493		in which case the node is created without an entry in the root FS -- it
4494		will automatically be deleted when the last reference has been released.
4495	\param subVnode The definition of the subnode. Can be \c NULL, in which case
4496		the target file system will just create the node with its standard
4497		operations. Depending on the type of the node a subnode might be created
4498		automatically, though.
4499	\param mode The type and permissions for the node to be created.
4500	\param flags Flags to be passed to the creating FS.
4501	\param kernel \c true, if called in the kernel context (relevant only if
4502		\a path is not \c NULL and not absolute).
4503	\param _superVnode Pointer to a pre-allocated structure to be filled by the
4504		file system creating the node, with the private data pointer and
4505		operations for the super node. Can be \c NULL.
	\param _createdVnode Pointer to pre-allocated storage where to store the
4507		pointer to the newly created node.
4508	\return \c B_OK, if everything went fine, another error code otherwise.
4509*/
4510status_t
4511vfs_create_special_node(const char* path, fs_vnode* subVnode, mode_t mode,
4512	uint32 flags, bool kernel, fs_vnode* _superVnode,
4513	struct vnode** _createdVnode)
4514{
4515	VnodePutter dirNode;
4516	char _leaf[B_FILE_NAME_LENGTH];
4517	char* leaf = NULL;
4518
4519	if (path) {
4520		// We've got a path. Get the dir vnode and the leaf name.
4521		KPath tmpPathBuffer;
4522		if (tmpPathBuffer.InitCheck() != B_OK)
4523			return B_NO_MEMORY;
4524
4525		char* tmpPath = tmpPathBuffer.LockBuffer();
4526		if (strlcpy(tmpPath, path, B_PATH_NAME_LENGTH) >= B_PATH_NAME_LENGTH)
4527			return B_NAME_TOO_LONG;
4528
4529		// get the dir vnode and the leaf name
4530		leaf = _leaf;
4531		status_t error = path_to_dir_vnode(tmpPath, dirNode, leaf, kernel);
4532		if (error != B_OK)
4533			return error;
4534	} else {
4535		// No path. Create the node in the root FS.
4536		dirNode.SetTo(sRoot);
4537		inc_vnode_ref_count(dirNode.Get());
4538	}
4539
4540	// check support for creating special nodes
4541	if (!HAS_FS_CALL(dirNode, create_special_node))
4542		return B_UNSUPPORTED;
4543
4544	// create the node
4545	fs_vnode superVnode;
4546	ino_t nodeID;
4547	status_t status = FS_CALL(dirNode.Get(), create_special_node, leaf, subVnode,
4548		mode, flags, _superVnode != NULL ? _superVnode : &superVnode, &nodeID);
4549	if (status != B_OK)
4550		return status;
4551
4552	// lookup the node
4553	rw_lock_read_lock(&sVnodeLock);
4554	*_createdVnode = lookup_vnode(dirNode->mount->id, nodeID);
4555	rw_lock_read_unlock(&sVnodeLock);
4556
4557	if (*_createdVnode == NULL) {
4558		panic("vfs_create_special_node(): lookup of node failed");
4559		return B_ERROR;
4560	}
4561
4562	return B_OK;
4563}
4564
4565
4566extern "C" void
4567vfs_put_vnode(struct vnode* vnode)
4568{
4569	put_vnode(vnode);
4570}
4571
4572
4573extern "C" status_t
4574vfs_get_cwd(dev_t* _mountID, ino_t* _vnodeID)
4575{
4576	// Get current working directory from io context
4577	struct io_context* context = get_current_io_context(false);
4578	status_t status = B_OK;
4579
4580	mutex_lock(&context->io_mutex);
4581
4582	if (context->cwd != NULL) {
4583		*_mountID = context->cwd->device;
4584		*_vnodeID = context->cwd->id;
4585	} else
4586		status = B_ERROR;
4587
4588	mutex_unlock(&context->io_mutex);
4589	return status;
4590}
4591
4592
4593status_t
4594vfs_unmount(dev_t mountID, uint32 flags)
4595{
4596	return fs_unmount(NULL, mountID, flags, true);
4597}
4598
4599
4600extern "C" status_t
4601vfs_disconnect_vnode(dev_t mountID, ino_t vnodeID)
4602{
4603	struct vnode* vnode;
4604
4605	status_t status = get_vnode(mountID, vnodeID, &vnode, true, true);
4606	if (status != B_OK)
4607		return status;
4608
4609	disconnect_mount_or_vnode_fds(vnode->mount, vnode);
4610	put_vnode(vnode);
4611	return B_OK;
4612}
4613
4614
4615extern "C" void
4616vfs_free_unused_vnodes(int32 level)
4617{
4618	vnode_low_resource_handler(NULL,
4619		B_KERNEL_RESOURCE_PAGES | B_KERNEL_RESOURCE_MEMORY
4620			| B_KERNEL_RESOURCE_ADDRESS_SPACE,
4621		level);
4622}
4623
4624
4625extern "C" bool
4626vfs_can_page(struct vnode* vnode, void* cookie)
4627{
4628	FUNCTION(("vfs_canpage: vnode %p\n", vnode));
4629
4630	if (HAS_FS_CALL(vnode, can_page))
4631		return FS_CALL(vnode, can_page, cookie);
4632	return false;
4633}
4634
4635
4636extern "C" status_t
4637vfs_read_pages(struct vnode* vnode, void* cookie, off_t pos,
4638	const generic_io_vec* vecs, size_t count, uint32 flags,
4639	generic_size_t* _numBytes)
4640{
4641	FUNCTION(("vfs_read_pages: vnode %p, vecs %p, pos %" B_PRIdOFF "\n", vnode,
4642		vecs, pos));
4643
4644#if VFS_PAGES_IO_TRACING
4645	generic_size_t bytesRequested = *_numBytes;
4646#endif
4647
4648	IORequest request;
4649	status_t status = request.Init(pos, vecs, count, *_numBytes, false, flags);
4650	if (status == B_OK) {
4651		status = vfs_vnode_io(vnode, cookie, &request);
4652		if (status == B_OK)
4653			status = request.Wait();
4654		*_numBytes = request.TransferredBytes();
4655	}
4656
4657	TPIO(ReadPages(vnode, cookie, pos, vecs, count, flags, bytesRequested,
4658		status, *_numBytes));
4659
4660	return status;
4661}
4662
4663
4664extern "C" status_t
4665vfs_write_pages(struct vnode* vnode, void* cookie, off_t pos,
4666	const generic_io_vec* vecs, size_t count, uint32 flags,
4667	generic_size_t* _numBytes)
4668{
4669	FUNCTION(("vfs_write_pages: vnode %p, vecs %p, pos %" B_PRIdOFF "\n", vnode,
4670		vecs, pos));
4671
4672#if VFS_PAGES_IO_TRACING
4673	generic_size_t bytesRequested = *_numBytes;
4674#endif
4675
4676	IORequest request;
4677	status_t status = request.Init(pos, vecs, count, *_numBytes, true, flags);
4678	if (status == B_OK) {
4679		status = vfs_vnode_io(vnode, cookie, &request);
4680		if (status == B_OK)
4681			status = request.Wait();
4682		*_numBytes = request.TransferredBytes();
4683	}
4684
4685	TPIO(WritePages(vnode, cookie, pos, vecs, count, flags, bytesRequested,
4686		status, *_numBytes));
4687
4688	return status;
4689}
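

// Example (illustrative sketch only, not compiled): reading a page into a
// kernel buffer via vfs_read_pages(); `buffer` is hypothetical.
#if 0
generic_io_vec vec;
vec.base = (generic_addr_t)buffer;
vec.length = B_PAGE_SIZE;

generic_size_t bytes = vec.length;
status_t status = vfs_read_pages(vnode, cookie, 0, &vec, 1, 0, &bytes);
	// on return, bytes holds the number of bytes actually transferred
#endif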
4690
4691
/*!	Gets the vnode's VMCache object. If it doesn't have one yet, it will be
	created if \a allocate is \c true.
	On success, it also grabs a reference to the cache
	it returns.
4696*/
4697extern "C" status_t
4698vfs_get_vnode_cache(struct vnode* vnode, VMCache** _cache, bool allocate)
4699{
4700	if (vnode->cache != NULL) {
4701		vnode->cache->AcquireRef();
4702		*_cache = vnode->cache;
4703		return B_OK;
4704	}
4705
4706	rw_lock_read_lock(&sVnodeLock);
4707	vnode->Lock();
4708
4709	status_t status = B_OK;
4710
4711	// The cache could have been created in the meantime
4712	if (vnode->cache == NULL) {
4713		if (allocate) {
4714			// TODO: actually the vnode needs to be busy already here, or
4715			//	else this won't work...
4716			bool wasBusy = vnode->IsBusy();
4717			vnode->SetBusy(true);
4718
4719			vnode->Unlock();
4720			rw_lock_read_unlock(&sVnodeLock);
4721
4722			status = vm_create_vnode_cache(vnode, &vnode->cache);
4723
4724			rw_lock_read_lock(&sVnodeLock);
4725			vnode->Lock();
4726			vnode->SetBusy(wasBusy);
4727		} else
4728			status = B_BAD_VALUE;
4729	}
4730
4731	vnode->Unlock();
4732	rw_lock_read_unlock(&sVnodeLock);
4733
4734	if (status == B_OK) {
4735		vnode->cache->AcquireRef();
4736		*_cache = vnode->cache;
4737	}
4738
4739	return status;
4740}
4741
4742
4743/*!	Sets the vnode's VMCache object, for subsystems that want to manage
4744	their own.
	On success, it also grabs a reference to the passed cache.
4747*/
4748extern "C" status_t
4749vfs_set_vnode_cache(struct vnode* vnode, VMCache* _cache)
4750{
4751	rw_lock_read_lock(&sVnodeLock);
4752	vnode->Lock();
4753
4754	status_t status = B_OK;
4755	if (vnode->cache != NULL) {
4756		status = B_NOT_ALLOWED;
4757	} else {
4758		vnode->cache = _cache;
4759		_cache->AcquireRef();
4760	}
4761
4762	vnode->Unlock();
4763	rw_lock_read_unlock(&sVnodeLock);
4764	return status;
4765}
4766
4767
4768status_t
4769vfs_get_file_map(struct vnode* vnode, off_t offset, size_t size,
4770	file_io_vec* vecs, size_t* _count)
4771{
4772	FUNCTION(("vfs_get_file_map: vnode %p, vecs %p, offset %" B_PRIdOFF
4773		", size = %" B_PRIuSIZE "\n", vnode, vecs, offset, size));
4774
4775	return FS_CALL(vnode, get_file_map, offset, size, vecs, _count);
4776}
4777
4778
4779status_t
4780vfs_stat_vnode(struct vnode* vnode, struct stat* stat)
4781{
4782	status_t status = FS_CALL(vnode, read_stat, stat);
4783
4784	// fill in the st_dev and st_ino fields
4785	if (status == B_OK) {
4786		stat->st_dev = vnode->device;
4787		stat->st_ino = vnode->id;
4788		// the rdev field must stay unset for non-special files
4789		if (!S_ISBLK(stat->st_mode) && !S_ISCHR(stat->st_mode))
4790			stat->st_rdev = -1;
4791	}
4792
4793	return status;
4794}
4795
4796
4797status_t
4798vfs_stat_node_ref(dev_t device, ino_t inode, struct stat* stat)
4799{
4800	struct vnode* vnode;
4801	status_t status = get_vnode(device, inode, &vnode, true, false);
4802	if (status != B_OK)
4803		return status;
4804
4805	status = vfs_stat_vnode(vnode, stat);
4806
4807	put_vnode(vnode);
4808	return status;
4809}
4810
4811
4812status_t
4813vfs_get_vnode_name(struct vnode* vnode, char* name, size_t nameSize)
4814{
4815	return get_vnode_name(vnode, NULL, name, nameSize, true);
4816}
4817
4818
4819status_t
4820vfs_entry_ref_to_path(dev_t device, ino_t inode, const char* leaf,
4821	bool kernel, char* path, size_t pathLength)
4822{
4823	VnodePutter vnode;
4824	status_t status;
4825
4826	// filter invalid leaf names
4827	if (leaf != NULL && (leaf[0] == '\0' || strchr(leaf, '/')))
4828		return B_BAD_VALUE;
4829
4830	// get the vnode matching the dir's node_ref
4831	if (leaf && (strcmp(leaf, ".") == 0 || strcmp(leaf, "..") == 0)) {
4832		// special cases "." and "..": we can directly get the vnode of the
4833		// referenced directory
4834		status = entry_ref_to_vnode(device, inode, leaf, false, kernel, vnode);
4835		leaf = NULL;
4836	} else {
4837		struct vnode* temp = NULL;
4838		status = get_vnode(device, inode, &temp, true, false);
4839		vnode.SetTo(temp);
4840	}
4841	if (status != B_OK)
4842		return status;
4843
4844	// get the directory path
4845	status = dir_vnode_to_path(vnode.Get(), path, pathLength, kernel);
4846	vnode.Unset();
4847		// we don't need the vnode anymore
4848	if (status != B_OK)
4849		return status;
4850
4851	// append the leaf name
4852	if (leaf) {
4853		// insert a directory separator if this is not the file system root
4854		if ((strcmp(path, "/") && strlcat(path, "/", pathLength)
4855				>= pathLength)
4856			|| strlcat(path, leaf, pathLength) >= pathLength) {
4857			return B_NAME_TOO_LONG;
4858		}
4859	}
4860
4861	return B_OK;
4862}
4863
4864
4865/*!	If the given descriptor locked its vnode, that lock will be released. */
4866void
4867vfs_unlock_vnode_if_locked(struct file_descriptor* descriptor)
4868{
4869	struct vnode* vnode = fd_vnode(descriptor);
4870
4871	if (vnode != NULL && vnode->mandatory_locked_by == descriptor)
4872		vnode->mandatory_locked_by = NULL;
4873}
4874
4875
4876/*!	Releases any POSIX locks on the file descriptor. */
4877status_t
4878vfs_release_posix_lock(io_context* context, struct file_descriptor* descriptor)
4879{
4880	struct vnode* vnode = descriptor->u.vnode;
4881	if (vnode == NULL)
4882		return B_OK;
4883
4884	if (HAS_FS_CALL(vnode, release_lock))
4885		return FS_CALL(vnode, release_lock, descriptor->cookie, NULL);
4886
4887	return release_advisory_lock(vnode, context, NULL, NULL);
4888}
4889
4890
4891/*!	Closes all file descriptors of the specified I/O context that
4892	have the O_CLOEXEC flag set.
4893*/
4894void
4895vfs_exec_io_context(io_context* context)
4896{
4897	uint32 i;
4898
4899	for (i = 0; i < context->table_size; i++) {
4900		mutex_lock(&context->io_mutex);
4901
4902		struct file_descriptor* descriptor = context->fds[i];
4903		bool remove = false;
4904
4905		if (descriptor != NULL && fd_close_on_exec(context, i)) {
4906			context->fds[i] = NULL;
4907			context->num_used_fds--;
4908
4909			remove = true;
4910		}
4911
4912		mutex_unlock(&context->io_mutex);
4913
4914		if (remove) {
4915			close_fd(context, descriptor);
4916			put_fd(descriptor);
4917		}
4918	}
4919}
4920
4921
/*! Sets up a new io_context structure, and inherits the properties
	of the parent io_context if one is given.
4924*/
4925io_context*
4926vfs_new_io_context(io_context* parentContext, bool purgeCloseOnExec)
4927{
4928	io_context* context = (io_context*)malloc(sizeof(io_context));
4929	if (context == NULL)
4930		return NULL;
4931
4932	TIOC(NewIOContext(context, parentContext));
4933
4934	memset(context, 0, sizeof(io_context));
4935	context->ref_count = 1;
4936
4937	MutexLocker parentLocker;
4938
4939	size_t tableSize;
4940	if (parentContext != NULL) {
4941		parentLocker.SetTo(parentContext->io_mutex, false);
4942		tableSize = parentContext->table_size;
4943	} else
4944		tableSize = DEFAULT_FD_TABLE_SIZE;
4945
4946	// allocate space for FDs and their close-on-exec flag
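	// The single allocation is laid out as follows:
	//   file_descriptor*	fds[tableSize]
	//   select_info*		select_infos[tableSize]
	//   uint8				close-on-exec bitmap, one bit per FD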
4947	context->fds = (file_descriptor**)malloc(
4948		sizeof(struct file_descriptor*) * tableSize
4949		+ sizeof(struct select_info**) * tableSize
4950		+ (tableSize + 7) / 8);
4951	if (context->fds == NULL) {
4952		free(context);
4953		return NULL;
4954	}
4955
4956	context->select_infos = (select_info**)(context->fds + tableSize);
4957	context->fds_close_on_exec = (uint8*)(context->select_infos + tableSize);
4958
4959	memset(context->fds, 0, sizeof(struct file_descriptor*) * tableSize
4960		+ sizeof(struct select_info**) * tableSize
4961		+ (tableSize + 7) / 8);
4962
4963	mutex_init(&context->io_mutex, "I/O context");
4964
4965	// Copy all parent file descriptors
4966
4967	if (parentContext != NULL) {
4968		size_t i;
4969
4970		mutex_lock(&sIOContextRootLock);
4971		context->root = parentContext->root;
4972		if (context->root)
4973			inc_vnode_ref_count(context->root);
4974		mutex_unlock(&sIOContextRootLock);
4975
4976		context->cwd = parentContext->cwd;
4977		if (context->cwd)
4978			inc_vnode_ref_count(context->cwd);
4979
4980		if (parentContext->inherit_fds) {
4981			for (i = 0; i < tableSize; i++) {
4982				struct file_descriptor* descriptor = parentContext->fds[i];
4983
4984				if (descriptor != NULL
4985					&& (descriptor->open_mode & O_DISCONNECTED) == 0) {
4986					bool closeOnExec = fd_close_on_exec(parentContext, i);
4987					if (closeOnExec && purgeCloseOnExec)
4988						continue;
4989
4990					TFD(InheritFD(context, i, descriptor, parentContext));
4991
4992					context->fds[i] = descriptor;
4993					context->num_used_fds++;
4994					atomic_add(&descriptor->ref_count, 1);
4995					atomic_add(&descriptor->open_count, 1);
4996
4997					if (closeOnExec)
4998						fd_set_close_on_exec(context, i, true);
4999				}
5000			}
5001		}
5002
5003		parentLocker.Unlock();
5004	} else {
5005		context->root = sRoot;
5006		context->cwd = sRoot;
5007
5008		if (context->root)
5009			inc_vnode_ref_count(context->root);
5010
5011		if (context->cwd)
5012			inc_vnode_ref_count(context->cwd);
5013	}
5014
5015	context->table_size = tableSize;
5016	context->inherit_fds = parentContext != NULL;
5017
5018	list_init(&context->node_monitors);
5019	context->max_monitors = DEFAULT_NODE_MONITORS;
5020
5021	return context;
5022}
5023
5024
5025void
5026vfs_get_io_context(io_context* context)
5027{
5028	atomic_add(&context->ref_count, 1);
5029}
5030
5031
5032void
5033vfs_put_io_context(io_context* context)
5034{
5035	if (atomic_add(&context->ref_count, -1) == 1)
5036		free_io_context(context);
5037}
5038
5039
5040status_t
5041vfs_resize_fd_table(struct io_context* context, uint32 newSize)
5042{
5043	if (newSize == 0 || newSize > MAX_FD_TABLE_SIZE)
5044		return B_BAD_VALUE;
5045
5046	TIOC(ResizeIOContext(context, newSize));
5047
5048	MutexLocker _(context->io_mutex);
5049
5050	uint32 oldSize = context->table_size;
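	// the close-on-exec table stores one bit per FD, rounded up to bytes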
5051	int oldCloseOnExitBitmapSize = (oldSize + 7) / 8;
5052	int newCloseOnExitBitmapSize = (newSize + 7) / 8;
5053
5054	// If the tables shrink, make sure none of the fds being dropped are in use.
5055	if (newSize < oldSize) {
5056		for (uint32 i = oldSize; i-- > newSize;) {
5057			if (context->fds[i])
5058				return B_BUSY;
5059		}
5060	}
5061
5062	// store pointers to the old tables
5063	file_descriptor** oldFDs = context->fds;
5064	select_info** oldSelectInfos = context->select_infos;
5065	uint8* oldCloseOnExecTable = context->fds_close_on_exec;
5066
5067	// allocate new tables
5068	file_descriptor** newFDs = (file_descriptor**)malloc(
5069		sizeof(struct file_descriptor*) * newSize
		+ sizeof(struct select_info**) * newSize
5071		+ newCloseOnExitBitmapSize);
5072	if (newFDs == NULL)
5073		return B_NO_MEMORY;
5074
5075	context->fds = newFDs;
5076	context->select_infos = (select_info**)(context->fds + newSize);
5077	context->fds_close_on_exec = (uint8*)(context->select_infos + newSize);
5078	context->table_size = newSize;
5079
5080	// copy entries from old tables
5081	uint32 toCopy = min_c(oldSize, newSize);
5082
5083	memcpy(context->fds, oldFDs, sizeof(void*) * toCopy);
5084	memcpy(context->select_infos, oldSelectInfos, sizeof(void*) * toCopy);
5085	memcpy(context->fds_close_on_exec, oldCloseOnExecTable,
5086		min_c(oldCloseOnExitBitmapSize, newCloseOnExitBitmapSize));
5087
5088	// clear additional entries, if the tables grow
5089	if (newSize > oldSize) {
5090		memset(context->fds + oldSize, 0, sizeof(void*) * (newSize - oldSize));
5091		memset(context->select_infos + oldSize, 0,
5092			sizeof(void*) * (newSize - oldSize));
5093		memset(context->fds_close_on_exec + oldCloseOnExitBitmapSize, 0,
5094			newCloseOnExitBitmapSize - oldCloseOnExitBitmapSize);
5095	}
5096
5097	free(oldFDs);
5098
5099	return B_OK;
5100}
5101
5102
5103/*!	\brief Resolves a vnode to the vnode it is covered by, if any.
5104
5105	Given an arbitrary vnode (identified by mount and node ID), the function
5106	checks, whether the vnode is covered by another vnode. If it is, the
5107	function returns the mount and node ID of the covering vnode. Otherwise
5108	it simply returns the supplied mount and node ID.
5109
5110	In case of error (e.g. the supplied node could not be found) the variables
5111	for storing the resolved mount and node ID remain untouched and an error
5112	code is returned.
5113
5114	\param mountID The mount ID of the vnode in question.
5115	\param nodeID The node ID of the vnode in question.
5116	\param resolvedMountID Pointer to storage for the resolved mount ID.
5117	\param resolvedNodeID Pointer to storage for the resolved node ID.
5118	\return
5119	- \c B_OK, if everything went fine,
5120	- another error code, if something went wrong.
5121*/
5122status_t
5123vfs_resolve_vnode_to_covering_vnode(dev_t mountID, ino_t nodeID,
5124	dev_t* resolvedMountID, ino_t* resolvedNodeID)
5125{
5126	// get the node
5127	struct vnode* node;
5128	status_t error = get_vnode(mountID, nodeID, &node, true, false);
5129	if (error != B_OK)
5130		return error;
5131
5132	// resolve the node
5133	if (Vnode* coveringNode = get_covering_vnode(node)) {
5134		put_vnode(node);
5135		node = coveringNode;
5136	}
5137
5138	// set the return values
5139	*resolvedMountID = node->device;
5140	*resolvedNodeID = node->id;
5141
5142	put_vnode(node);
5143
5144	return B_OK;
5145}
5146
5147
5148status_t
5149vfs_get_mount_point(dev_t mountID, dev_t* _mountPointMountID,
5150	ino_t* _mountPointNodeID)
5151{
5152	ReadLocker nodeLocker(sVnodeLock);
5153	ReadLocker mountLocker(sMountLock);
5154
5155	struct fs_mount* mount = find_mount(mountID);
5156	if (mount == NULL)
5157		return B_BAD_VALUE;
5158
5159	Vnode* mountPoint = mount->covers_vnode;
5160
5161	*_mountPointMountID = mountPoint->device;
5162	*_mountPointNodeID = mountPoint->id;
5163
5164	return B_OK;
5165}
5166
5167
5168status_t
5169vfs_bind_mount_directory(dev_t mountID, ino_t nodeID, dev_t coveredMountID,
5170	ino_t coveredNodeID)
5171{
5172	// get the vnodes
5173	Vnode* vnode;
5174	status_t error = get_vnode(mountID, nodeID, &vnode, true, false);
5175	if (error != B_OK)
5176		return B_BAD_VALUE;
5177	VnodePutter vnodePutter(vnode);
5178
5179	Vnode* coveredVnode;
5180	error = get_vnode(coveredMountID, coveredNodeID, &coveredVnode, true,
5181		false);
5182	if (error != B_OK)
5183		return B_BAD_VALUE;
5184	VnodePutter coveredVnodePutter(coveredVnode);
5185
5186	// establish the covered/covering links
5187	WriteLocker locker(sVnodeLock);
5188
5189	if (vnode->covers != NULL || coveredVnode->covered_by != NULL
5190		|| vnode->mount->unmounting || coveredVnode->mount->unmounting) {
5191		return B_BUSY;
5192	}
5193
5194	vnode->covers = coveredVnode;
5195	vnode->SetCovering(true);
5196
5197	coveredVnode->covered_by = vnode;
5198	coveredVnode->SetCovered(true);
5199
	// the vnodes now reference each other
5201	inc_vnode_ref_count(vnode);
5202	inc_vnode_ref_count(coveredVnode);
5203
5204	return B_OK;
5205}
5206
5207
5208int
5209vfs_getrlimit(int resource, struct rlimit* rlp)
5210{
5211	if (!rlp)
5212		return B_BAD_ADDRESS;
5213
5214	switch (resource) {
5215		case RLIMIT_NOFILE:
5216		{
5217			struct io_context* context = get_current_io_context(false);
5218			MutexLocker _(context->io_mutex);
5219
5220			rlp->rlim_cur = context->table_size;
5221			rlp->rlim_max = MAX_FD_TABLE_SIZE;
5222			return 0;
5223		}
5224
5225		case RLIMIT_NOVMON:
5226		{
5227			struct io_context* context = get_current_io_context(false);
5228			MutexLocker _(context->io_mutex);
5229
5230			rlp->rlim_cur = context->max_monitors;
5231			rlp->rlim_max = MAX_NODE_MONITORS;
5232			return 0;
5233		}
5234
5235		default:
5236			return B_BAD_VALUE;
5237	}
5238}
5239
5240
5241int
5242vfs_setrlimit(int resource, const struct rlimit* rlp)
5243{
5244	if (!rlp)
5245		return B_BAD_ADDRESS;
5246
5247	switch (resource) {
5248		case RLIMIT_NOFILE:
5249			/* TODO: check getuid() */
5250			if (rlp->rlim_max != RLIM_SAVED_MAX
5251				&& rlp->rlim_max != MAX_FD_TABLE_SIZE)
5252				return B_NOT_ALLOWED;
5253
5254			return vfs_resize_fd_table(get_current_io_context(false),
5255				rlp->rlim_cur);
5256
5257		case RLIMIT_NOVMON:
5258			/* TODO: check getuid() */
5259			if (rlp->rlim_max != RLIM_SAVED_MAX
5260				&& rlp->rlim_max != MAX_NODE_MONITORS)
5261				return B_NOT_ALLOWED;
5262
5263			return resize_monitor_table(get_current_io_context(false),
5264				rlp->rlim_cur);
5265
5266		default:
5267			return B_BAD_VALUE;
5268	}
5269}
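

// Example (illustrative sketch only, not compiled): raising the FD table
// size via the rlimit interface, which ends up in vfs_resize_fd_table().
#if 0
struct rlimit rl;
rl.rlim_cur = 1024;
rl.rlim_max = MAX_FD_TABLE_SIZE;
int result = vfs_setrlimit(RLIMIT_NOFILE, &rl);
#endif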
5270
5271
5272status_t
5273vfs_init(kernel_args* args)
5274{
5275	vnode::StaticInit();
5276
5277	sVnodeTable = new(std::nothrow) VnodeTable();
5278	if (sVnodeTable == NULL || sVnodeTable->Init(VNODE_HASH_TABLE_SIZE) != B_OK)
5279		panic("vfs_init: error creating vnode hash table\n");
5280
5281	struct vnode dummy_vnode;
5282	list_init_etc(&sUnusedVnodeList, offset_of_member(dummy_vnode, unused_link));
5283
5284	struct fs_mount dummyMount;
5285	sMountsTable = new(std::nothrow) MountTable();
5286	if (sMountsTable == NULL
5287			|| sMountsTable->Init(MOUNTS_HASH_TABLE_SIZE) != B_OK)
5288		panic("vfs_init: error creating mounts hash table\n");
5289
5290	sPathNameCache = create_object_cache("vfs path names",
5291		B_PATH_NAME_LENGTH + 1, 8, NULL, NULL, NULL);
5292	if (sPathNameCache == NULL)
5293		panic("vfs_init: error creating path name object_cache\n");
5294
5295	sVnodeCache = create_object_cache("vfs vnodes",
5296		sizeof(struct vnode), 8, NULL, NULL, NULL);
5297	if (sVnodeCache == NULL)
5298		panic("vfs_init: error creating vnode object_cache\n");
5299
5300	sFileDescriptorCache = create_object_cache("vfs fds",
5301		sizeof(file_descriptor), 8, NULL, NULL, NULL);
5302	if (sFileDescriptorCache == NULL)
5303		panic("vfs_init: error creating file descriptor object_cache\n");
5304
5305	node_monitor_init();
5306
5307	sRoot = NULL;
5308
5309	recursive_lock_init(&sMountOpLock, "vfs_mount_op_lock");
5310
5311	if (block_cache_init() != B_OK)
5312		return B_ERROR;
5313
5314#ifdef ADD_DEBUGGER_COMMANDS
5315	// add some debugger commands
5316	add_debugger_command_etc("vnode", &dump_vnode,
5317		"Print info about the specified vnode",
5318		"[ \"-p\" ] ( <vnode> | <devID> <nodeID> )\n"
5319		"Prints information about the vnode specified by address <vnode> or\n"
5320		"<devID>, <vnodeID> pair. If \"-p\" is given, a path of the vnode is\n"
5321		"constructed and printed. It might not be possible to construct a\n"
5322		"complete path, though.\n",
5323		0);
5324	add_debugger_command("vnodes", &dump_vnodes,
5325		"list all vnodes (from the specified device)");
5326	add_debugger_command("vnode_caches", &dump_vnode_caches,
5327		"list all vnode caches");
5328	add_debugger_command("mount", &dump_mount,
5329		"info about the specified fs_mount");
5330	add_debugger_command("mounts", &dump_mounts, "list all fs_mounts");
5331	add_debugger_command("io_context", &dump_io_context,
5332		"info about the I/O context");
5333	add_debugger_command("vnode_usage", &dump_vnode_usage,
5334		"info about vnode usage");
5335#endif
5336
5337	register_low_resource_handler(&vnode_low_resource_handler, NULL,
5338		B_KERNEL_RESOURCE_PAGES | B_KERNEL_RESOURCE_MEMORY
5339			| B_KERNEL_RESOURCE_ADDRESS_SPACE,
5340		0);
5341
5342	fifo_init();
5343	file_map_init();
5344
5345	return file_cache_init();
5346}
5347
5348
5349//	#pragma mark - fd_ops implementations
5350
5351
5352/*!
5353	Calls fs_open() on the given vnode and returns a new
5354	file descriptor for it
5355*/
5356static int
5357open_vnode(struct vnode* vnode, int openMode, bool kernel)
5358{
5359	void* cookie;
5360	status_t status = FS_CALL(vnode, open, openMode, &cookie);
5361	if (status != B_OK)
5362		return status;
5363
5364	int fd = get_new_fd(&sFileOps, NULL, vnode, cookie, openMode, kernel);
5365	if (fd < 0) {
5366		FS_CALL(vnode, close, cookie);
5367		FS_CALL(vnode, free_cookie, cookie);
5368	}
5369	return fd;
5370}
5371
5372
/*!
	Calls fs create() on the given directory and returns a new file
	descriptor for the resulting node. If an entry with the given name
	already exists and O_EXCL is not set, the existing node is opened
	instead.
*/
5377static int
5378create_vnode(struct vnode* directory, const char* name, int openMode,
5379	int perms, bool kernel)
5380{
5381	bool traverse = ((openMode & (O_NOTRAVERSE | O_NOFOLLOW)) == 0);
5382	status_t status = B_ERROR;
5383	VnodePutter vnode, dirPutter;
5384	void* cookie;
5385	ino_t newID;
5386	char clonedName[B_FILE_NAME_LENGTH + 1];
5387
5388	// This is somewhat tricky: If the entry already exists, the FS responsible
5389	// for the directory might not necessarily also be the one responsible for
5390	// the node the entry refers to (e.g. in case of mount points or FIFOs). So
5391	// we can actually never call the create() hook without O_EXCL. Instead we
5392	// try to look the entry up first. If it already exists, we just open the
5393	// node (unless O_EXCL), otherwise we call create() with O_EXCL. This
5394	// introduces a race condition, since someone else might have created the
	// entry in the meantime. If that happens, we hope the respective FS
	// returns the correct error code, and we retry (up to 3 times).
5397
5398	for (int i = 0; i < 3 && status != B_OK; i++) {
5399		bool create = false;
5400
5401		// look the node up
5402		{
5403			struct vnode* entry = NULL;
5404			status = lookup_dir_entry(directory, name, &entry);
5405			vnode.SetTo(entry);
5406		}
5407		if (status == B_OK) {
5408			if ((openMode & O_EXCL) != 0)
5409				return B_FILE_EXISTS;
5410
5411			// If the node is a symlink, we have to follow it, unless
5412			// O_NOTRAVERSE is set.
5413			if (S_ISLNK(vnode->Type()) && traverse) {
5414				vnode.Unset();
5415				if (strlcpy(clonedName, name, B_FILE_NAME_LENGTH)
5416						>= B_FILE_NAME_LENGTH) {
5417					return B_NAME_TOO_LONG;
5418				}
5419
5420				inc_vnode_ref_count(directory);
5421				dirPutter.Unset();
5422				status = vnode_path_to_vnode(directory, clonedName, true,
5423					kernel, vnode, NULL, clonedName);
5424				if (status != B_OK) {
					// The vnode was not found, but maybe it has a parent and
					// we can create it from there. In that case,
					// vnode_path_to_vnode() has set vnode to the last
					// directory found in the path.
5428					if (status == B_ENTRY_NOT_FOUND) {
5429						directory = vnode.Detach();
5430						dirPutter.SetTo(directory);
5431						name = clonedName;
5432						create = true;
5433					} else
5434						return status;
5435				}
5436			}
5437
5438			if (!create) {
5439				if ((openMode & O_NOFOLLOW) != 0 && S_ISLNK(vnode->Type()))
5440					return B_LINK_LIMIT;
5441
5442				int fd = open_vnode(vnode.Get(), openMode & ~O_CREAT, kernel);
5443				// on success keep the vnode reference for the FD
5444				if (fd >= 0)
5445					vnode.Detach();
5446
5447				return fd;
5448			}
5449		}
5450
5451		// it doesn't exist yet -- try to create it
5452
5453		if (!HAS_FS_CALL(directory, create))
5454			return B_READ_ONLY_DEVICE;
5455
5456		status = FS_CALL(directory, create, name, openMode | O_EXCL, perms,
5457			&cookie, &newID);
5458		if (status != B_OK
5459			&& ((openMode & O_EXCL) != 0 || status != B_FILE_EXISTS)) {
5460			return status;
5461		}
5462	}
5463
5464	if (status != B_OK)
5465		return status;
5466
5467	// the node has been created successfully
5468
5469	rw_lock_read_lock(&sVnodeLock);
5470	vnode.SetTo(lookup_vnode(directory->device, newID));
5471	rw_lock_read_unlock(&sVnodeLock);
5472
5473	if (!vnode.IsSet()) {
5474		panic("vfs: fs_create() returned success but there is no vnode, "
5475			"mount ID %" B_PRIdDEV "!\n", directory->device);
5476		return B_BAD_VALUE;
5477	}
5478
5479	int fd = get_new_fd(&sFileOps, NULL, vnode.Get(), cookie, openMode, kernel);
5480	if (fd >= 0) {
5481		vnode.Detach();
5482		return fd;
5483	}
5484
5485	status = fd;
5486
5487	// something went wrong, clean up
5488
5489	FS_CALL(vnode.Get(), close, cookie);
5490	FS_CALL(vnode.Get(), free_cookie, cookie);
5491
5492	FS_CALL(directory, unlink, name);
5493
5494	return status;
5495}
5496
5497
5498/*! Calls fs open_dir() on the given vnode and returns a new
5499	file descriptor for it
5500*/
5501static int
5502open_dir_vnode(struct vnode* vnode, bool kernel)
5503{
5504	if (!HAS_FS_CALL(vnode, open_dir))
5505		return B_UNSUPPORTED;
5506
5507	void* cookie;
5508	status_t status = FS_CALL(vnode, open_dir, &cookie);
5509	if (status != B_OK)
5510		return status;
5511
5512	// directory is opened, create a fd
5513	status = get_new_fd(&sDirectoryOps, NULL, vnode, cookie, O_CLOEXEC, kernel);
5514	if (status >= 0)
5515		return status;
5516
5517	FS_CALL(vnode, close_dir, cookie);
5518	FS_CALL(vnode, free_dir_cookie, cookie);
5519
5520	return status;
5521}
5522
5523
5524/*! Calls fs open_attr_dir() on the given vnode and returns a new
5525	file descriptor for it.
	Used by attr_dir_open() and attr_dir_open_fd().
5527*/
5528static int
5529open_attr_dir_vnode(struct vnode* vnode, bool kernel)
5530{
5531	if (!HAS_FS_CALL(vnode, open_attr_dir))
5532		return B_UNSUPPORTED;
5533
5534	void* cookie;
5535	status_t status = FS_CALL(vnode, open_attr_dir, &cookie);
5536	if (status != B_OK)
5537		return status;
5538
5539	// directory is opened, create a fd
5540	status = get_new_fd(&sAttributeDirectoryOps, NULL, vnode, cookie, O_CLOEXEC,
5541		kernel);
5542	if (status >= 0)
5543		return status;
5544
5545	FS_CALL(vnode, close_attr_dir, cookie);
5546	FS_CALL(vnode, free_attr_dir_cookie, cookie);
5547
5548	return status;
5549}
5550
5551
5552static int
5553file_create_entry_ref(dev_t mountID, ino_t directoryID, const char* name,
5554	int openMode, int perms, bool kernel)
5555{
5556	FUNCTION(("file_create_entry_ref: name = '%s', omode %x, perms %d, "
5557		"kernel %d\n", name, openMode, perms, kernel));
5558
5559	// get directory to put the new file in
5560	struct vnode* directory;
5561	status_t status = get_vnode(mountID, directoryID, &directory, true, false);
5562	if (status != B_OK)
5563		return status;
5564
5565	status = create_vnode(directory, name, openMode, perms, kernel);
5566	put_vnode(directory);
5567
5568	return status;
5569}
5570
5571
5572static int
5573file_create(int fd, char* path, int openMode, int perms, bool kernel)
5574{
5575	FUNCTION(("file_create: path '%s', omode %x, perms %d, kernel %d\n", path,
5576		openMode, perms, kernel));
5577
5578	// get directory to put the new file in
5579	char name[B_FILE_NAME_LENGTH];
5580	VnodePutter directory;
5581	status_t status = fd_and_path_to_dir_vnode(fd, path, directory, name,
5582		kernel);
5583	if (status < 0)
5584		return status;
5585
5586	return create_vnode(directory.Get(), name, openMode, perms, kernel);
5587}
5588
5589
5590static int
5591file_open_entry_ref(dev_t mountID, ino_t directoryID, const char* name,
5592	int openMode, bool kernel)
5593{
5594	if (name == NULL || *name == '\0')
5595		return B_BAD_VALUE;
5596
5597	FUNCTION(("file_open_entry_ref(ref = (%" B_PRId32 ", %" B_PRId64 ", %s), "
5598		"openMode = %d)\n", mountID, directoryID, name, openMode));
5599
5600	bool traverse = (openMode & (O_NOTRAVERSE | O_NOFOLLOW)) == 0;
5601
5602	// get the vnode matching the entry_ref
5603	VnodePutter vnode;
5604	status_t status = entry_ref_to_vnode(mountID, directoryID, name, traverse,
5605		kernel, vnode);
5606	if (status != B_OK)
5607		return status;
5608
5609	if ((openMode & O_NOFOLLOW) != 0 && S_ISLNK(vnode->Type()))
5610		return B_LINK_LIMIT;
5611
5612	int newFD = open_vnode(vnode.Get(), openMode, kernel);
5613	if (newFD >= 0) {
5614		cache_node_opened(vnode.Get(), vnode->cache, mountID,
5615			directoryID, vnode->id, name);
5616
5617		// The vnode reference has been transferred to the FD
5618		vnode.Detach();
5619	}
5620
5621	return newFD;
5622}
5623
5624
5625static int
5626file_open(int fd, char* path, int openMode, bool kernel)
5627{
5628	bool traverse = (openMode & (O_NOTRAVERSE | O_NOFOLLOW)) == 0;
5629
5630	FUNCTION(("file_open: fd: %d, entry path = '%s', omode %d, kernel %d\n",
5631		fd, path, openMode, kernel));
5632
5633	// get the vnode matching the vnode + path combination
5634	VnodePutter vnode;
5635	ino_t parentID;
5636	status_t status = fd_and_path_to_vnode(fd, path, traverse, vnode,
5637		&parentID, kernel);
5638	if (status != B_OK)
5639		return status;
5640
5641	if ((openMode & O_NOFOLLOW) != 0 && S_ISLNK(vnode->Type()))
5642		return B_LINK_LIMIT;
5643
5644	// open the vnode
5645	int newFD = open_vnode(vnode.Get(), openMode, kernel);
5646	if (newFD >= 0) {
5647		cache_node_opened(vnode.Get(), vnode->cache,
5648			vnode->device, parentID, vnode->id, NULL);
5649
5650		// The vnode reference has been transferred to the FD
5651		vnode.Detach();
5652	}
5653
5654	return newFD;
5655}
5656
5657
5658static status_t
5659file_close(struct file_descriptor* descriptor)
5660{
5661	struct vnode* vnode = descriptor->u.vnode;
5662	status_t status = B_OK;
5663
5664	FUNCTION(("file_close(descriptor = %p)\n", descriptor));
5665
5666	cache_node_closed(vnode, vnode->cache, vnode->device,
5667		vnode->id);
5668	if (HAS_FS_CALL(vnode, close)) {
5669		status = FS_CALL(vnode, close, descriptor->cookie);
5670	}
5671
5672	if (status == B_OK) {
5673		// remove all outstanding locks for this team
5674		if (HAS_FS_CALL(vnode, release_lock))
5675			status = FS_CALL(vnode, release_lock, descriptor->cookie, NULL);
5676		else
5677			status = release_advisory_lock(vnode, NULL, descriptor, NULL);
5678	}
5679	return status;
5680}
5681
5682
5683static void
5684file_free_fd(struct file_descriptor* descriptor)
5685{
5686	struct vnode* vnode = descriptor->u.vnode;
5687
5688	if (vnode != NULL) {
5689		FS_CALL(vnode, free_cookie, descriptor->cookie);
5690		put_vnode(vnode);
5691	}
5692}
5693
5694
5695static status_t
5696file_read(struct file_descriptor* descriptor, off_t pos, void* buffer,
5697	size_t* length)
5698{
5699	struct vnode* vnode = descriptor->u.vnode;
5700	FUNCTION(("file_read: buf %p, pos %" B_PRIdOFF ", len %p = %ld\n", buffer,
5701		pos, length, *length));
5702
5703	if (S_ISDIR(vnode->Type()))
5704		return B_IS_A_DIRECTORY;
5705	if (pos != -1 && descriptor->pos == -1)
5706		return ESPIPE;
5707
5708	return FS_CALL(vnode, read, descriptor->cookie, pos, buffer, length);
5709}
5710
5711
5712static status_t
5713file_write(struct file_descriptor* descriptor, off_t pos, const void* buffer,
5714	size_t* length)
5715{
5716	struct vnode* vnode = descriptor->u.vnode;
5717	FUNCTION(("file_write: buf %p, pos %" B_PRIdOFF ", len %p\n", buffer, pos,
5718		length));
5719
5720	if (S_ISDIR(vnode->Type()))
5721		return B_IS_A_DIRECTORY;
5722	if (pos != -1 && descriptor->pos == -1)
5723		return ESPIPE;
5724
5725	if (!HAS_FS_CALL(vnode, write))
5726		return B_READ_ONLY_DEVICE;
5727
5728	return FS_CALL(vnode, write, descriptor->cookie, pos, buffer, length);
5729}
5730
5731
5732static ssize_t
5733file_vector_io(struct file_descriptor* descriptor, off_t pos,
5734	const struct iovec *vecs, int count, bool write)
5735{
5736	struct vnode* vnode = descriptor->u.vnode;
5737	if (pos != -1 && descriptor->pos == -1)
5738		return ESPIPE;
5739	if (S_ISDIR(vnode->Type()))
5740		return B_IS_A_DIRECTORY;
5741
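	// Real scatter/gather I/O needs an explicit position and an io() hook;
	// returning B_UNSUPPORTED lets the caller fall back to plain per-vector
	// reads and writes (assuming the generic FD layer handles that case).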
5742	if (pos == -1)
5743		return B_UNSUPPORTED;
5744	if (!HAS_FS_CALL(vnode, io))
5745		return B_UNSUPPORTED;
5746
5747	// We can only perform real vectored I/O for vnodes that have no cache,
5748	// because the I/O hook bypasses the cache entirely.
5749	if (vnode->cache != NULL)
5750		return B_UNSUPPORTED;
5751
5752	BStackOrHeapArray<generic_io_vec, 8> iovecs(count);
5753	if (!iovecs.IsValid())
5754		return B_NO_MEMORY;
5755
5756	generic_size_t length = 0;
5757	for (int i = 0; i < count; i++) {
5758		iovecs[i].base = (generic_addr_t)vecs[i].iov_base;
5759		iovecs[i].length = vecs[i].iov_len;
5760		length += vecs[i].iov_len;
5761	}
5762
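	// hand the request to the page-level helpers, which drive it through the
	// FS's io() hook (checked above)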
5763	status_t status = (write ? vfs_write_pages : vfs_read_pages)(vnode,
5764		descriptor->cookie, pos, iovecs, count, 0, &length);
5765	if (length > 0)
5766		return length;
5767	return status;
5768}
5769
5770
5771static ssize_t
5772file_readv(struct file_descriptor* descriptor, off_t pos,
5773	const struct iovec *vecs, int count)
5774{
5775	FUNCTION(("file_readv: pos %" B_PRIdOFF "\n", pos));
5776	return file_vector_io(descriptor, pos, vecs, count, false);
5777}
5778
5779
5780static ssize_t
5781file_writev(struct file_descriptor* descriptor, off_t pos,
5782	const struct iovec *vecs, int count)
5783{
5784	FUNCTION(("file_writev: pos %" B_PRIdOFF "\n", pos));
5785	return file_vector_io(descriptor, pos, vecs, count, true);
5786}
5787
5788
5789static off_t
5790file_seek(struct file_descriptor* descriptor, off_t pos, int seekType)
5791{
5792	struct vnode* vnode = descriptor->u.vnode;
5793	off_t offset;
5794	bool isDevice = false;
5795
5796	FUNCTION(("file_seek(pos = %" B_PRIdOFF ", seekType = %d)\n", pos,
5797		seekType));
5798
5799	if (descriptor->pos == -1)
5800		return ESPIPE;
5801
5802	switch (vnode->Type() & S_IFMT) {
5803		// drivers publish block devices as character devices, so check for both
5804		case S_IFBLK:
5805		case S_IFCHR:
5806			isDevice = true;
5807			break;
5808	}
5809
5810	switch (seekType) {
5811		case SEEK_SET:
5812			offset = 0;
5813			break;
5814		case SEEK_CUR:
5815			offset = descriptor->pos;
5816			break;
5817		case SEEK_END:
5818		{
5819			// stat() the node
5820			if (!HAS_FS_CALL(vnode, read_stat))
5821				return B_UNSUPPORTED;
5822
5823			struct stat stat;
5824			status_t status = FS_CALL(vnode, read_stat, &stat);
5825			if (status != B_OK)
5826				return status;
5827
5828			offset = stat.st_size;
5829
5830			if (offset == 0 && isDevice) {
5831				// stat() on plain device drivers doesn't report a size
5832				device_geometry geometry;
5833
5834				if (HAS_FS_CALL(vnode, ioctl)) {
5835					status = FS_CALL(vnode, ioctl, descriptor->cookie,
5836						B_GET_GEOMETRY, &geometry, sizeof(geometry));
5837					if (status == B_OK)
5838						offset = (off_t)geometry.bytes_per_sector
5839							* geometry.sectors_per_track
5840							* geometry.cylinder_count
5841							* geometry.head_count;
5842				}
5843			}
5844
5845			break;
5846		}
5847		case SEEK_DATA:
5848		case SEEK_HOLE:
5849		{
5850			status_t status = B_BAD_VALUE;
5851			if (HAS_FS_CALL(vnode, ioctl)) {
5852				offset = pos;
5853				status = FS_CALL(vnode, ioctl, descriptor->cookie,
5854					seekType == SEEK_DATA ? FIOSEEKDATA : FIOSEEKHOLE,
5855					&offset, sizeof(offset));
5856				if (status == B_OK) {
5857					if (offset > pos)
5858						offset -= pos;
5859					break;
5860				}
5861			}
5862			if (status != B_BAD_VALUE && status != B_DEV_INVALID_IOCTL)
5863				return status;
5864
5865			// basic fallback using stat(): whole file is data, hole at the end
5866			if (!HAS_FS_CALL(vnode, read_stat) || isDevice)
5867				return B_BAD_VALUE;
5868
5869			struct stat stat;
5870			status = FS_CALL(vnode, read_stat, &stat);
5871			if (status != B_OK)
5872				return status;
5873
5874			off_t end = stat.st_size;
5875			if (pos >= end)
5876				return ENXIO;
5877			offset = seekType == SEEK_HOLE ? end - pos : 0;
5878			break;
5879		}
5880		default:
5881			return B_BAD_VALUE;
5882	}
5883
5884	// assumes off_t is 64 bits wide
5885	if (offset > 0 && LONGLONG_MAX - offset < pos)
5886		return B_BUFFER_OVERFLOW;
5887
5888	pos += offset;
5889	if (pos < 0)
5890		return B_BAD_VALUE;
5891
5892	return descriptor->pos = pos;
5893}
5894
5895
5896static status_t
5897file_select(struct file_descriptor* descriptor, uint8 event,
5898	struct selectsync* sync)
5899{
5900	FUNCTION(("file_select(%p, %u, %p)\n", descriptor, event, sync));
5901
5902	struct vnode* vnode = descriptor->u.vnode;
5903
5904	// If the FS has no select() hook, notify select() now.
5905	if (!HAS_FS_CALL(vnode, select)) {
5906		if (!SELECT_TYPE_IS_OUTPUT_ONLY(event))
5907			notify_select_event(sync, event);
5908		return B_UNSUPPORTED;
5909	}
5910
5911	return FS_CALL(vnode, select, descriptor->cookie, event, sync);
5912}
5913
5914
5915static status_t
5916file_deselect(struct file_descriptor* descriptor, uint8 event,
5917	struct selectsync* sync)
5918{
5919	struct vnode* vnode = descriptor->u.vnode;
5920
5921	if (!HAS_FS_CALL(vnode, deselect))
5922		return B_OK;
5923
5924	return FS_CALL(vnode, deselect, descriptor->cookie, event, sync);
5925}
5926
5927
5928static status_t
5929dir_create_entry_ref(dev_t mountID, ino_t parentID, const char* name, int perms,
5930	bool kernel)
5931{
5932	struct vnode* vnode;
5933	status_t status;
5934
5935	if (name == NULL || *name == '\0')
5936		return B_BAD_VALUE;
5937
5938	FUNCTION(("dir_create_entry_ref(dev = %" B_PRId32 ", ino = %" B_PRId64 ", "
5939		"name = '%s', perms = %d)\n", mountID, parentID, name, perms));
5940
5941	status = get_vnode(mountID, parentID, &vnode, true, false);
5942	if (status != B_OK)
5943		return status;
5944
5945	if (HAS_FS_CALL(vnode, create_dir))
5946		status = FS_CALL(vnode, create_dir, name, perms);
5947	else
5948		status = B_READ_ONLY_DEVICE;
5949
5950	put_vnode(vnode);
5951	return status;
5952}
5953
5954
5955static status_t
5956dir_create(int fd, char* path, int perms, bool kernel)
5957{
5958	char filename[B_FILE_NAME_LENGTH];
5959	status_t status;
5960
5961	FUNCTION(("dir_create: path '%s', perms %d, kernel %d\n", path, perms,
5962		kernel));
5963
5964	VnodePutter vnode;
5965	status = fd_and_path_to_dir_vnode(fd, path, vnode, filename, kernel);
5966	if (status < 0)
5967		return status;
5968
5969	if (HAS_FS_CALL(vnode, create_dir)) {
5970		status = FS_CALL(vnode.Get(), create_dir, filename, perms);
5971	} else
5972		status = B_READ_ONLY_DEVICE;
5973
5974	return status;
5975}
5976
5977
5978static int
5979dir_open_entry_ref(dev_t mountID, ino_t parentID, const char* name, bool kernel)
5980{
5981	FUNCTION(("dir_open_entry_ref()\n"));
5982
5983	if (name && name[0] == '\0')
5984		return B_BAD_VALUE;
5985
5986	// get the vnode matching the entry_ref/node_ref
5987	VnodePutter vnode;
5988	status_t status;
5989	if (name) {
5990		status = entry_ref_to_vnode(mountID, parentID, name, true, kernel,
5991			vnode);
5992	} else {
5993		struct vnode* temp = NULL;
5994		status = get_vnode(mountID, parentID, &temp, true, false);
5995		vnode.SetTo(temp);
5996	}
5997	if (status != B_OK)
5998		return status;
5999
6000	int newFD = open_dir_vnode(vnode.Get(), kernel);
6001	if (newFD >= 0) {
6002		cache_node_opened(vnode.Get(), vnode->cache, mountID, parentID,
6003			vnode->id, name);
6004
6005		// The vnode reference has been transferred to the FD
6006		vnode.Detach();
6007	}
6008
6009	return newFD;
6010}
6011
6012
6013static int
6014dir_open(int fd, char* path, bool kernel)
6015{
6016	FUNCTION(("dir_open: fd: %d, entry path = '%s', kernel %d\n", fd, path,
6017		kernel));
6018
6019	// get the vnode matching the FD + path combination
6020	VnodePutter vnode;
6021	ino_t parentID;
6022	status_t status = fd_and_path_to_vnode(fd, path, true, vnode, &parentID,
6023		kernel);
6024	if (status != B_OK)
6025		return status;
6026
6027	// open the dir
6028	int newFD = open_dir_vnode(vnode.Get(), kernel);
6029	if (newFD >= 0) {
6030		cache_node_opened(vnode.Get(), vnode->cache, vnode->device,
6031			parentID, vnode->id, NULL);
6032
6033		// The vnode reference has been transferred to the FD
6034		vnode.Detach();
6035	}
6036
6037	return newFD;
6038}
6039
6040
6041static status_t
6042dir_close(struct file_descriptor* descriptor)
6043{
6044	struct vnode* vnode = descriptor->u.vnode;
6045
6046	FUNCTION(("dir_close(descriptor = %p)\n", descriptor));
6047
6048	cache_node_closed(vnode, vnode->cache, vnode->device,
6049		vnode->id);
6050	if (HAS_FS_CALL(vnode, close_dir))
6051		return FS_CALL(vnode, close_dir, descriptor->cookie);
6052
6053	return B_OK;
6054}
6055
6056
6057static void
6058dir_free_fd(struct file_descriptor* descriptor)
6059{
6060	struct vnode* vnode = descriptor->u.vnode;
6061
6062	if (vnode != NULL) {
6063		FS_CALL(vnode, free_dir_cookie, descriptor->cookie);
6064		put_vnode(vnode);
6065	}
6066}
6067
6068
6069static status_t
6070dir_read(struct io_context* ioContext, struct file_descriptor* descriptor,
6071	struct dirent* buffer, size_t bufferSize, uint32* _count)
6072{
6073	return dir_read(ioContext, descriptor->u.vnode, descriptor->cookie, buffer,
6074		bufferSize, _count);
6075}
6076
6077
6078static status_t
6079fix_dirent(struct vnode* parent, struct dirent* entry,
6080	struct io_context* ioContext)
6081{
6082	// set d_pdev and d_pino
6083	entry->d_pdev = parent->device;
6084	entry->d_pino = parent->id;
6085
6086	// If this is the ".." entry and the directory is covering another vnode,
6087	// we need to replace d_dev and d_ino with the actual values.
6088	if (strcmp(entry->d_name, "..") == 0 && parent->IsCovering()) {
6089		return resolve_covered_parent(parent, &entry->d_dev, &entry->d_ino,
6090			ioContext);
6091	}
6092
6093	// resolve covered vnodes: report the topmost covering vnode instead
6094	ReadLocker _(&sVnodeLock);
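	// The read lock keeps lookup results and the covered_by links stable
	// while we walk up to the topmost covering vnode.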
6095
6096	struct vnode* vnode = lookup_vnode(entry->d_dev, entry->d_ino);
6097	if (vnode != NULL && vnode->covered_by != NULL) {
6098		do {
6099			vnode = vnode->covered_by;
6100		} while (vnode->covered_by != NULL);
6101
6102		entry->d_dev = vnode->device;
6103		entry->d_ino = vnode->id;
6104	}
6105
6106	return B_OK;
6107}
6108
6109
6110static status_t
6111dir_read(struct io_context* ioContext, struct vnode* vnode, void* cookie,
6112	struct dirent* buffer, size_t bufferSize, uint32* _count)
6113{
6114	if (!HAS_FS_CALL(vnode, read_dir))
6115		return B_UNSUPPORTED;
6116
6117	status_t error = FS_CALL(vnode, read_dir, cookie, buffer, bufferSize,
6118		_count);
6119	if (error != B_OK)
6120		return error;
6121
6122	// we need to adjust the read dirents
6123	uint32 count = *_count;
6124	for (uint32 i = 0; i < count; i++) {
6125		error = fix_dirent(vnode, buffer, ioContext);
6126		if (error != B_OK)
6127			return error;
6128
6129		buffer = (struct dirent*)((uint8*)buffer + buffer->d_reclen);
6130	}
6131
6132	return error;
6133}
6134
6135
6136static status_t
6137dir_rewind(struct file_descriptor* descriptor)
6138{
6139	struct vnode* vnode = descriptor->u.vnode;
6140
6141	if (HAS_FS_CALL(vnode, rewind_dir)) {
6142		return FS_CALL(vnode, rewind_dir, descriptor->cookie);
6143	}
6144
6145	return B_UNSUPPORTED;
6146}
6147
6148
6149static status_t
6150dir_remove(int fd, char* path, bool kernel)
6151{
6152	char name[B_FILE_NAME_LENGTH];
6153	status_t status;
6154
6155	if (path != NULL) {
6156		// we need to make sure our path name doesn't end in "/", ".",
6157		// or ".."
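		// Examples: "foo/" and "foo/." both reduce to "foo", while anything
		// ending in ".." is rejected outright.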
6158		char* lastSlash;
6159		while ((lastSlash = strrchr(path, '/')) != NULL) {
6160			char* leaf = lastSlash + 1;
6161			if (!strcmp(leaf, ".."))
6162				return B_NOT_ALLOWED;
6163
6164			// omit multiple slashes
6165			while (lastSlash > path && lastSlash[-1] == '/')
6166				lastSlash--;
6167
6168			if (leaf[0] != '\0'
6169				&& strcmp(leaf, ".") != 0) {
6170				break;
6171			}
6172			// "name/" -> "name", or "name/." -> "name"
6173			lastSlash[0] = '\0';
6174		}
6175
6176		if (!strcmp(path, ".") || !strcmp(path, ".."))
6177			return B_NOT_ALLOWED;
6178	}
6179
6180	VnodePutter directory;
6181	status = fd_and_path_to_dir_vnode(fd, path, directory, name, kernel);
6182	if (status != B_OK)
6183		return status;
6184
6185	if (HAS_FS_CALL(directory, remove_dir))
6186		status = FS_CALL(directory.Get(), remove_dir, name);
6187	else
6188		status = B_READ_ONLY_DEVICE;
6189
6190	return status;
6191}
6192
6193
6194static status_t
6195common_ioctl(struct file_descriptor* descriptor, ulong op, void* buffer,
6196	size_t length)
6197{
6198	struct vnode* vnode = descriptor->u.vnode;
6199
6200	if (HAS_FS_CALL(vnode, ioctl))
6201		return FS_CALL(vnode, ioctl, descriptor->cookie, op, buffer, length);
6202
6203	return B_DEV_INVALID_IOCTL;
6204}
6205
6206
6207static status_t
6208common_fcntl(int fd, int op, size_t argument, bool kernel)
6209{
6210	struct flock flock;
6211
6212	FUNCTION(("common_fcntl(fd = %d, op = %d, argument = %lx, %s)\n",
6213		fd, op, argument, kernel ? "kernel" : "user"));
6214
6215	struct io_context* context = get_current_io_context(kernel);
6216
6217	FileDescriptorPutter descriptor(get_fd(context, fd));
6218	if (!descriptor.IsSet())
6219		return B_FILE_ERROR;
6220
6221	struct vnode* vnode = fd_vnode(descriptor.Get());
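	// fd_vnode() yields NULL for descriptors that aren't backed by a vnode;
	// the locking and F_SETFL cases below check for that.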
6222
6223	status_t status = B_OK;
6224
6225	if (op == F_SETLK || op == F_SETLKW || op == F_GETLK) {
6226		if (descriptor->ops != &sFileOps)
6227			status = B_BAD_VALUE;
6228		else if (kernel)
6229			memcpy(&flock, (struct flock*)argument, sizeof(struct flock));
6230		else if (user_memcpy(&flock, (struct flock*)argument,
6231				sizeof(struct flock)) != B_OK)
6232			status = B_BAD_ADDRESS;
6233		if (status != B_OK)
6234			return status;
6235	}
6236
6237	switch (op) {
6238		case F_SETFD:
6239		{
6240			// Set file descriptor flags
6241
6242			// O_CLOEXEC is the only flag available at this time
6243			mutex_lock(&context->io_mutex);
6244			fd_set_close_on_exec(context, fd, (argument & FD_CLOEXEC) != 0);
6245			mutex_unlock(&context->io_mutex);
6246
6247			status = B_OK;
6248			break;
6249		}
6250
6251		case F_GETFD:
6252		{
6253			// Get file descriptor flags
6254			mutex_lock(&context->io_mutex);
6255			status = fd_close_on_exec(context, fd) ? FD_CLOEXEC : 0;
6256			mutex_unlock(&context->io_mutex);
6257			break;
6258		}
6259
6260		case F_SETFL:
6261		{
6262			// Set file descriptor open mode
6263
6264			// we only accept changes to certain flags
6265			const int32 modifiableFlags = O_APPEND | O_NONBLOCK;
6266			argument &= modifiableFlags;
6267
6268			if (descriptor->ops->fd_set_flags != NULL) {
6269				status = descriptor->ops->fd_set_flags(descriptor.Get(), argument);
6270			} else if (vnode != NULL && HAS_FS_CALL(vnode, set_flags)) {
6271				status = FS_CALL(vnode, set_flags, descriptor->cookie,
6272					(int)argument);
6273			} else
6274				status = B_UNSUPPORTED;
6275
6276			if (status == B_OK) {
6277				// update this descriptor's open_mode field
6278				descriptor->open_mode = (descriptor->open_mode
6279					& ~modifiableFlags) | argument;
6280			}
6281
6282			break;
6283		}
6284
6285		case F_GETFL:
6286			// Get file descriptor open mode
6287			status = descriptor->open_mode;
6288			break;
6289
6290		case F_DUPFD:
6291		case F_DUPFD_CLOEXEC:
6292		{
6293			status = new_fd_etc(context, descriptor.Get(), (int)argument);
6294			if (status >= 0) {
6295				mutex_lock(&context->io_mutex);
6296				fd_set_close_on_exec(context, status, op == F_DUPFD_CLOEXEC);
6297				mutex_unlock(&context->io_mutex);
6298
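				// the duplicated FD shares this descriptor, so it needs a
				// reference of its own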
6299				atomic_add(&descriptor->ref_count, 1);
6300			}
6301			break;
6302		}
6303
6304		case F_GETLK:
6305			if (vnode != NULL) {
6306				struct flock normalizedLock;
6307
6308				memcpy(&normalizedLock, &flock, sizeof(struct flock));
6309				status = normalize_flock(descriptor.Get(), &normalizedLock);
6310				if (status != B_OK)
6311					break;
6312
6313				if (HAS_FS_CALL(vnode, test_lock)) {
6314					status = FS_CALL(vnode, test_lock, descriptor->cookie,
6315						&normalizedLock);
6316				} else
6317					status = test_advisory_lock(vnode, &normalizedLock);
6318				if (status == B_OK) {
6319					if (normalizedLock.l_type == F_UNLCK) {
6320						// no conflicting lock found, copy back the same struct
6321						// we were given except change type to F_UNLCK
6322						flock.l_type = F_UNLCK;
6323						if (kernel) {
6324							memcpy((struct flock*)argument, &flock,
6325								sizeof(struct flock));
6326						} else {
6327							status = user_memcpy((struct flock*)argument,
6328								&flock, sizeof(struct flock));
6329						}
6330					} else {
6331						// a conflicting lock was found, copy back its range and
6332						// type
6333						if (normalizedLock.l_len == OFF_MAX)
6334							normalizedLock.l_len = 0;
6335
6336						if (kernel) {
6337							memcpy((struct flock*)argument,
6338								&normalizedLock, sizeof(struct flock));
6339						} else {
6340							status = user_memcpy((struct flock*)argument,
6341								&normalizedLock, sizeof(struct flock));
6342						}
6343					}
6344				}
6345			} else
6346				status = B_BAD_VALUE;
6347			break;
6348
6349		case F_SETLK:
6350		case F_SETLKW:
6351			status = normalize_flock(descriptor.Get(), &flock);
6352			if (status != B_OK)
6353				break;
6354
6355			if (vnode == NULL) {
6356				status = B_BAD_VALUE;
6357			} else if (flock.l_type == F_UNLCK) {
6358				if (HAS_FS_CALL(vnode, release_lock)) {
6359					status = FS_CALL(vnode, release_lock, descriptor->cookie,
6360						&flock);
6361				} else {
6362					status = release_advisory_lock(vnode, context, NULL,
6363						&flock);
6364				}
6365			} else {
6366				// the open mode must match the lock type
6367				if (((descriptor->open_mode & O_RWMASK) == O_RDONLY
6368						&& flock.l_type == F_WRLCK)
6369					|| ((descriptor->open_mode & O_RWMASK) == O_WRONLY
6370						&& flock.l_type == F_RDLCK))
6371					status = B_FILE_ERROR;
6372				else {
6373					if (HAS_FS_CALL(vnode, acquire_lock)) {
6374						status = FS_CALL(vnode, acquire_lock,
6375							descriptor->cookie, &flock, op == F_SETLKW);
6376					} else {
6377						status = acquire_advisory_lock(vnode, context, NULL,
6378							&flock, op == F_SETLKW);
6379					}
6380				}
6381			}
6382			break;
6383
6384		// TODO: add support for more ops?
6385
6386		default:
6387			status = B_BAD_VALUE;
6388	}
6389
6390	return status;
6391}
6392
6393
6394static status_t
6395common_sync(int fd, bool kernel)
6396{
6397	FUNCTION(("common_sync: entry. fd %d kernel %d\n", fd, kernel));
6398
6399	struct vnode* vnode;
6400	FileDescriptorPutter descriptor(get_fd_and_vnode(fd, &vnode, kernel));
6401	if (!descriptor.IsSet())
6402		return B_FILE_ERROR;
6403
6404	status_t status;
6405	if (HAS_FS_CALL(vnode, fsync))
6406		status = FS_CALL_NO_PARAMS(vnode, fsync);
6407	else
6408		status = B_UNSUPPORTED;
6409
6410	return status;
6411}
6412
6413
6414static status_t
6415common_lock_node(int fd, bool kernel)
6416{
6417	struct vnode* vnode;
6418	FileDescriptorPutter descriptor(get_fd_and_vnode(fd, &vnode, kernel));
6419	if (!descriptor.IsSet())
6420		return B_FILE_ERROR;
6421
6422	status_t status = B_OK;
6423
6424	// We need to set the lock atomically: the test-and-set only succeeds
6425	// if no one else has locked the node in the meantime
6426	if (atomic_pointer_test_and_set(&vnode->mandatory_locked_by,
6427			descriptor.Get(), (file_descriptor*)NULL) != NULL)
6428		status = B_BUSY;
6429
6430	return status;
6431}
6432
6433
6434static status_t
6435common_unlock_node(int fd, bool kernel)
6436{
6437	struct vnode* vnode;
6438	FileDescriptorPutter descriptor(get_fd_and_vnode(fd, &vnode, kernel));
6439	if (!descriptor.IsSet())
6440		return B_FILE_ERROR;
6441
6442	status_t status = B_OK;
6443
6444	// We need to clear the lock atomically: it is only released if this
6445	// descriptor is the one currently holding it
6446	if (atomic_pointer_test_and_set(&vnode->mandatory_locked_by,
6447			(file_descriptor*)NULL, descriptor.Get()) != descriptor.Get())
6448		status = B_BAD_VALUE;
6449
6450	return status;
6451}
6452
6453
6454static status_t
6455common_preallocate(int fd, off_t offset, off_t length, bool kernel)
6456{
6457	if (offset < 0 || length == 0)
6458		return B_BAD_VALUE;
6459	if (offset > OFF_MAX - length)
6460		return B_FILE_TOO_LARGE;
6461
6462	struct vnode* vnode;
6463	FileDescriptorPutter descriptor(get_fd_and_vnode(fd, &vnode, kernel));
6464	if (!descriptor.IsSet() || (descriptor->open_mode & O_RWMASK) == O_RDONLY)
6465		return B_FILE_ERROR;
6466
6467	switch (vnode->Type() & S_IFMT) {
6468		case S_IFIFO:
6469		case S_IFSOCK:
6470			return ESPIPE;
6471
6472		case S_IFBLK:
6473		case S_IFCHR:
6474		case S_IFDIR:
6475		case S_IFLNK:
6476			return B_DEVICE_NOT_FOUND;
6477
6478		case S_IFREG:
6479			break;
6480	}
6481
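	// A writable file system lacking a preallocate() hook yields
	// B_UNSUPPORTED; a read-only one yields B_READ_ONLY_DEVICE.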
6482	status_t status = B_OK;
6483	if (HAS_FS_CALL(vnode, preallocate)) {
6484		status = FS_CALL(vnode, preallocate, offset, length);
6485	} else {
6486		status = HAS_FS_CALL(vnode, write)
6487			? B_UNSUPPORTED : B_READ_ONLY_DEVICE;
6488	}
6489
6490	return status;
6491}
6492
6493
6494static status_t
6495common_read_link(int fd, char* path, char* buffer, size_t* _bufferSize,
6496	bool kernel)
6497{
6498	VnodePutter vnode;
6499	status_t status;
6500
6501	status = fd_and_path_to_vnode(fd, path, false, vnode, NULL, kernel);
6502	if (status != B_OK)
6503		return status;
6504
6505	if (HAS_FS_CALL(vnode, read_symlink)) {
6506		status = FS_CALL(vnode.Get(), read_symlink, buffer, _bufferSize);
6507	} else
6508		status = B_BAD_VALUE;
6509
6510	return status;
6511}
6512
6513
6514static status_t
6515common_create_symlink(int fd, char* path, const char* toPath, int mode,
6516	bool kernel)
6517{
6518	// path validity checks have to be in the calling function!
6519	char name[B_FILE_NAME_LENGTH];
6520	status_t status;
6521
6522	FUNCTION(("common_create_symlink(fd = %d, path = %s, toPath = %s, "
6523		"mode = %d, kernel = %d)\n", fd, path, toPath, mode, kernel));
6524
6525	VnodePutter vnode;
6526	status = fd_and_path_to_dir_vnode(fd, path, vnode, name, kernel);
6527	if (status != B_OK)
6528		return status;
6529
6530	if (HAS_FS_CALL(vnode, create_symlink))
6531		status = FS_CALL(vnode.Get(), create_symlink, name, toPath, mode);
6532	else {
6533		status = HAS_FS_CALL(vnode, write)
6534			? B_UNSUPPORTED : B_READ_ONLY_DEVICE;
6535	}
6536
6537	return status;
6538}
6539
6540
6541static status_t
6542common_create_link(int pathFD, char* path, int toFD, char* toPath,
6543	bool traverseLeafLink, bool kernel)
6544{
6545	// path validity checks have to be in the calling function!
6546
6547	FUNCTION(("common_create_link(path = %s, toPath = %s, kernel = %d)\n", path,
6548		toPath, kernel));
6549
6550	char name[B_FILE_NAME_LENGTH];
6551	VnodePutter directory;
6552	status_t status = fd_and_path_to_dir_vnode(pathFD, path, directory, name,
6553		kernel);
6554	if (status != B_OK)
6555		return status;
6556
6557	VnodePutter vnode;
6558	status = fd_and_path_to_vnode(toFD, toPath, traverseLeafLink, vnode, NULL,
6559		kernel);
6560	if (status != B_OK)
6561		return status;
6562
6563	if (directory->mount != vnode->mount)
6564		return B_CROSS_DEVICE_LINK;
6565
6566	if (HAS_FS_CALL(directory, link))
6567		status = FS_CALL(directory.Get(), link, name, vnode.Get());
6568	else
6569		status = B_READ_ONLY_DEVICE;
6570
6571	return status;
6572}
6573
6574
6575static status_t
6576common_unlink(int fd, char* path, bool kernel)
6577{
6578	char filename[B_FILE_NAME_LENGTH];
6579	status_t status;
6580
6581	FUNCTION(("common_unlink: fd: %d, path '%s', kernel %d\n", fd, path,
6582		kernel));
6583
6584	VnodePutter vnode;
6585	status = fd_and_path_to_dir_vnode(fd, path, vnode, filename, kernel);
6586	if (status < 0)
6587		return status;
6588
6589	if (HAS_FS_CALL(vnode, unlink))
6590		status = FS_CALL(vnode.Get(), unlink, filename);
6591	else
6592		status = B_READ_ONLY_DEVICE;
6593
6594	return status;
6595}
6596
6597
6598static status_t
6599common_access(int fd, char* path, int mode, bool effectiveUserGroup, bool kernel)
6600{
6601	status_t status;
6602
6603	// TODO: honor effectiveUserGroup argument
6604
6605	VnodePutter vnode;
6606	status = fd_and_path_to_vnode(fd, path, true, vnode, NULL, kernel);
6607	if (status != B_OK)
6608		return status;
6609
6610	if (HAS_FS_CALL(vnode, access))
6611		status = FS_CALL(vnode.Get(), access, mode);
6612	else
6613		status = B_OK;
6614
6615	return status;
6616}
6617
6618
6619static status_t
6620common_rename(int fd, char* path, int newFD, char* newPath, bool kernel)
6621{
6622	status_t status;
6623
6624	FUNCTION(("common_rename(fd = %d, path = %s, newFD = %d, newPath = %s, "
6625		"kernel = %d)\n", fd, path, newFD, newPath, kernel));
6626
6627	VnodePutter fromVnode;
6628	char fromName[B_FILE_NAME_LENGTH];
6629	status = fd_and_path_to_dir_vnode(fd, path, fromVnode, fromName, kernel);
6630	if (status != B_OK)
6631		return status;
6632
6633	VnodePutter toVnode;
6634	char toName[B_FILE_NAME_LENGTH];
6635	status = fd_and_path_to_dir_vnode(newFD, newPath, toVnode, toName, kernel);
6636	if (status != B_OK)
6637		return status;
6638
6639	if (fromVnode->device != toVnode->device)
6640		return B_CROSS_DEVICE_LINK;
6641
6642	if (fromVnode.Get() == toVnode.Get() && !strcmp(fromName, toName))
6643		return B_OK;
6644
6645	if (fromName[0] == '\0' || toName[0] == '\0'
6646		|| !strcmp(fromName, ".") || !strcmp(fromName, "..")
6647		|| !strcmp(toName, ".") || !strcmp(toName, "..")) {
6648		return B_BAD_VALUE;
6649	}
6650
6651	if (HAS_FS_CALL(fromVnode, rename))
6652		status = FS_CALL(fromVnode.Get(), rename, fromName, toVnode.Get(), toName);
6653	else
6654		status = B_READ_ONLY_DEVICE;
6655
6656	return status;
6657}
6658
6659
6660static status_t
6661common_read_stat(struct file_descriptor* descriptor, struct stat* stat)
6662{
6663	struct vnode* vnode = descriptor->u.vnode;
6664
6665	FUNCTION(("common_read_stat: stat %p\n", stat));
6666
6667	// TODO: remove this once all file systems properly set them!
6668	stat->st_crtim.tv_nsec = 0;
6669	stat->st_ctim.tv_nsec = 0;
6670	stat->st_mtim.tv_nsec = 0;
6671	stat->st_atim.tv_nsec = 0;
6672
6673	return vfs_stat_vnode(vnode, stat);
6674}
6675
6676
6677static status_t
6678common_write_stat(struct file_descriptor* descriptor, const struct stat* stat,
6679	int statMask)
6680{
6681	struct vnode* vnode = descriptor->u.vnode;
6682
6683	FUNCTION(("common_write_stat(vnode = %p, stat = %p, statMask = %d)\n",
6684		vnode, stat, statMask));
6685
6686	if ((descriptor->open_mode & O_RWMASK) == O_RDONLY
6687		&& (statMask & B_STAT_SIZE) != 0) {
6688		return B_BAD_VALUE;
6689	}
6690
6691	if (!HAS_FS_CALL(vnode, write_stat))
6692		return B_READ_ONLY_DEVICE;
6693
6694	return FS_CALL(vnode, write_stat, stat, statMask);
6695}
6696
6697
6698static status_t
6699common_path_read_stat(int fd, char* path, bool traverseLeafLink,
6700	struct stat* stat, bool kernel)
6701{
6702	FUNCTION(("common_path_read_stat: fd: %d, path '%s', stat %p,\n", fd, path,
6703		stat));
6704
6705	VnodePutter vnode;
6706	status_t status = fd_and_path_to_vnode(fd, path, traverseLeafLink, vnode,
6707		NULL, kernel);
6708	if (status != B_OK)
6709		return status;
6710
6711	status = vfs_stat_vnode(vnode.Get(), stat);
6712
6713	return status;
6714}
6715
6716
6717static status_t
6718common_path_write_stat(int fd, char* path, bool traverseLeafLink,
6719	const struct stat* stat, int statMask, bool kernel)
6720{
6721	FUNCTION(("common_write_stat: fd: %d, path '%s', stat %p, stat_mask %d, "
6722		"kernel %d\n", fd, path, stat, statMask, kernel));
6723
6724	VnodePutter vnode;
6725	status_t status = fd_and_path_to_vnode(fd, path, traverseLeafLink, vnode,
6726		NULL, kernel);
6727	if (status != B_OK)
6728		return status;
6729
6730	if (HAS_FS_CALL(vnode, write_stat))
6731		status = FS_CALL(vnode.Get(), write_stat, stat, statMask);
6732	else
6733		status = B_READ_ONLY_DEVICE;
6734
6735	return status;
6736}
6737
6738
6739static int
6740attr_dir_open(int fd, char* path, bool traverseLeafLink, bool kernel)
6741{
6742	FUNCTION(("attr_dir_open(fd = %d, path = '%s', kernel = %d)\n", fd, path,
6743		kernel));
6744
6745	VnodePutter vnode;
6746	status_t status = fd_and_path_to_vnode(fd, path, traverseLeafLink, vnode,
6747		NULL, kernel);
6748	if (status != B_OK)
6749		return status;
6750
6751	status = open_attr_dir_vnode(vnode.Get(), kernel);
6752	if (status >= 0)
6753		vnode.Detach();
6754
6755	return status;
6756}
6757
6758
6759static status_t
6760attr_dir_close(struct file_descriptor* descriptor)
6761{
6762	struct vnode* vnode = descriptor->u.vnode;
6763
6764	FUNCTION(("attr_dir_close(descriptor = %p)\n", descriptor));
6765
6766	if (HAS_FS_CALL(vnode, close_attr_dir))
6767		return FS_CALL(vnode, close_attr_dir, descriptor->cookie);
6768
6769	return B_OK;
6770}
6771
6772
6773static void
6774attr_dir_free_fd(struct file_descriptor* descriptor)
6775{
6776	struct vnode* vnode = descriptor->u.vnode;
6777
6778	if (vnode != NULL) {
6779		FS_CALL(vnode, free_attr_dir_cookie, descriptor->cookie);
6780		put_vnode(vnode);
6781	}
6782}
6783
6784
6785static status_t
6786attr_dir_read(struct io_context* ioContext, struct file_descriptor* descriptor,
6787	struct dirent* buffer, size_t bufferSize, uint32* _count)
6788{
6789	struct vnode* vnode = descriptor->u.vnode;
6790
6791	FUNCTION(("attr_dir_read(descriptor = %p)\n", descriptor));
6792
6793	if (HAS_FS_CALL(vnode, read_attr_dir))
6794		return FS_CALL(vnode, read_attr_dir, descriptor->cookie, buffer,
6795			bufferSize, _count);
6796
6797	return B_UNSUPPORTED;
6798}
6799
6800
6801static status_t
6802attr_dir_rewind(struct file_descriptor* descriptor)
6803{
6804	struct vnode* vnode = descriptor->u.vnode;
6805
6806	FUNCTION(("attr_dir_rewind(descriptor = %p)\n", descriptor));
6807
6808	if (HAS_FS_CALL(vnode, rewind_attr_dir))
6809		return FS_CALL(vnode, rewind_attr_dir, descriptor->cookie);
6810
6811	return B_UNSUPPORTED;
6812}
6813
6814
6815static int
6816attr_create(int fd, char* path, const char* name, uint32 type,
6817	int openMode, bool kernel)
6818{
6819	if (name == NULL || *name == '\0')
6820		return B_BAD_VALUE;
6821
6822	bool traverse = (openMode & (O_NOTRAVERSE | O_NOFOLLOW)) == 0;
6823	VnodePutter vnode;
6824	status_t status = fd_and_path_to_vnode(fd, path, traverse, vnode, NULL,
6825		kernel);
6826	if (status != B_OK)
6827		return status;
6828
6829	if ((openMode & O_NOFOLLOW) != 0 && S_ISLNK(vnode->Type()))
6830		return B_LINK_LIMIT;
6831
6832	if (!HAS_FS_CALL(vnode, create_attr))
6833		return B_READ_ONLY_DEVICE;
6834
6835	void* cookie;
6836	status = FS_CALL(vnode.Get(), create_attr, name, type, openMode, &cookie);
6837	if (status != B_OK)
6838		return status;
6839
6840	fd = get_new_fd(&sAttributeOps, NULL, vnode.Get(), cookie, openMode, kernel);
6841	if (fd >= 0) {
6842		vnode.Detach();
6843		return fd;
6844	}
6845
6846	status = fd;
6847
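	// The attribute was created, but no FD could be allocated for it:
	// tear the cookie down and remove the attribute again.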
6848	FS_CALL(vnode.Get(), close_attr, cookie);
6849	FS_CALL(vnode.Get(), free_attr_cookie, cookie);
6850
6851	FS_CALL(vnode.Get(), remove_attr, name);
6852
6853	return status;
6854}
6855
6856
6857static int
6858attr_open(int fd, char* path, const char* name, int openMode, bool kernel)
6859{
6860	if (name == NULL || *name == '\0')
6861		return B_BAD_VALUE;
6862
6863	bool traverse = (openMode & (O_NOTRAVERSE | O_NOFOLLOW)) == 0;
6864	VnodePutter vnode;
6865	status_t status = fd_and_path_to_vnode(fd, path, traverse, vnode, NULL,
6866		kernel);
6867	if (status != B_OK)
6868		return status;
6869
6870	if ((openMode & O_NOFOLLOW) != 0 && S_ISLNK(vnode->Type()))
6871		return B_LINK_LIMIT;
6872
6873	if (!HAS_FS_CALL(vnode, open_attr))
6874		return B_UNSUPPORTED;
6875
6876	void* cookie;
6877	status = FS_CALL(vnode.Get(), open_attr, name, openMode, &cookie);
6878	if (status != B_OK)
6879		return status;
6880
6881	// now we only need a file descriptor for this attribute and we're done
6882	fd = get_new_fd(&sAttributeOps, NULL, vnode.Get(), cookie, openMode, kernel);
6883	if (fd >= 0) {
6884		vnode.Detach();
6885		return fd;
6886	}
6887
6888	status = fd;
6889
6890	FS_CALL(vnode.Get(), close_attr, cookie);
6891	FS_CALL(vnode.Get(), free_attr_cookie, cookie);
6892
6893	return status;
6894}
6895
6896
6897static status_t
6898attr_close(struct file_descriptor* descriptor)
6899{
6900	struct vnode* vnode = descriptor->u.vnode;
6901
6902	FUNCTION(("attr_close(descriptor = %p)\n", descriptor));
6903
6904	if (HAS_FS_CALL(vnode, close_attr))
6905		return FS_CALL(vnode, close_attr, descriptor->cookie);
6906
6907	return B_OK;
6908}
6909
6910
6911static void
6912attr_free_fd(struct file_descriptor* descriptor)
6913{
6914	struct vnode* vnode = descriptor->u.vnode;
6915
6916	if (vnode != NULL) {
6917		FS_CALL(vnode, free_attr_cookie, descriptor->cookie);
6918		put_vnode(vnode);
6919	}
6920}
6921
6922
6923static status_t
6924attr_read(struct file_descriptor* descriptor, off_t pos, void* buffer,
6925	size_t* length)
6926{
6927	struct vnode* vnode = descriptor->u.vnode;
6928
6929	FUNCTION(("attr_read: buf %p, pos %" B_PRIdOFF ", len %p = %" B_PRIuSIZE
6930		"\n", buffer, pos, length, *length));
6931
6932	if (!HAS_FS_CALL(vnode, read_attr))
6933		return B_UNSUPPORTED;
6934
6935	return FS_CALL(vnode, read_attr, descriptor->cookie, pos, buffer, length);
6936}
6937
6938
6939static status_t
6940attr_write(struct file_descriptor* descriptor, off_t pos, const void* buffer,
6941	size_t* length)
6942{
6943	struct vnode* vnode = descriptor->u.vnode;
6944
6945	FUNCTION(("attr_write: buf %p, pos %" B_PRIdOFF ", len %p\n", buffer, pos,
6946		length));
6947
6948	if (!HAS_FS_CALL(vnode, write_attr))
6949		return B_UNSUPPORTED;
6950
6951	return FS_CALL(vnode, write_attr, descriptor->cookie, pos, buffer, length);
6952}
6953
6954
6955static off_t
6956attr_seek(struct file_descriptor* descriptor, off_t pos, int seekType)
6957{
6958	off_t offset;
6959
6960	switch (seekType) {
6961		case SEEK_SET:
6962			offset = 0;
6963			break;
6964		case SEEK_CUR:
6965			offset = descriptor->pos;
6966			break;
6967		case SEEK_END:
6968		{
6969			struct vnode* vnode = descriptor->u.vnode;
6970			if (!HAS_FS_CALL(vnode, read_attr_stat))
6971				return B_UNSUPPORTED;
6972
6973			struct stat stat;
6974			status_t status = FS_CALL(vnode, read_attr_stat, descriptor->cookie,
6975				&stat);
6976			if (status != B_OK)
6977				return status;
6978
6979			offset = stat.st_size;
6980			break;
6981		}
6982		default:
6983			return B_BAD_VALUE;
6984	}
6985
6986	// assumes off_t is 64 bits wide
6987	if (offset > 0 && LONGLONG_MAX - offset < pos)
6988		return B_BUFFER_OVERFLOW;
6989
6990	pos += offset;
6991	if (pos < 0)
6992		return B_BAD_VALUE;
6993
6994	return descriptor->pos = pos;
6995}
6996
6997
6998static status_t
6999attr_read_stat(struct file_descriptor* descriptor, struct stat* stat)
7000{
7001	struct vnode* vnode = descriptor->u.vnode;
7002
7003	FUNCTION(("attr_read_stat: stat 0x%p\n", stat));
7004
7005	if (!HAS_FS_CALL(vnode, read_attr_stat))
7006		return B_UNSUPPORTED;
7007
7008	return FS_CALL(vnode, read_attr_stat, descriptor->cookie, stat);
7009}
7010
7011
7012static status_t
7013attr_write_stat(struct file_descriptor* descriptor, const struct stat* stat,
7014	int statMask)
7015{
7016	struct vnode* vnode = descriptor->u.vnode;
7017
7018	FUNCTION(("attr_write_stat: stat = %p, statMask %d\n", stat, statMask));
7019
7020	if (!HAS_FS_CALL(vnode, write_attr_stat))
7021		return B_READ_ONLY_DEVICE;
7022
7023	return FS_CALL(vnode, write_attr_stat, descriptor->cookie, stat, statMask);
7024}
7025
7026
7027static status_t
7028attr_remove(int fd, const char* name, bool kernel)
7029{
7030	if (name == NULL || *name == '\0')
7031		return B_BAD_VALUE;
7032
7033	FUNCTION(("attr_remove: fd = %d, name = \"%s\", kernel %d\n", fd, name,
7034		kernel));
7035
7036	struct vnode* vnode;
7037	FileDescriptorPutter descriptor(get_fd_and_vnode(fd, &vnode, kernel));
7038	if (!descriptor.IsSet())
7039		return B_FILE_ERROR;
7040
7041	status_t status;
7042	if (HAS_FS_CALL(vnode, remove_attr))
7043		status = FS_CALL(vnode, remove_attr, name);
7044	else
7045		status = B_READ_ONLY_DEVICE;
7046
7047	return status;
7048}
7049
7050
7051static status_t
7052attr_rename(int fromFD, const char* fromName, int toFD, const char* toName,
7053	bool kernel)
7054{
7055	if (fromName == NULL || *fromName == '\0' || toName == NULL
7056		|| *toName == '\0')
7057		return B_BAD_VALUE;
7058
7059	FUNCTION(("attr_rename: from fd = %d, from name = \"%s\", to fd = %d, to "
7060		"name = \"%s\", kernel %d\n", fromFD, fromName, toFD, toName, kernel));
7061
7062	struct vnode* fromVnode;
7063	FileDescriptorPutter fromDescriptor(get_fd_and_vnode(fromFD, &fromVnode, kernel));
7064	if (!fromDescriptor.IsSet())
7065		return B_FILE_ERROR;
7066
7067	struct vnode* toVnode;
7068	FileDescriptorPutter toDescriptor(get_fd_and_vnode(toFD, &toVnode, kernel));
7069	if (!toDescriptor.IsSet())
7070		return B_FILE_ERROR;
7071
7072	// are the files on the same volume?
7073	if (fromVnode->device != toVnode->device)
7074		return B_CROSS_DEVICE_LINK;
7075
7076	status_t status;
7077	if (HAS_FS_CALL(fromVnode, rename_attr)) {
7078		status = FS_CALL(fromVnode, rename_attr, fromName, toVnode, toName);
7079	} else
7080		status = B_READ_ONLY_DEVICE;
7081
7082	return status;
7083}
7084
7085
7086static int
7087index_dir_open(dev_t mountID, bool kernel)
7088{
7089	struct fs_mount* mount;
7090	void* cookie;
7091
7092	FUNCTION(("index_dir_open(mountID = %" B_PRId32 ", kernel = %d)\n", mountID,
7093		kernel));
7094
7095	status_t status = get_mount(mountID, &mount);
7096	if (status != B_OK)
7097		return status;
7098
7099	if (!HAS_FS_MOUNT_CALL(mount, open_index_dir)) {
7100		status = B_UNSUPPORTED;
7101		goto error;
7102	}
7103
7104	status = FS_MOUNT_CALL(mount, open_index_dir, &cookie);
7105	if (status != B_OK)
7106		goto error;
7107
7108	// get fd for the index directory
7109	int fd;
7110	fd = get_new_fd(&sIndexDirectoryOps, mount, NULL, cookie, O_CLOEXEC, kernel);
7111	if (fd >= 0)
7112		return fd;
7113
7114	// something went wrong
7115	FS_MOUNT_CALL(mount, close_index_dir, cookie);
7116	FS_MOUNT_CALL(mount, free_index_dir_cookie, cookie);
7117
7118	status = fd;
7119
7120error:
7121	put_mount(mount);
7122	return status;
7123}
7124
7125
7126static status_t
7127index_dir_close(struct file_descriptor* descriptor)
7128{
7129	struct fs_mount* mount = descriptor->u.mount;
7130
7131	FUNCTION(("index_dir_close(descriptor = %p)\n", descriptor));
7132
7133	if (HAS_FS_MOUNT_CALL(mount, close_index_dir))
7134		return FS_MOUNT_CALL(mount, close_index_dir, descriptor->cookie);
7135
7136	return B_OK;
7137}
7138
7139
7140static void
7141index_dir_free_fd(struct file_descriptor* descriptor)
7142{
7143	struct fs_mount* mount = descriptor->u.mount;
7144
7145	if (mount != NULL) {
7146		FS_MOUNT_CALL(mount, free_index_dir_cookie, descriptor->cookie);
7147		put_mount(mount);
7148	}
7149}
7150
7151
7152static status_t
7153index_dir_read(struct io_context* ioContext, struct file_descriptor* descriptor,
7154	struct dirent* buffer, size_t bufferSize, uint32* _count)
7155{
7156	struct fs_mount* mount = descriptor->u.mount;
7157
7158	if (HAS_FS_MOUNT_CALL(mount, read_index_dir)) {
7159		return FS_MOUNT_CALL(mount, read_index_dir, descriptor->cookie, buffer,
7160			bufferSize, _count);
7161	}
7162
7163	return B_UNSUPPORTED;
7164}
7165
7166
7167static status_t
7168index_dir_rewind(struct file_descriptor* descriptor)
7169{
7170	struct fs_mount* mount = descriptor->u.mount;
7171
7172	if (HAS_FS_MOUNT_CALL(mount, rewind_index_dir))
7173		return FS_MOUNT_CALL(mount, rewind_index_dir, descriptor->cookie);
7174
7175	return B_UNSUPPORTED;
7176}
7177
7178
7179static status_t
7180index_create(dev_t mountID, const char* name, uint32 type, uint32 flags,
7181	bool kernel)
7182{
7183	FUNCTION(("index_create(mountID = %" B_PRId32 ", name = %s, kernel = %d)\n",
7184		mountID, name, kernel));
7185
7186	struct fs_mount* mount;
7187	status_t status = get_mount(mountID, &mount);
7188	if (status != B_OK)
7189		return status;
7190
7191	if (!HAS_FS_MOUNT_CALL(mount, create_index)) {
7192		status = B_READ_ONLY_DEVICE;
7193		goto out;
7194	}
7195
7196	status = FS_MOUNT_CALL(mount, create_index, name, type, flags);
7197
7198out:
7199	put_mount(mount);
7200	return status;
7201}
7202
7203
7204#if 0
7205static status_t
7206index_read_stat(struct file_descriptor* descriptor, struct stat* stat)
7207{
7208	struct vnode* vnode = descriptor->u.vnode;
7209
7210	// TODO: currently unused!
7211	FUNCTION(("index_read_stat: stat 0x%p\n", stat));
7212	if (!HAS_FS_CALL(vnode, read_index_stat))
7213		return B_UNSUPPORTED;
7214
7215	return B_UNSUPPORTED;
7216	//return FS_CALL(vnode, read_index_stat, descriptor->cookie, stat);
7217}
7218
7219
7220static void
7221index_free_fd(struct file_descriptor* descriptor)
7222{
7223	struct vnode* vnode = descriptor->u.vnode;
7224
7225	if (vnode != NULL) {
7226		FS_CALL(vnode, free_index_cookie, descriptor->cookie);
7227		put_vnode(vnode);
7228	}
7229}
7230#endif
7231
7232
7233static status_t
7234index_name_read_stat(dev_t mountID, const char* name, struct stat* stat,
7235	bool kernel)
7236{
7237	FUNCTION(("index_name_read_stat(mountID = %" B_PRId32 ", name = %s, kernel = %d)\n",
7238		mountID, name, kernel));
7239
7240	struct fs_mount* mount;
7241	status_t status = get_mount(mountID, &mount);
7242	if (status != B_OK)
7243		return status;
7244
7245	if (!HAS_FS_MOUNT_CALL(mount, read_index_stat)) {
7246		status = B_UNSUPPORTED;
7247		goto out;
7248	}
7249
7250	status = FS_MOUNT_CALL(mount, read_index_stat, name, stat);
7251
7252out:
7253	put_mount(mount);
7254	return status;
7255}
7256
7257
7258static status_t
7259index_remove(dev_t mountID, const char* name, bool kernel)
7260{
7261	FUNCTION(("index_remove(mountID = %" B_PRId32 ", name = %s, kernel = %d)\n",
7262		mountID, name, kernel));
7263
7264	struct fs_mount* mount;
7265	status_t status = get_mount(mountID, &mount);
7266	if (status != B_OK)
7267		return status;
7268
7269	if (!HAS_FS_MOUNT_CALL(mount, remove_index)) {
7270		status = B_READ_ONLY_DEVICE;
7271		goto out;
7272	}
7273
7274	status = FS_MOUNT_CALL(mount, remove_index, name);
7275
7276out:
7277	put_mount(mount);
7278	return status;
7279}
7280
7281
7282/*!	TODO: the query FS API is still pretty much the same as in R5.
7283		It would be nice if file systems would get some more kernel support
7284		for queries.
7285		For example, query parsing should be moved into the kernel.
7286*/
7287static int
7288query_open(dev_t device, const char* query, uint32 flags, port_id port,
7289	int32 token, bool kernel)
7290{
7291	struct fs_mount* mount;
7292	void* cookie;
7293
7294	FUNCTION(("query_open(device = %" B_PRId32 ", query = \"%s\", kernel = %d)\n",
7295		device, query, kernel));
7296
7297	status_t status = get_mount(device, &mount);
7298	if (status != B_OK)
7299		return status;
7300
7301	if (!HAS_FS_MOUNT_CALL(mount, open_query)) {
7302		status = B_UNSUPPORTED;
7303		goto error;
7304	}
7305
7306	status = FS_MOUNT_CALL(mount, open_query, query, flags, port, token,
7307		&cookie);
7308	if (status != B_OK)
7309		goto error;
7310
7311	// get fd for the query
7312	int fd;
7313	fd = get_new_fd(&sQueryOps, mount, NULL, cookie, O_CLOEXEC, kernel);
7314	if (fd >= 0)
7315		return fd;
7316
7317	status = fd;
7318
7319	// something went wrong
7320	FS_MOUNT_CALL(mount, close_query, cookie);
7321	FS_MOUNT_CALL(mount, free_query_cookie, cookie);
7322
7323error:
7324	put_mount(mount);
7325	return status;
7326}
7327
7328
7329static status_t
7330query_close(struct file_descriptor* descriptor)
7331{
7332	struct fs_mount* mount = descriptor->u.mount;
7333
7334	FUNCTION(("query_close(descriptor = %p)\n", descriptor));
7335
7336	if (HAS_FS_MOUNT_CALL(mount, close_query))
7337		return FS_MOUNT_CALL(mount, close_query, descriptor->cookie);
7338
7339	return B_OK;
7340}
7341
7342
7343static void
7344query_free_fd(struct file_descriptor* descriptor)
7345{
7346	struct fs_mount* mount = descriptor->u.mount;
7347
7348	if (mount != NULL) {
7349		FS_MOUNT_CALL(mount, free_query_cookie, descriptor->cookie);
7350		put_mount(mount);
7351	}
7352}
7353
7354
7355static status_t
7356query_read(struct io_context* ioContext, struct file_descriptor* descriptor,
7357	struct dirent* buffer, size_t bufferSize, uint32* _count)
7358{
7359	struct fs_mount* mount = descriptor->u.mount;
7360
7361	if (HAS_FS_MOUNT_CALL(mount, read_query)) {
7362		return FS_MOUNT_CALL(mount, read_query, descriptor->cookie, buffer,
7363			bufferSize, _count);
7364	}
7365
7366	return B_UNSUPPORTED;
7367}
7368
7369
7370static status_t
7371query_rewind(struct file_descriptor* descriptor)
7372{
7373	struct fs_mount* mount = descriptor->u.mount;
7374
7375	if (HAS_FS_MOUNT_CALL(mount, rewind_query))
7376		return FS_MOUNT_CALL(mount, rewind_query, descriptor->cookie);
7377
7378	return B_UNSUPPORTED;
7379}
7380
7381
7382//	#pragma mark - General File System functions
7383
7384
7385static dev_t
7386fs_mount(char* path, const char* device, const char* fsName, uint32 flags,
7387	const char* args, bool kernel)
7388{
7389	struct ::fs_mount* mount;
7390	status_t status = B_OK;
7391	fs_volume* volume = NULL;
7392	int32 layer = 0;
7393	Vnode* coveredNode = NULL;
7394
7395	FUNCTION(("fs_mount: path = '%s', device = '%s', fs_name = '%s', flags = %#"
7396		B_PRIx32 ", args = '%s'\n", path, device, fsName, flags, args));
7397
7398	// The path is always safe, we just have to make sure that fsName is
7399	// at least superficially valid - we can't make any assumptions about
7400	// args, though. A NULL fsName is OK if a device was given and the FS is
7401	// not virtual; we'll get the name from the DDM later.
7402	if (fsName == NULL) {
7403		if (!device || flags & B_MOUNT_VIRTUAL_DEVICE)
7404			return B_BAD_VALUE;
7405	} else if (fsName[0] == '\0')
7406		return B_BAD_VALUE;
7407
7408	RecursiveLocker mountOpLocker(sMountOpLock);
7409
7410	// Helper to delete a newly created file device on failure.
7411	// Not exactly beautiful, but helps to keep the code below cleaner.
7412	struct FileDeviceDeleter {
7413		FileDeviceDeleter() : id(-1) {}
7414		~FileDeviceDeleter()
7415		{
7416			KDiskDeviceManager::Default()->DeleteFileDevice(id);
7417		}
7418
7419		partition_id id;
7420	} fileDeviceDeleter;
7421
7422	// If the file system is not a "virtual" one, the device argument should
7423	// point to a real file/device (if given at all).
7424	// get the partition
7425	KDiskDeviceManager* ddm = KDiskDeviceManager::Default();
7426	KPartition* partition = NULL;
7427	KPath normalizedDevice;
7428	bool newlyCreatedFileDevice = false;
7429
7430	if (!(flags & B_MOUNT_VIRTUAL_DEVICE) && device != NULL) {
7431		// normalize the device path
7432		status = normalizedDevice.SetTo(device, true);
7433		if (status != B_OK)
7434			return status;
7435
7436		// get a corresponding partition from the DDM
7437		partition = ddm->RegisterPartition(normalizedDevice.Path());
7438		if (partition == NULL) {
7439			// Partition not found: This either means the user supplied
7440			// an invalid path, or the path refers to an image file. We try
7441			// to let the DDM create a file device for the path.
7442			partition_id deviceID = ddm->CreateFileDevice(
7443				normalizedDevice.Path(), &newlyCreatedFileDevice);
7444			if (deviceID >= 0) {
7445				partition = ddm->RegisterPartition(deviceID);
7446				if (newlyCreatedFileDevice)
7447					fileDeviceDeleter.id = deviceID;
7448			}
7449		}
7450
7451		if (!partition) {
7452			TRACE(("fs_mount(): Partition `%s' not found.\n",
7453				normalizedDevice.Path()));
7454			return B_ENTRY_NOT_FOUND;
7455		}
7456
7457		device = normalizedDevice.Path();
7458			// correct path to file device
7459	}
7460	PartitionRegistrar partitionRegistrar(partition, true);
7461
7462	// Write lock the partition's device. For the time being, we keep the lock
7463	// until we're done mounting -- not nice, but it ensures that no one is
7464	// interfering.
7465	// TODO: Just mark the partition busy while mounting!
7466	KDiskDevice* diskDevice = NULL;
7467	if (partition) {
7468		diskDevice = ddm->WriteLockDevice(partition->Device()->ID());
7469		if (!diskDevice) {
7470			TRACE(("fs_mount(): Failed to lock disk device!\n"));
7471			return B_ERROR;
7472		}
7473	}
7474
7475	DeviceWriteLocker writeLocker(diskDevice, true);
7476		// this takes over the write lock acquired before
7477
7478	if (partition != NULL) {
7479		// make sure that the partition is not busy
7480		if (partition->IsBusy()) {
7481			TRACE(("fs_mount(): Partition is busy.\n"));
7482			return B_BUSY;
7483		}
7484
7485		// if no FS name had been supplied, we get it from the partition
7486		if (fsName == NULL) {
7487			KDiskSystem* diskSystem = partition->DiskSystem();
7488			if (!diskSystem) {
7489				TRACE(("fs_mount(): No FS name was given, and the DDM didn't "
7490					"recognize it.\n"));
7491				return B_BAD_VALUE;
7492			}
7493
7494			if (!diskSystem->IsFileSystem()) {
7495				TRACE(("fs_mount(): No FS name was given, and the DDM found a "
7496					"partitioning system.\n"));
7497				return B_BAD_VALUE;
7498			}
7499
7500			// The disk system name will not change, and the KDiskSystem
7501			// object will not go away while the disk device is locked (and
7502			// the partition has a reference to it), so this is safe.
7503			fsName = diskSystem->Name();
7504		}
7505	}
7506
7507	mount = new(std::nothrow) (struct ::fs_mount);
7508	if (mount == NULL)
7509		return B_NO_MEMORY;
7510
7511	mount->device_name = strdup(device);
7512		// "device" can be NULL
7513
7514	status = mount->entry_cache.Init();
7515	if (status != B_OK)
7516		goto err1;
7517
7518	// initialize structure
7519	mount->id = sNextMountID++;
7520	mount->partition = NULL;
7521	mount->root_vnode = NULL;
7522	mount->covers_vnode = NULL;
7523	mount->unmounting = false;
7524	mount->owns_file_device = false;
7525	mount->volume = NULL;
7526
7527	// build up the volume(s)
7528	while (true) {
7529		char* layerFSName = get_file_system_name_for_layer(fsName, layer);
7530		if (layerFSName == NULL) {
7531			if (layer == 0) {
7532				status = B_NO_MEMORY;
7533				goto err1;
7534			}
7535
7536			break;
7537		}
7538		MemoryDeleter layerFSNameDeleter(layerFSName);
7539
7540		volume = (fs_volume*)malloc(sizeof(fs_volume));
7541		if (volume == NULL) {
7542			status = B_NO_MEMORY;
7543			goto err1;
7544		}
7545
7546		volume->id = mount->id;
7547		volume->partition = partition != NULL ? partition->ID() : -1;
7548		volume->layer = layer++;
7549		volume->private_volume = NULL;
7550		volume->ops = NULL;
7551		volume->sub_volume = NULL;
7552		volume->super_volume = NULL;
7553		volume->file_system = NULL;
7554		volume->file_system_name = NULL;
7555
7556		volume->file_system_name = get_file_system_name(layerFSName);
7557		if (volume->file_system_name == NULL) {
7558			status = B_NO_MEMORY;
7559			free(volume);
7560			goto err1;
7561		}
7562
7563		volume->file_system = get_file_system(layerFSName);
7564		if (volume->file_system == NULL) {
7565			status = B_DEVICE_NOT_FOUND;
7566			free(volume->file_system_name);
7567			free(volume);
7568			goto err1;
7569		}
7570
7571		if (mount->volume == NULL)
7572			mount->volume = volume;
7573		else {
7574			volume->super_volume = mount->volume;
7575			mount->volume->sub_volume = volume;
7576			mount->volume = volume;
7577		}
7578	}
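	// mount->volume now refers to the volume created last; following its
	// super_volume links leads back to the volume created first.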
7579
7580	// insert mount struct into list before we call FS's mount() function
7581	// so that vnodes can be created for this mount
7582	rw_lock_write_lock(&sMountLock);
7583	sMountsTable->Insert(mount);
7584	rw_lock_write_unlock(&sMountLock);
7585
7586	ino_t rootID;
7587
7588	if (!sRoot) {
7589		// we haven't mounted anything yet
7590		if (strcmp(path, "/") != 0) {
7591			status = B_ERROR;
7592			goto err2;
7593		}
7594
7595		status = mount->volume->file_system->mount(mount->volume, device, flags,
7596			args, &rootID);
7597		if (status != B_OK || mount->volume->ops == NULL)
7598			goto err2;
7599	} else {
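		// path_to_vnode() returns a referenced vnode; detach it from the
		// putter so coveredNode keeps that reference until it is either
		// handed to the mount or put on the error paths below.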
7600		{
7601			VnodePutter temp;
7602			status = path_to_vnode(path, true, temp, NULL, kernel);
7603			coveredNode = temp.Detach();
7604		}
7605		if (status != B_OK)
7606			goto err2;
7607
7608		mount->covers_vnode = coveredNode;
7609
7610		// make sure the covered vnode is a directory
7611		if (!S_ISDIR(coveredNode->Type())) {
7612			status = B_NOT_A_DIRECTORY;
7613			goto err3;
7614		}
7615
7616		if (coveredNode->IsCovered()) {
7617			// this is already a covered vnode
7618			status = B_BUSY;
7619			goto err3;
7620		}
7621
7622		// mount it/them
7623		fs_volume* volume = mount->volume;
7624		while (volume) {
7625			status = volume->file_system->mount(volume, device, flags, args,
7626				&rootID);
7627			if (status != B_OK || volume->ops == NULL) {
7628				if (status == B_OK && volume->ops == NULL)
7629					panic("fs_mount: mount() succeeded but ops is NULL!");
7630				if (volume->sub_volume)
7631					goto err4;
7632				goto err3;
7633			}
7634
7635			volume = volume->super_volume;
7636		}
7637
7638		volume = mount->volume;
7639		while (volume) {
7640			if (volume->ops->all_layers_mounted != NULL)
7641				volume->ops->all_layers_mounted(volume);
7642			volume = volume->super_volume;
7643		}
7644	}
7645
7646	// the root node is supposed to be owned by the file system - it must
7647	// exist at this point
7648	rw_lock_write_lock(&sVnodeLock);
7649	mount->root_vnode = lookup_vnode(mount->id, rootID);
7650	if (mount->root_vnode == NULL || mount->root_vnode->ref_count != 1) {
7651		panic("fs_mount: file system does not own its root node!\n");
7652		status = B_ERROR;
7653		rw_lock_write_unlock(&sVnodeLock);
7654		goto err4;
7655	}
7656
7657	// set up the links between the root vnode and the vnode it covers
7658	if (coveredNode != NULL) {
7659		if (coveredNode->IsCovered()) {
7660			// the vnode is covered now
7661			status = B_BUSY;
7662			rw_lock_write_unlock(&sVnodeLock);
7663			goto err4;
7664		}
7665
7666		mount->root_vnode->covers = coveredNode;
7667		mount->root_vnode->SetCovering(true);
7668
7669		coveredNode->covered_by = mount->root_vnode;
7670		coveredNode->SetCovered(true);
7671	}
7672	rw_lock_write_unlock(&sVnodeLock);
7673
7674	if (!sRoot) {
7675		sRoot = mount->root_vnode;
7676		mutex_lock(&sIOContextRootLock);
7677		get_current_io_context(true)->root = sRoot;
7678		mutex_unlock(&sIOContextRootLock);
7679		inc_vnode_ref_count(sRoot);
7680	}
7681
7682	// supply the partition (if any) with the mount cookie and mark it mounted
7683	if (partition) {
7684		partition->SetMountCookie(mount->volume->private_volume);
7685		partition->SetVolumeID(mount->id);
7686
7687		// keep a partition reference as long as the partition is mounted
7688		partitionRegistrar.Detach();
7689		mount->partition = partition;
7690		mount->owns_file_device = newlyCreatedFileDevice;
7691		fileDeviceDeleter.id = -1;
7692	}
7693
7694	notify_mount(mount->id,
7695		coveredNode != NULL ? coveredNode->device : -1,
7696		coveredNode ? coveredNode->id : -1);
7697
7698	return mount->id;
7699
7700err4:
7701	FS_MOUNT_CALL_NO_PARAMS(mount, unmount);
7702err3:
7703	if (coveredNode != NULL)
7704		put_vnode(coveredNode);
7705err2:
7706	rw_lock_write_lock(&sMountLock);
7707	sMountsTable->Remove(mount);
7708	rw_lock_write_unlock(&sMountLock);
7709err1:
7710	delete mount;
7711
7712	return status;
7713}
7714
7715
7716static status_t
7717fs_unmount(char* path, dev_t mountID, uint32 flags, bool kernel)
7718{
7719	struct fs_mount* mount;
7720	status_t err;
7721
7722	FUNCTION(("fs_unmount(path '%s', dev %" B_PRId32 ", kernel %d)\n", path,
7723		mountID, kernel));
7724
7725	VnodePutter pathVnode;
7726	if (path != NULL) {
7727		err = path_to_vnode(path, true, pathVnode, NULL, kernel);
7728		if (err != B_OK)
7729			return B_ENTRY_NOT_FOUND;
7730	}
7731
7732	RecursiveLocker mountOpLocker(sMountOpLock);
7733	ReadLocker mountLocker(sMountLock);
7734
7735	mount = find_mount(path != NULL ? pathVnode->device : mountID);
7736	if (mount == NULL) {
7737		panic("fs_unmount: find_mount() failed on root vnode @%p of mount\n",
7738			pathVnode.Get());
7739	}
7740
7741	mountLocker.Unlock();
7742
7743	if (path != NULL) {
7744		if (mount->root_vnode != pathVnode.Get()) {
7745			// not a mount point
7746			return B_BAD_VALUE;
7747		}
7748
7749		pathVnode.Unset();
7750	}
7751
7752	// if the volume is associated with a partition, lock the device of the
7753	// partition as long as we are unmounting
7754	KDiskDeviceManager* ddm = KDiskDeviceManager::Default();
7755	KPartition* partition = mount->partition;
7756	KDiskDevice* diskDevice = NULL;
7757	if (partition != NULL) {
7758		if (partition->Device() == NULL) {
7759			dprintf("fs_unmount(): There is no device!\n");
7760			return B_ERROR;
7761		}
7762		diskDevice = ddm->WriteLockDevice(partition->Device()->ID());
7763		if (!diskDevice) {
7764			TRACE(("fs_unmount(): Failed to lock disk device!\n"));
7765			return B_ERROR;
7766		}
7767	}
7768	DeviceWriteLocker writeLocker(diskDevice, true);
7769
7770	// make sure that the partition is not busy
7771	if (partition != NULL) {
7772		if ((flags & B_UNMOUNT_BUSY_PARTITION) == 0 && partition->IsBusy()) {
7773			dprintf("fs_unmount(): Partition is busy.\n");
7774			return B_BUSY;
7775		}
7776	}
7777
7778	// grab the vnode master mutex to keep someone from creating
7779	// a vnode while we're figuring out if we can continue
7780	WriteLocker vnodesWriteLocker(&sVnodeLock);
7781
7782	bool disconnectedDescriptors = false;
7783
7784	while (true) {
7785		bool busy = false;
7786
		// cycle through the list of vnodes associated with this mount and
		// make sure none of them is busy or still referenced
7789		VnodeList::Iterator iterator = mount->vnodes.GetIterator();
7790		while (struct vnode* vnode = iterator.Next()) {
7791			if (vnode->IsBusy()) {
7792				dprintf("fs_unmount(): inode %" B_PRIdINO " is busy\n", vnode->id);
7793				busy = true;
7794				break;
7795			}
7796
7797			// check the vnode's ref count -- subtract additional references for
7798			// covering
7799			int32 refCount = vnode->ref_count;
7800			if (vnode->covers != NULL)
7801				refCount--;
7802			if (vnode->covered_by != NULL)
7803				refCount--;
7804
7805			if (refCount != 0) {
7806				dprintf("fs_unmount(): inode %" B_PRIdINO " is still referenced\n", vnode->id);
7807				// there are still vnodes in use on this mount, so we cannot
7808				// unmount yet
7809				busy = true;
7810				break;
7811			}
7812		}
7813
7814		if (!busy)
7815			break;
7816
7817		if ((flags & B_FORCE_UNMOUNT) == 0)
7818			return B_BUSY;
7819
7820		if (disconnectedDescriptors) {
7821			// wait a bit until the last access is finished, and then try again
7822			vnodesWriteLocker.Unlock();
7823			snooze(100000);
7824			// TODO: if there is some kind of bug that prevents the ref counts
7825			// from getting back to zero, this will fall into an endless loop...
7826			vnodesWriteLocker.Lock();
7827			continue;
7828		}
7829
7830		// the file system is still busy - but we're forced to unmount it,
7831		// so let's disconnect all open file descriptors
7832
7833		mount->unmounting = true;
7834			// prevent new vnodes from being created
7835
7836		vnodesWriteLocker.Unlock();
7837
7838		disconnect_mount_or_vnode_fds(mount, NULL);
7839		disconnectedDescriptors = true;
7840
7841		vnodesWriteLocker.Lock();
7842	}
7843
	// We can safely continue. Mark all of the vnodes busy and put this
	// mount structure into unmounting state. Also undo the vnode
	// covers/covered_by links.
7847	mount->unmounting = true;
7848
7849	VnodeList::Iterator iterator = mount->vnodes.GetIterator();
7850	while (struct vnode* vnode = iterator.Next()) {
7851		// Remove all covers/covered_by links from other mounts' nodes to this
7852		// vnode and adjust the node ref count accordingly. We will release the
7853		// references to the external vnodes below.
7854		if (Vnode* coveredNode = vnode->covers) {
7855			if (Vnode* coveringNode = vnode->covered_by) {
7856				// We have both covered and covering vnodes, so just remove us
7857				// from the chain.
7858				coveredNode->covered_by = coveringNode;
7859				coveringNode->covers = coveredNode;
7860				vnode->ref_count -= 2;
7861
7862				vnode->covered_by = NULL;
7863				vnode->covers = NULL;
7864				vnode->SetCovering(false);
7865				vnode->SetCovered(false);
7866			} else {
7867				// We only have a covered vnode. Remove its link to us.
7868				coveredNode->covered_by = NULL;
7869				coveredNode->SetCovered(false);
7870				vnode->ref_count--;
7871
				// If the other node is an external vnode, we keep its link
				// around so that we can put the reference later on. Otherwise
				// we get rid of it right now.
7875				if (coveredNode->mount == mount) {
7876					vnode->covers = NULL;
7877					coveredNode->ref_count--;
7878				}
7879			}
7880		} else if (Vnode* coveringNode = vnode->covered_by) {
7881			// We only have a covering vnode. Remove its link to us.
7882			coveringNode->covers = NULL;
7883			coveringNode->SetCovering(false);
7884			vnode->ref_count--;
7885
			// If the other node is an external vnode, we keep its link
			// around so that we can put the reference later on. Otherwise
			// we get rid of it right now.
7889			if (coveringNode->mount == mount) {
7890				vnode->covered_by = NULL;
7891				coveringNode->ref_count--;
7892			}
7893		}
7894
7895		vnode->SetBusy(true);
7896		vnode_to_be_freed(vnode);
7897	}
7898
7899	vnodesWriteLocker.Unlock();
7900
	// Free all vnodes associated with this mount.
	// They will be removed from the mount list by free_vnode(), so
	// we don't have to do that here.
7904	while (struct vnode* vnode = mount->vnodes.Head()) {
7905		// Put the references to external covered/covering vnodes we kept above.
7906		if (Vnode* coveredNode = vnode->covers)
7907			put_vnode(coveredNode);
7908		if (Vnode* coveringNode = vnode->covered_by)
7909			put_vnode(coveringNode);
7910
7911		free_vnode(vnode, false);
7912	}
7913
7914	// remove the mount structure from the hash table
7915	rw_lock_write_lock(&sMountLock);
7916	sMountsTable->Remove(mount);
7917	rw_lock_write_unlock(&sMountLock);
7918
7919	mountOpLocker.Unlock();
7920
7921	FS_MOUNT_CALL_NO_PARAMS(mount, unmount);
7922	notify_unmount(mount->id);
7923
7924	// dereference the partition and mark it unmounted
7925	if (partition) {
7926		partition->SetVolumeID(-1);
7927		partition->SetMountCookie(NULL);
7928
7929		if (mount->owns_file_device)
7930			KDiskDeviceManager::Default()->DeleteFileDevice(partition->ID());
7931		partition->Unregister();
7932	}
7933
7934	delete mount;
7935	return B_OK;
7936}
7937
7938
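/*!	\brief Writes back all modified data of the volume given by \a device.

	First writes back the file caches of all non-busy vnodes on the mount,
	then lets the file system do its own synchronization via its sync()
	hook, and finally asks the underlying device (if any) to flush its
	write cache.
*/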
7939static status_t
7940fs_sync(dev_t device)
7941{
7942	struct fs_mount* mount;
7943	status_t status = get_mount(device, &mount);
7944	if (status != B_OK)
7945		return status;
7946
7947	struct vnode marker;
7948	memset(&marker, 0, sizeof(marker));
7949	marker.SetBusy(true);
7950	marker.SetRemoved(true);
7951
7952	// First, synchronize all file caches
7953
7954	while (true) {
7955		WriteLocker locker(sVnodeLock);
			// Note: That's the easy way, which is probably OK for sync(),
			// since it's a relatively rare call and doesn't need to allow for
			// a lot of concurrency. Using a read lock would be possible, but
			// also more involved, since we would have to lock the individual
			// nodes and take care of the locking order, which we might not
			// want to do while holding fs_mount::lock.
7962
7963		// synchronize access to vnode list
7964		mutex_lock(&mount->lock);
7965
7966		struct vnode* vnode;
7967		if (!marker.IsRemoved()) {
7968			vnode = mount->vnodes.GetNext(&marker);
7969			mount->vnodes.Remove(&marker);
7970			marker.SetRemoved(true);
7971		} else
7972			vnode = mount->vnodes.First();
7973
7974		while (vnode != NULL && (vnode->cache == NULL
7975			|| vnode->IsRemoved() || vnode->IsBusy())) {
7976			// TODO: we could track writes (and writable mapped vnodes)
7977			//	and have a simple flag that we could test for here
7978			vnode = mount->vnodes.GetNext(vnode);
7979		}
7980
7981		if (vnode != NULL) {
7982			// insert marker vnode again
7983			mount->vnodes.InsertBefore(mount->vnodes.GetNext(vnode), &marker);
7984			marker.SetRemoved(false);
7985		}
7986
7987		mutex_unlock(&mount->lock);
7988
7989		if (vnode == NULL)
7990			break;
7991
7992		vnode = lookup_vnode(mount->id, vnode->id);
7993		if (vnode == NULL || vnode->IsBusy())
7994			continue;
7995
7996		if (vnode->ref_count == 0) {
7997			// this vnode has been unused before
7998			vnode_used(vnode);
7999		}
8000		inc_vnode_ref_count(vnode);
8001
8002		locker.Unlock();
8003
8004		if (vnode->cache != NULL && !vnode->IsRemoved())
8005			vnode->cache->WriteModified();
8006
8007		put_vnode(vnode);
8008	}
8009
8010	// Let the file systems do their synchronizing work
8011	if (HAS_FS_MOUNT_CALL(mount, sync))
8012		status = FS_MOUNT_CALL_NO_PARAMS(mount, sync);
8013
	// Finally, flush the underlying device's write cache (if possible).
8015	if (mount->partition != NULL && mount->partition->Device() != NULL)
8016		ioctl(mount->partition->Device()->FD(), B_FLUSH_DRIVE_CACHE);
8017
8018	put_mount(mount);
8019	return status;
8020}
8021
8022
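/*!	\brief Retrieves general information about the volume given by
		\a device into \a info.

	The file system fills in what it knows via its read_fs_info() hook (if
	implemented); the volume and root node IDs, the FS handler name, and
	the device name are filled in here on success.
*/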
8023static status_t
8024fs_read_info(dev_t device, struct fs_info* info)
8025{
8026	struct fs_mount* mount;
8027	status_t status = get_mount(device, &mount);
8028	if (status != B_OK)
8029		return status;
8030
8031	memset(info, 0, sizeof(struct fs_info));
8032
8033	if (HAS_FS_MOUNT_CALL(mount, read_fs_info))
8034		status = FS_MOUNT_CALL(mount, read_fs_info, info);
8035
8036	// fill in info the file system doesn't (have to) know about
8037	if (status == B_OK) {
8038		info->dev = mount->id;
8039		info->root = mount->root_vnode->id;
8040
8041		fs_volume* volume = mount->volume;
8042		while (volume->super_volume != NULL)
8043			volume = volume->super_volume;
8044
8045		strlcpy(info->fsh_name, volume->file_system_name,
8046			sizeof(info->fsh_name));
8047		if (mount->device_name != NULL) {
8048			strlcpy(info->device_name, mount->device_name,
8049				sizeof(info->device_name));
8050		}
8051	}
8052
	// Even if the call is not supported by the file system, the caller
	// still gets the parts that we filled in ourselves.
8055
8056	put_mount(mount);
8057	return status;
8058}
8059
8060
8061static status_t
8062fs_write_info(dev_t device, const struct fs_info* info, int mask)
8063{
8064	struct fs_mount* mount;
8065	status_t status = get_mount(device, &mount);
8066	if (status != B_OK)
8067		return status;
8068
8069	if (HAS_FS_MOUNT_CALL(mount, write_fs_info))
8070		status = FS_MOUNT_CALL(mount, write_fs_info, info, mask);
8071	else
8072		status = B_READ_ONLY_DEVICE;
8073
8074	put_mount(mount);
8075	return status;
8076}
8077
8078
8079static dev_t
8080fs_next_device(int32* _cookie)
8081{
8082	struct fs_mount* mount = NULL;
8083	dev_t device = *_cookie;
8084
8085	rw_lock_read_lock(&sMountLock);
8086
	// Since device IDs are assigned sequentially, this algorithm works
	// well enough. It makes sure that the device list returned is sorted,
	// and that no device is skipped when an already visited device has
	// been unmounted.
8091
8092	while (device < sNextMountID) {
8093		mount = find_mount(device++);
8094		if (mount != NULL && mount->volume->private_volume != NULL)
8095			break;
8096	}
8097
8098	*_cookie = device;
8099
8100	if (mount != NULL)
8101		device = mount->id;
8102	else
8103		device = B_BAD_VALUE;
8104
8105	rw_lock_read_unlock(&sMountLock);
8106
8107	return device;
8108}
8109
8110
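/*!	\brief Kernel-side counterpart of the userland fs_read_attr() API.

	Opens attribute \a attribute of the node referred to by \a fd, reads up
	to \a readBytes bytes starting at \a pos into \a buffer, and closes the
	attribute again. The \a type parameter is unused. Returns the number of
	bytes read or an error code.
*/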
8111ssize_t
fs_read_attr(int fd, const char* attribute, uint32 type, off_t pos,
	void* buffer, size_t readBytes)
8114{
8115	int attrFD = attr_open(fd, NULL, attribute, O_RDONLY, true);
8116	if (attrFD < 0)
8117		return attrFD;
8118
8119	ssize_t bytesRead = _kern_read(attrFD, pos, buffer, readBytes);
8120
8121	_kern_close(attrFD);
8122
8123	return bytesRead;
8124}
8125
8126
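/*!	\brief Copies the absolute path of the current working directory into
		\a buffer.

	Resolves the I/O context's cwd vnode to a path. Returns \c B_ERROR if
	the context has no current working directory set.
*/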
8127static status_t
8128get_cwd(char* buffer, size_t size, bool kernel)
8129{
8130	// Get current working directory from io context
8131	struct io_context* context = get_current_io_context(kernel);
8132	status_t status;
8133
	FUNCTION(("vfs_get_cwd: buf %p, size %" B_PRIuSIZE "\n", buffer, size));
8135
8136	mutex_lock(&context->io_mutex);
8137
8138	struct vnode* vnode = context->cwd;
8139	if (vnode)
8140		inc_vnode_ref_count(vnode);
8141
8142	mutex_unlock(&context->io_mutex);
8143
8144	if (vnode) {
8145		status = dir_vnode_to_path(vnode, buffer, size, kernel);
8146		put_vnode(vnode);
8147	} else
8148		status = B_ERROR;
8149
8150	return status;
8151}
8152
8153
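/*!	\brief Sets the current working directory to the node referred to by
		\a fd + \a path.

	The node must be a directory and the caller must have the permission to
	enter it (\c X_OK). A reference to the previous working directory is
	released.
*/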
8154static status_t
8155set_cwd(int fd, char* path, bool kernel)
8156{
8157	struct io_context* context;
8158	struct vnode* oldDirectory;
8159
8160	FUNCTION(("set_cwd: path = \'%s\'\n", path));
8161
8162	// Get vnode for passed path, and bail if it failed
8163	VnodePutter vnode;
8164	status_t status = fd_and_path_to_vnode(fd, path, true, vnode, NULL, kernel);
8165	if (status < 0)
8166		return status;
8167
8168	if (!S_ISDIR(vnode->Type())) {
8169		// nope, can't cwd to here
8170		return B_NOT_A_DIRECTORY;
8171	}
8172
8173	// We need to have the permission to enter the directory, too
8174	if (HAS_FS_CALL(vnode, access)) {
8175		status = FS_CALL(vnode.Get(), access, X_OK);
8176		if (status != B_OK)
8177			return status;
8178	}
8179
8180	// Get current io context and lock
8181	context = get_current_io_context(kernel);
8182	mutex_lock(&context->io_mutex);
8183
8184	// save the old current working directory first
8185	oldDirectory = context->cwd;
8186	context->cwd = vnode.Detach();
8187
8188	mutex_unlock(&context->io_mutex);
8189
8190	if (oldDirectory)
8191		put_vnode(oldDirectory);
8192
	return B_OK;
8194}
8195
8196
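/*!	\brief Copies a NUL-terminated name from userland into \a to.

	Returns \c B_OK on success, \c B_NAME_TOO_LONG if the name including
	its terminating NUL does not fit into \a length bytes, or the error
	returned by user_strlcpy() (e.g. \c B_BAD_ADDRESS).
*/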
8197static status_t
8198user_copy_name(char* to, const char* from, size_t length)
8199{
8200	ssize_t len = user_strlcpy(to, from, length);
8201	if (len < 0)
8202		return len;
8203	if (len >= (ssize_t)length)
8204		return B_NAME_TOO_LONG;
8205	return B_OK;
8206}
8207
8208
8209//	#pragma mark - kernel mirrored syscalls
8210
8211
8212dev_t
8213_kern_mount(const char* path, const char* device, const char* fsName,
8214	uint32 flags, const char* args, size_t argsLength)
8215{
8216	KPath pathBuffer(path);
8217	if (pathBuffer.InitCheck() != B_OK)
8218		return B_NO_MEMORY;
8219
8220	return fs_mount(pathBuffer.LockBuffer(), device, fsName, flags, args, true);
8221}
8222
8223
8224status_t
8225_kern_unmount(const char* path, uint32 flags)
8226{
8227	KPath pathBuffer(path);
8228	if (pathBuffer.InitCheck() != B_OK)
8229		return B_NO_MEMORY;
8230
8231	return fs_unmount(pathBuffer.LockBuffer(), -1, flags, true);
8232}
8233
8234
8235status_t
8236_kern_read_fs_info(dev_t device, struct fs_info* info)
8237{
8238	if (info == NULL)
8239		return B_BAD_VALUE;
8240
8241	return fs_read_info(device, info);
8242}
8243
8244
8245status_t
8246_kern_write_fs_info(dev_t device, const struct fs_info* info, int mask)
8247{
8248	if (info == NULL)
8249		return B_BAD_VALUE;
8250
8251	return fs_write_info(device, info, mask);
8252}
8253
8254
8255status_t
8256_kern_sync(void)
8257{
8258	// Note: _kern_sync() is also called from _user_sync()
8259	int32 cookie = 0;
8260	dev_t device;
8261	while ((device = next_dev(&cookie)) >= 0) {
8262		status_t status = fs_sync(device);
8263		if (status != B_OK && status != B_BAD_VALUE) {
8264			dprintf("sync: device %" B_PRIdDEV " couldn't sync: %s\n", device,
8265				strerror(status));
8266		}
8267	}
8268
8269	return B_OK;
8270}
8271
8272
8273dev_t
8274_kern_next_device(int32* _cookie)
8275{
8276	return fs_next_device(_cookie);
8277}
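
/*	A minimal usage sketch for volume iteration (illustrative only):

	\code
	int32 cookie = 0;
	dev_t device;
	while ((device = _kern_next_device(&cookie)) >= 0) {
		fs_info info;
		if (_kern_read_fs_info(device, &info) == B_OK)
			dprintf("volume %" B_PRIdDEV ": %s\n", device, info.fsh_name);
	}
	\endcode
*/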
8278
8279
8280status_t
8281_kern_get_next_fd_info(team_id teamID, uint32* _cookie, fd_info* info,
8282	size_t infoSize)
8283{
8284	if (infoSize != sizeof(fd_info))
8285		return B_BAD_VALUE;
8286
8287	// get the team
8288	Team* team = Team::Get(teamID);
8289	if (team == NULL)
8290		return B_BAD_TEAM_ID;
8291	BReference<Team> teamReference(team, true);
8292
8293	// now that we have a team reference, its I/O context won't go away
8294	io_context* context = team->io_context;
8295	MutexLocker contextLocker(context->io_mutex);
8296
8297	uint32 slot = *_cookie;
8298
8299	struct file_descriptor* descriptor;
8300	while (slot < context->table_size
8301		&& (descriptor = context->fds[slot]) == NULL) {
8302		slot++;
8303	}
8304
8305	if (slot >= context->table_size)
8306		return B_ENTRY_NOT_FOUND;
8307
8308	info->number = slot;
8309	info->open_mode = descriptor->open_mode;
8310
8311	struct vnode* vnode = fd_vnode(descriptor);
8312	if (vnode != NULL) {
8313		info->device = vnode->device;
8314		info->node = vnode->id;
8315	} else if (descriptor->u.mount != NULL) {
8316		info->device = descriptor->u.mount->id;
8317		info->node = -1;
8318	}
8319
8320	*_cookie = slot + 1;
8321	return B_OK;
8322}
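
/*	A minimal iteration sketch (illustrative only; \c teamID is a
	placeholder for any valid team ID):

	\code
	uint32 cookie = 0;
	fd_info info;
	while (_kern_get_next_fd_info(teamID, &cookie, &info, sizeof(info))
			== B_OK) {
		dprintf("fd %" B_PRId32 " -> (%" B_PRIdDEV ", %" B_PRIdINO ")\n",
			info.number, info.device, info.node);
	}
	\endcode
*/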
8323
8324
8325int
8326_kern_open_entry_ref(dev_t device, ino_t inode, const char* name, int openMode,
8327	int perms)
8328{
8329	if ((openMode & O_CREAT) != 0) {
8330		return file_create_entry_ref(device, inode, name, openMode, perms,
8331			true);
8332	}
8333
8334	return file_open_entry_ref(device, inode, name, openMode, true);
8335}
8336
8337
8338/*!	\brief Opens a node specified by a FD + path pair.
8339
8340	At least one of \a fd and \a path must be specified.
8341	If only \a fd is given, the function opens the node identified by this
8342	FD. If only a path is given, this path is opened. If both are given and
8343	the path is absolute, \a fd is ignored; a relative path is reckoned off
8344	of the directory (!) identified by \a fd.
8345
8346	\param fd The FD. May be < 0.
8347	\param path The absolute or relative path. May be \c NULL.
8348	\param openMode The open mode.
8349	\return A FD referring to the newly opened node, or an error code,
8350			if an error occurs.
8351*/
8352int
8353_kern_open(int fd, const char* path, int openMode, int perms)
8354{
8355	KPath pathBuffer(path, KPath::LAZY_ALLOC);
8356	if (pathBuffer.InitCheck() != B_OK)
8357		return B_NO_MEMORY;
8358
8359	if ((openMode & O_CREAT) != 0)
8360		return file_create(fd, pathBuffer.LockBuffer(), openMode, perms, true);
8361
8362	return file_open(fd, pathBuffer.LockBuffer(), openMode, true);
8363}
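
/*	A usage sketch for the FD + path convention (illustrative only; paths
	are placeholders and error handling is omitted):

	\code
	// absolute path: the FD argument is ignored
	int fd = _kern_open(-1, "/boot/home/settings", O_RDONLY, 0);

	// relative path: resolved against the directory referred to by dirFD
	int dirFD = _kern_open_dir(-1, "/boot/home");
	int fd2 = _kern_open(dirFD, "Desktop/notes", O_RDWR | O_CREAT, 0644);
	\endcode
*/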
8364
8365
8366/*!	\brief Opens a directory specified by entry_ref or node_ref.
8367
	The supplied name may be \c NULL, in which case the directory identified
	by \a device and \a inode will be opened. Otherwise \a device and
8370	\a inode identify the parent directory of the directory to be opened
8371	and \a name its entry name.
8372
	\param device If \a name is specified, the ID of the device the parent
		   directory of the directory to be opened resides on, otherwise
		   the device of the directory itself.
	\param inode If \a name is specified, the node ID of the parent
		   directory of the directory to be opened, otherwise the node ID
		   of the directory itself.
8379	\param name The entry name of the directory to be opened. If \c NULL,
8380		   the \a device + \a inode pair identify the node to be opened.
8381	\return The FD of the newly opened directory or an error code, if
8382			something went wrong.
8383*/
8384int
8385_kern_open_dir_entry_ref(dev_t device, ino_t inode, const char* name)
8386{
8387	return dir_open_entry_ref(device, inode, name, true);
8388}
8389
8390
8391/*!	\brief Opens a directory specified by a FD + path pair.
8392
8393	At least one of \a fd and \a path must be specified.
8394	If only \a fd is given, the function opens the directory identified by this
8395	FD. If only a path is given, this path is opened. If both are given and
8396	the path is absolute, \a fd is ignored; a relative path is reckoned off
8397	of the directory (!) identified by \a fd.
8398
8399	\param fd The FD. May be < 0.
8400	\param path The absolute or relative path. May be \c NULL.
8401	\return A FD referring to the newly opened directory, or an error code,
8402			if an error occurs.
8403*/
8404int
8405_kern_open_dir(int fd, const char* path)
8406{
8407	KPath pathBuffer(path, KPath::LAZY_ALLOC);
8408	if (pathBuffer.InitCheck() != B_OK)
8409		return B_NO_MEMORY;
8410
8411	return dir_open(fd, pathBuffer.LockBuffer(), true);
8412}
8413
8414
8415status_t
8416_kern_fcntl(int fd, int op, size_t argument)
8417{
8418	return common_fcntl(fd, op, argument, true);
8419}
8420
8421
8422status_t
8423_kern_fsync(int fd)
8424{
8425	return common_sync(fd, true);
8426}
8427
8428
8429status_t
8430_kern_lock_node(int fd)
8431{
8432	return common_lock_node(fd, true);
8433}
8434
8435
8436status_t
8437_kern_unlock_node(int fd)
8438{
8439	return common_unlock_node(fd, true);
8440}
8441
8442
8443status_t
8444_kern_preallocate(int fd, off_t offset, off_t length)
8445{
8446	return common_preallocate(fd, offset, length, true);
8447}
8448
8449
8450status_t
8451_kern_create_dir_entry_ref(dev_t device, ino_t inode, const char* name,
8452	int perms)
8453{
8454	return dir_create_entry_ref(device, inode, name, perms, true);
8455}
8456
8457
8458/*!	\brief Creates a directory specified by a FD + path pair.
8459
	\a path must always be specified (it contains at least the name of the new
	directory). If only a path is given, this path identifies the location at
8462	which the directory shall be created. If both \a fd and \a path are given
8463	and the path is absolute, \a fd is ignored; a relative path is reckoned off
8464	of the directory (!) identified by \a fd.
8465
8466	\param fd The FD. May be < 0.
8467	\param path The absolute or relative path. Must not be \c NULL.
8468	\param perms The access permissions the new directory shall have.
8469	\return \c B_OK, if the directory has been created successfully, another
8470			error code otherwise.
8471*/
8472status_t
8473_kern_create_dir(int fd, const char* path, int perms)
8474{
8475	KPath pathBuffer(path, KPath::DEFAULT);
8476	if (pathBuffer.InitCheck() != B_OK)
8477		return B_NO_MEMORY;
8478
8479	return dir_create(fd, pathBuffer.LockBuffer(), perms, true);
8480}
8481
8482
8483status_t
8484_kern_remove_dir(int fd, const char* path)
8485{
8486	KPath pathBuffer(path, KPath::LAZY_ALLOC);
8487	if (pathBuffer.InitCheck() != B_OK)
8488		return B_NO_MEMORY;
8489
8490	return dir_remove(fd, pathBuffer.LockBuffer(), true);
8491}
8492
8493
8494/*!	\brief Reads the contents of a symlink referred to by a FD + path pair.
8495
	At least one of \a fd and \a path must be specified.
	If only \a fd is given, the symlink to be read is the node identified by
	this FD. If only a path is given, this path identifies the symlink to be
	read. If both are given and the path is absolute, \a fd is ignored; a
	relative path is reckoned off of the directory (!) identified by \a fd.
8502	If this function fails with B_BUFFER_OVERFLOW, the \a _bufferSize pointer
8503	will still be updated to reflect the required buffer size.
8504
8505	\param fd The FD. May be < 0.
8506	\param path The absolute or relative path. May be \c NULL.
8507	\param buffer The buffer into which the contents of the symlink shall be
8508		   written.
8509	\param _bufferSize A pointer to the size of the supplied buffer.
	\return \c B_OK on success (the link's length is returned via
			\a _bufferSize) or an appropriate error code.
8511*/
8512status_t
8513_kern_read_link(int fd, const char* path, char* buffer, size_t* _bufferSize)
8514{
8515	KPath pathBuffer(path, KPath::LAZY_ALLOC);
8516	if (pathBuffer.InitCheck() != B_OK)
8517		return B_NO_MEMORY;
8518
8519	return common_read_link(fd, pathBuffer.LockBuffer(),
8520		buffer, _bufferSize, true);
8521}
8522
8523
8524/*!	\brief Creates a symlink specified by a FD + path pair.
8525
	\a path must always be specified (it contains at least the name of the new
	symlink). If only a path is given, this path identifies the location at
8528	which the symlink shall be created. If both \a fd and \a path are given and
8529	the path is absolute, \a fd is ignored; a relative path is reckoned off
8530	of the directory (!) identified by \a fd.
8531
8532	\param fd The FD. May be < 0.
	\param path The absolute or relative path of the symlink to be created.
		   Must not be \c NULL.
	\param toPath The path the symlink shall point to. Must not be \c NULL.
	\param mode The access permissions the new symlink shall have.
8535	\return \c B_OK, if the symlink has been created successfully, another
8536			error code otherwise.
8537*/
8538status_t
8539_kern_create_symlink(int fd, const char* path, const char* toPath, int mode)
8540{
8541	KPath pathBuffer(path);
8542	if (pathBuffer.InitCheck() != B_OK)
8543		return B_NO_MEMORY;
8544
8545	return common_create_symlink(fd, pathBuffer.LockBuffer(),
8546		toPath, mode, true);
8547}
8548
8549
8550status_t
8551_kern_create_link(int pathFD, const char* path, int toFD, const char* toPath,
8552	bool traverseLeafLink)
8553{
8554	KPath pathBuffer(path);
8555	KPath toPathBuffer(toPath);
8556	if (pathBuffer.InitCheck() != B_OK || toPathBuffer.InitCheck() != B_OK)
8557		return B_NO_MEMORY;
8558
8559	return common_create_link(pathFD, pathBuffer.LockBuffer(), toFD,
8560		toPathBuffer.LockBuffer(), traverseLeafLink, true);
8561}
8562
8563
8564/*!	\brief Removes an entry specified by a FD + path pair from its directory.
8565
8566	\a path must always be specified (it contains at least the name of the entry
8567	to be deleted). If only a path is given, this path identifies the entry
8568	directly. If both \a fd and \a path are given and the path is absolute,
8569	\a fd is ignored; a relative path is reckoned off of the directory (!)
8570	identified by \a fd.
8571
8572	\param fd The FD. May be < 0.
8573	\param path The absolute or relative path. Must not be \c NULL.
8574	\return \c B_OK, if the entry has been removed successfully, another
8575			error code otherwise.
8576*/
8577status_t
8578_kern_unlink(int fd, const char* path)
8579{
8580	KPath pathBuffer(path);
8581	if (pathBuffer.InitCheck() != B_OK)
8582		return B_NO_MEMORY;
8583
8584	return common_unlink(fd, pathBuffer.LockBuffer(), true);
8585}
8586
8587
/*!	\brief Moves an entry specified by a FD + path pair to an entry specified
8589		   by another FD + path pair.
8590
8591	\a oldPath and \a newPath must always be specified (they contain at least
8592	the name of the entry). If only a path is given, this path identifies the
8593	entry directly. If both a FD and a path are given and the path is absolute,
8594	the FD is ignored; a relative path is reckoned off of the directory (!)
8595	identified by the respective FD.
8596
8597	\param oldFD The FD of the old location. May be < 0.
8598	\param oldPath The absolute or relative path of the old location. Must not
8599		   be \c NULL.
8600	\param newFD The FD of the new location. May be < 0.
8601	\param newPath The absolute or relative path of the new location. Must not
8602		   be \c NULL.
8603	\return \c B_OK, if the entry has been moved successfully, another
8604			error code otherwise.
8605*/
8606status_t
8607_kern_rename(int oldFD, const char* oldPath, int newFD, const char* newPath)
8608{
8609	KPath oldPathBuffer(oldPath);
8610	KPath newPathBuffer(newPath);
8611	if (oldPathBuffer.InitCheck() != B_OK || newPathBuffer.InitCheck() != B_OK)
8612		return B_NO_MEMORY;
8613
8614	return common_rename(oldFD, oldPathBuffer.LockBuffer(),
8615		newFD, newPathBuffer.LockBuffer(), true);
8616}
8617
8618
8619status_t
8620_kern_access(int fd, const char* path, int mode, bool effectiveUserGroup)
8621{
8622	KPath pathBuffer(path, KPath::LAZY_ALLOC);
8623	if (pathBuffer.InitCheck() != B_OK)
8624		return B_NO_MEMORY;
8625
8626	return common_access(fd, pathBuffer.LockBuffer(), mode, effectiveUserGroup,
8627		true);
8628}
8629
8630
8631/*!	\brief Reads stat data of an entity specified by a FD + path pair.
8632
8633	If only \a fd is given, the stat operation associated with the type
8634	of the FD (node, attr, attr dir etc.) is performed. If only \a path is
8635	given, this path identifies the entry for whose node to retrieve the
8636	stat data. If both \a fd and \a path are given and the path is absolute,
8637	\a fd is ignored; a relative path is reckoned off of the directory (!)
8638	identified by \a fd and specifies the entry whose stat data shall be
8639	retrieved.
8640
8641	\param fd The FD. May be < 0.
	\param path The absolute or relative path. May be \c NULL.
8643	\param traverseLeafLink If \a path is given, \c true specifies that the
8644		   function shall not stick to symlinks, but traverse them.
8645	\param stat The buffer the stat data shall be written into.
8646	\param statSize The size of the supplied stat buffer.
	\return \c B_OK, if the stat data have been read successfully, another
8648			error code otherwise.
8649*/
8650status_t
8651_kern_read_stat(int fd, const char* path, bool traverseLeafLink,
8652	struct stat* stat, size_t statSize)
8653{
8654	struct stat completeStat;
8655	struct stat* originalStat = NULL;
8656	status_t status;
8657
8658	if (statSize > sizeof(struct stat))
8659		return B_BAD_VALUE;
8660
8661	// this supports different stat extensions
8662	if (statSize < sizeof(struct stat)) {
8663		originalStat = stat;
8664		stat = &completeStat;
8665	}
8666
8667	status = vfs_read_stat(fd, path, traverseLeafLink, stat, true);
8668
8669	if (status == B_OK && originalStat != NULL)
8670		memcpy(originalStat, stat, statSize);
8671
8672	return status;
8673}
8674
8675
8676/*!	\brief Writes stat data of an entity specified by a FD + path pair.
8677
8678	If only \a fd is given, the stat operation associated with the type
8679	of the FD (node, attr, attr dir etc.) is performed. If only \a path is
8680	given, this path identifies the entry for whose node to write the
8681	stat data. If both \a fd and \a path are given and the path is absolute,
8682	\a fd is ignored; a relative path is reckoned off of the directory (!)
8683	identified by \a fd and specifies the entry whose stat data shall be
8684	written.
8685
8686	\param fd The FD. May be < 0.
8687	\param path The absolute or relative path. May be \c NULL.
8688	\param traverseLeafLink If \a path is given, \c true specifies that the
8689		   function shall not stick to symlinks, but traverse them.
8690	\param stat The buffer containing the stat data to be written.
8691	\param statSize The size of the supplied stat buffer.
8692	\param statMask A mask specifying which parts of the stat data shall be
8693		   written.
	\return \c B_OK, if the stat data have been written successfully,
8695			another error code otherwise.
8696*/
8697status_t
8698_kern_write_stat(int fd, const char* path, bool traverseLeafLink,
8699	const struct stat* stat, size_t statSize, int statMask)
8700{
8701	struct stat completeStat;
8702
8703	if (statSize > sizeof(struct stat))
8704		return B_BAD_VALUE;
8705
8706	// this supports different stat extensions
8707	if (statSize < sizeof(struct stat)) {
8708		memset((uint8*)&completeStat + statSize, 0,
8709			sizeof(struct stat) - statSize);
8710		memcpy(&completeStat, stat, statSize);
8711		stat = &completeStat;
8712	}
8713
8714	status_t status;
8715
8716	if (path != NULL) {
8717		// path given: write the stat of the node referred to by (fd, path)
8718		KPath pathBuffer(path);
8719		if (pathBuffer.InitCheck() != B_OK)
8720			return B_NO_MEMORY;
8721
8722		status = common_path_write_stat(fd, pathBuffer.LockBuffer(),
8723			traverseLeafLink, stat, statMask, true);
8724	} else {
8725		// no path given: get the FD and use the FD operation
8726		FileDescriptorPutter descriptor
8727			(get_fd(get_current_io_context(true), fd));
8728		if (!descriptor.IsSet())
8729			return B_FILE_ERROR;
8730
8731		if (descriptor->ops->fd_write_stat)
8732			status = descriptor->ops->fd_write_stat(descriptor.Get(), stat, statMask);
8733		else
8734			status = B_UNSUPPORTED;
8735	}
8736
8737	return status;
8738}
8739
8740
8741int
8742_kern_open_attr_dir(int fd, const char* path, bool traverseLeafLink)
8743{
8744	KPath pathBuffer(path, KPath::LAZY_ALLOC);
8745	if (pathBuffer.InitCheck() != B_OK)
8746		return B_NO_MEMORY;
8747
8748	return attr_dir_open(fd, pathBuffer.LockBuffer(), traverseLeafLink, true);
8749}
8750
8751
8752int
8753_kern_open_attr(int fd, const char* path, const char* name, uint32 type,
8754	int openMode)
8755{
8756	KPath pathBuffer(path, KPath::LAZY_ALLOC);
8757	if (pathBuffer.InitCheck() != B_OK)
8758		return B_NO_MEMORY;
8759
8760	if ((openMode & O_CREAT) != 0) {
8761		return attr_create(fd, pathBuffer.LockBuffer(), name, type, openMode,
8762			true);
8763	}
8764
8765	return attr_open(fd, pathBuffer.LockBuffer(), name, openMode, true);
8766}
8767
8768
8769status_t
8770_kern_remove_attr(int fd, const char* name)
8771{
8772	return attr_remove(fd, name, true);
8773}
8774
8775
8776status_t
8777_kern_rename_attr(int fromFile, const char* fromName, int toFile,
8778	const char* toName)
8779{
8780	return attr_rename(fromFile, fromName, toFile, toName, true);
8781}
8782
8783
8784int
8785_kern_open_index_dir(dev_t device)
8786{
8787	return index_dir_open(device, true);
8788}
8789
8790
8791status_t
8792_kern_create_index(dev_t device, const char* name, uint32 type, uint32 flags)
8793{
8794	return index_create(device, name, type, flags, true);
8795}
8796
8797
8798status_t
8799_kern_read_index_stat(dev_t device, const char* name, struct stat* stat)
8800{
8801	return index_name_read_stat(device, name, stat, true);
8802}
8803
8804
8805status_t
8806_kern_remove_index(dev_t device, const char* name)
8807{
8808	return index_remove(device, name, true);
8809}
8810
8811
8812status_t
8813_kern_getcwd(char* buffer, size_t size)
8814{
8815	TRACE(("_kern_getcwd: buf %p, %ld\n", buffer, size));
8816
8817	// Call vfs to get current working directory
8818	return get_cwd(buffer, size, true);
8819}
8820
8821
8822status_t
8823_kern_setcwd(int fd, const char* path)
8824{
8825	KPath pathBuffer(path, KPath::LAZY_ALLOC);
8826	if (pathBuffer.InitCheck() != B_OK)
8827		return B_NO_MEMORY;
8828
8829	return set_cwd(fd, pathBuffer.LockBuffer(), true);
8830}
8831
8832
8833//	#pragma mark - userland syscalls
8834
8835
8836dev_t
8837_user_mount(const char* userPath, const char* userDevice,
8838	const char* userFileSystem, uint32 flags, const char* userArgs,
8839	size_t argsLength)
8840{
8841	char fileSystem[B_FILE_NAME_LENGTH];
8842	KPath path, device;
8843	char* args = NULL;
8844	status_t status;
8845
8846	if (!IS_USER_ADDRESS(userPath))
8847		return B_BAD_ADDRESS;
8848
8849	if (path.InitCheck() != B_OK || device.InitCheck() != B_OK)
8850		return B_NO_MEMORY;
8851
8852	status = user_copy_name(path.LockBuffer(), userPath,
8853		B_PATH_NAME_LENGTH);
8854	if (status != B_OK)
8855		return status;
8856	path.UnlockBuffer();
8857
8858	if (userFileSystem != NULL) {
8859		if (!IS_USER_ADDRESS(userFileSystem))
8860			return B_BAD_ADDRESS;
8861
8862		status = user_copy_name(fileSystem, userFileSystem, sizeof(fileSystem));
8863		if (status != B_OK)
8864			return status;
8865	}
8866
8867	if (userDevice != NULL) {
8868		if (!IS_USER_ADDRESS(userDevice))
8869			return B_BAD_ADDRESS;
8870
8871		status = user_copy_name(device.LockBuffer(), userDevice,
8872			B_PATH_NAME_LENGTH);
8873		if (status != B_OK)
8874			return status;
8875		device.UnlockBuffer();
8876	}
8877
8878	if (userArgs != NULL && argsLength > 0) {
8879		if (!IS_USER_ADDRESS(userArgs))
8880			return B_BAD_ADDRESS;
8881
8882		// this is a safety restriction
8883		if (argsLength >= 65536)
8884			return B_NAME_TOO_LONG;
8885
8886		args = (char*)malloc(argsLength + 1);
8887		if (args == NULL)
8888			return B_NO_MEMORY;
8889
8890		status = user_copy_name(args, userArgs, argsLength + 1);
8891		if (status != B_OK) {
8892			free(args);
8893			return status;
8894		}
8895	}
8896
	status = fs_mount(path.LockBuffer(),
		userDevice != NULL ? device.Path() : NULL,
		userFileSystem != NULL ? fileSystem : NULL, flags, args, false);
8900
8901	free(args);
8902	return status;
8903}
8904
8905
8906status_t
8907_user_unmount(const char* userPath, uint32 flags)
8908{
8909	if (!IS_USER_ADDRESS(userPath))
8910		return B_BAD_ADDRESS;
8911
8912	KPath pathBuffer;
8913	if (pathBuffer.InitCheck() != B_OK)
8914		return B_NO_MEMORY;
8915
8916	char* path = pathBuffer.LockBuffer();
8917
8918	status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
8919	if (status != B_OK)
8920		return status;
8921
8922	return fs_unmount(path, -1, flags & ~B_UNMOUNT_BUSY_PARTITION, false);
8923}
8924
8925
8926status_t
8927_user_read_fs_info(dev_t device, struct fs_info* userInfo)
8928{
8929	struct fs_info info;
8930	status_t status;
8931
8932	if (userInfo == NULL)
8933		return B_BAD_VALUE;
8934
8935	if (!IS_USER_ADDRESS(userInfo))
8936		return B_BAD_ADDRESS;
8937
8938	status = fs_read_info(device, &info);
8939	if (status != B_OK)
8940		return status;
8941
8942	if (user_memcpy(userInfo, &info, sizeof(struct fs_info)) != B_OK)
8943		return B_BAD_ADDRESS;
8944
8945	return B_OK;
8946}
8947
8948
8949status_t
8950_user_write_fs_info(dev_t device, const struct fs_info* userInfo, int mask)
8951{
8952	struct fs_info info;
8953
8954	if (userInfo == NULL)
8955		return B_BAD_VALUE;
8956
8957	if (!IS_USER_ADDRESS(userInfo)
8958		|| user_memcpy(&info, userInfo, sizeof(struct fs_info)) != B_OK)
8959		return B_BAD_ADDRESS;
8960
8961	return fs_write_info(device, &info, mask);
8962}
8963
8964
8965dev_t
8966_user_next_device(int32* _userCookie)
8967{
8968	int32 cookie;
8969	dev_t device;
8970
8971	if (!IS_USER_ADDRESS(_userCookie)
8972		|| user_memcpy(&cookie, _userCookie, sizeof(int32)) != B_OK)
8973		return B_BAD_ADDRESS;
8974
8975	device = fs_next_device(&cookie);
8976
8977	if (device >= B_OK) {
8978		// update user cookie
8979		if (user_memcpy(_userCookie, &cookie, sizeof(int32)) != B_OK)
8980			return B_BAD_ADDRESS;
8981	}
8982
8983	return device;
8984}
8985
8986
8987status_t
8988_user_sync(void)
8989{
8990	return _kern_sync();
8991}
8992
8993
8994status_t
8995_user_get_next_fd_info(team_id team, uint32* userCookie, fd_info* userInfo,
8996	size_t infoSize)
8997{
8998	struct fd_info info;
8999	uint32 cookie;
9000
9001	// only root can do this
9002	if (geteuid() != 0)
9003		return B_NOT_ALLOWED;
9004
9005	if (infoSize != sizeof(fd_info))
9006		return B_BAD_VALUE;
9007
9008	if (!IS_USER_ADDRESS(userCookie) || !IS_USER_ADDRESS(userInfo)
9009		|| user_memcpy(&cookie, userCookie, sizeof(uint32)) != B_OK)
9010		return B_BAD_ADDRESS;
9011
9012	status_t status = _kern_get_next_fd_info(team, &cookie, &info, infoSize);
9013	if (status != B_OK)
9014		return status;
9015
9016	if (user_memcpy(userCookie, &cookie, sizeof(uint32)) != B_OK
9017		|| user_memcpy(userInfo, &info, infoSize) != B_OK)
9018		return B_BAD_ADDRESS;
9019
9020	return status;
9021}
9022
9023
9024status_t
9025_user_entry_ref_to_path(dev_t device, ino_t inode, const char* leaf,
9026	char* userPath, size_t pathLength)
9027{
9028	if (!IS_USER_ADDRESS(userPath))
9029		return B_BAD_ADDRESS;
9030
9031	KPath path;
9032	if (path.InitCheck() != B_OK)
9033		return B_NO_MEMORY;
9034
9035	// copy the leaf name onto the stack
9036	char stackLeaf[B_FILE_NAME_LENGTH];
9037	if (leaf != NULL) {
9038		if (!IS_USER_ADDRESS(leaf))
9039			return B_BAD_ADDRESS;
9040
		status_t status = user_copy_name(stackLeaf, leaf, B_FILE_NAME_LENGTH);
9042		if (status != B_OK)
9043			return status;
9044
9045		leaf = stackLeaf;
9046	}
9047
9048	status_t status = vfs_entry_ref_to_path(device, inode, leaf,
9049		false, path.LockBuffer(), path.BufferSize());
9050	if (status != B_OK)
9051		return status;
9052
9053	path.UnlockBuffer();
9054
9055	int length = user_strlcpy(userPath, path.Path(), pathLength);
9056	if (length < 0)
9057		return length;
9058	if (length >= (int)pathLength)
9059		return B_BUFFER_OVERFLOW;
9060
9061	return B_OK;
9062}
9063
9064
9065status_t
9066_user_normalize_path(const char* userPath, bool traverseLink, char* buffer)
9067{
9068	if (userPath == NULL || buffer == NULL)
9069		return B_BAD_VALUE;
9070	if (!IS_USER_ADDRESS(userPath) || !IS_USER_ADDRESS(buffer))
9071		return B_BAD_ADDRESS;
9072
9073	// copy path from userland
9074	KPath pathBuffer;
9075	if (pathBuffer.InitCheck() != B_OK)
9076		return B_NO_MEMORY;
9077	char* path = pathBuffer.LockBuffer();
9078
9079	status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
9080	if (status != B_OK)
9081		return status;
9082
9083	status_t error = normalize_path(path, pathBuffer.BufferSize(), traverseLink,
9084		false);
9085	if (error != B_OK)
9086		return error;
9087
9088	// copy back to userland
9089	int len = user_strlcpy(buffer, path, B_PATH_NAME_LENGTH);
9090	if (len < 0)
9091		return len;
9092	if (len >= B_PATH_NAME_LENGTH)
9093		return B_BUFFER_OVERFLOW;
9094
9095	return B_OK;
9096}
9097
9098
9099int
9100_user_open_entry_ref(dev_t device, ino_t inode, const char* userName,
9101	int openMode, int perms)
9102{
9103	char name[B_FILE_NAME_LENGTH];
9104
9105	if (userName == NULL || device < 0 || inode < 0)
9106		return B_BAD_VALUE;
9107	if (!IS_USER_ADDRESS(userName))
9108		return B_BAD_ADDRESS;
9109	status_t status = user_copy_name(name, userName, sizeof(name));
9110	if (status != B_OK)
9111		return status;
9112
9113	if ((openMode & O_CREAT) != 0) {
9114		return file_create_entry_ref(device, inode, name, openMode, perms,
9115			false);
9116	}
9117
9118	return file_open_entry_ref(device, inode, name, openMode, false);
9119}
9120
9121
9122int
9123_user_open(int fd, const char* userPath, int openMode, int perms)
9124{
9125	KPath path;
9126	if (path.InitCheck() != B_OK)
9127		return B_NO_MEMORY;
9128
9129	char* buffer = path.LockBuffer();
9130
9131	if (!IS_USER_ADDRESS(userPath))
9132		return B_BAD_ADDRESS;
9133	status_t status = user_copy_name(buffer, userPath, B_PATH_NAME_LENGTH);
9134	if (status != B_OK)
9135		return status;
9136
9137	if ((openMode & O_CREAT) != 0)
9138		return file_create(fd, buffer, openMode, perms, false);
9139
9140	return file_open(fd, buffer, openMode, false);
9141}
9142
9143
9144int
9145_user_open_dir_entry_ref(dev_t device, ino_t inode, const char* userName)
9146{
9147	if (userName != NULL) {
9148		char name[B_FILE_NAME_LENGTH];
9149
9150		if (!IS_USER_ADDRESS(userName))
9151			return B_BAD_ADDRESS;
9152		status_t status = user_copy_name(name, userName, sizeof(name));
9153		if (status != B_OK)
9154			return status;
9155
9156		return dir_open_entry_ref(device, inode, name, false);
9157	}
9158	return dir_open_entry_ref(device, inode, NULL, false);
9159}
9160
9161
9162int
9163_user_open_dir(int fd, const char* userPath)
9164{
9165	if (userPath == NULL)
9166		return dir_open(fd, NULL, false);
9167
9168	KPath path;
9169	if (path.InitCheck() != B_OK)
9170		return B_NO_MEMORY;
9171
9172	char* buffer = path.LockBuffer();
9173
9174	if (!IS_USER_ADDRESS(userPath))
9175		return B_BAD_ADDRESS;
9176	status_t status = user_copy_name(buffer, userPath, B_PATH_NAME_LENGTH);
9177	if (status != B_OK)
9178		return status;
9179
9180	return dir_open(fd, buffer, false);
9181}
9182
9183
9184/*!	\brief Opens a directory's parent directory and returns the entry name
9185		   of the former.
9186
	Aside from returning the directory's entry name, this method is
	equivalent to \code _user_open_dir(fd, "..") \endcode. It really is
	equivalent if \a userName is \c NULL.
9190
9191	If a name buffer is supplied and the name does not fit the buffer, the
9192	function fails. A buffer of size \c B_FILE_NAME_LENGTH should be safe.
9193
9194	\param fd A FD referring to a directory.
9195	\param userName Buffer the directory's entry name shall be written into.
9196		   May be \c NULL.
9197	\param nameLength Size of the name buffer.
9198	\return The file descriptor of the opened parent directory, if everything
9199			went fine, an error code otherwise.
9200*/
9201int
9202_user_open_parent_dir(int fd, char* userName, size_t nameLength)
9203{
9204	bool kernel = false;
9205
9206	if (userName && !IS_USER_ADDRESS(userName))
9207		return B_BAD_ADDRESS;
9208
9209	// open the parent dir
9210	int parentFD = dir_open(fd, (char*)"..", kernel);
9211	if (parentFD < 0)
9212		return parentFD;
9213	FDCloser fdCloser(parentFD, kernel);
9214
9215	if (userName) {
9216		// get the vnodes
9217		struct vnode* parentVNode = get_vnode_from_fd(parentFD, kernel);
9218		struct vnode* dirVNode = get_vnode_from_fd(fd, kernel);
9219		VnodePutter parentVNodePutter(parentVNode);
9220		VnodePutter dirVNodePutter(dirVNode);
9221		if (!parentVNode || !dirVNode)
9222			return B_FILE_ERROR;
9223
9224		// get the vnode name
9225		char _buffer[offsetof(struct dirent, d_name) + B_FILE_NAME_LENGTH + 1];
9226		struct dirent* buffer = (struct dirent*)_buffer;
9227		status_t status = get_vnode_name(dirVNode, parentVNode, buffer,
9228			sizeof(_buffer), get_current_io_context(false));
9229		if (status != B_OK)
9230			return status;
9231
9232		// copy the name to the userland buffer
9233		int len = user_strlcpy(userName, buffer->d_name, nameLength);
9234		if (len < 0)
9235			return len;
9236		if (len >= (int)nameLength)
9237			return B_BUFFER_OVERFLOW;
9238	}
9239
9240	return fdCloser.Detach();
9241}
9242
9243
9244status_t
9245_user_fcntl(int fd, int op, size_t argument)
9246{
9247	status_t status = common_fcntl(fd, op, argument, false);
9248	if (op == F_SETLKW)
9249		syscall_restart_handle_post(status);
9250
9251	return status;
9252}
9253
9254
9255status_t
9256_user_fsync(int fd)
9257{
9258	return common_sync(fd, false);
9259}
9260
9261
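/*!	\brief Implements flock() semantics on top of advisory file locks.

	\a operation must be one of \c LOCK_UN, \c LOCK_SH, or \c LOCK_EX,
	optionally combined with \c LOCK_NB for non-blocking operation. The
	lock always covers the whole file; file systems may supply their own
	acquire_lock()/release_lock() hooks, otherwise the generic advisory
	locking is used.
*/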
9262status_t
9263_user_flock(int fd, int operation)
9264{
	FUNCTION(("_user_flock(fd = %d, op = %d)\n", fd, operation));
9266
9267	// Check if the operation is valid
9268	switch (operation & ~LOCK_NB) {
9269		case LOCK_UN:
9270		case LOCK_SH:
9271		case LOCK_EX:
9272			break;
9273
9274		default:
9275			return B_BAD_VALUE;
9276	}
9277
9278	struct vnode* vnode;
9279	FileDescriptorPutter descriptor(get_fd_and_vnode(fd, &vnode, false));
9280	if (!descriptor.IsSet())
9281		return B_FILE_ERROR;
9282
9283	if (descriptor->ops != &sFileOps)
9284		return B_BAD_VALUE;
9285
9286	struct flock flock;
9287	flock.l_start = 0;
9288	flock.l_len = OFF_MAX;
9289	flock.l_whence = 0;
9290	flock.l_type = (operation & LOCK_SH) != 0 ? F_RDLCK : F_WRLCK;
9291
9292	status_t status;
9293	if ((operation & LOCK_UN) != 0) {
9294		if (HAS_FS_CALL(vnode, release_lock))
9295			status = FS_CALL(vnode, release_lock, descriptor->cookie, &flock);
9296		else
9297			status = release_advisory_lock(vnode, NULL, descriptor.Get(), &flock);
9298	} else {
9299		if (HAS_FS_CALL(vnode, acquire_lock)) {
9300			status = FS_CALL(vnode, acquire_lock, descriptor->cookie, &flock,
9301				(operation & LOCK_NB) == 0);
9302		} else {
9303			status = acquire_advisory_lock(vnode, NULL, descriptor.Get(), &flock,
9304				(operation & LOCK_NB) == 0);
9305		}
9306	}
9307
9308	syscall_restart_handle_post(status);
9309
9310	return status;
9311}
9312
9313
9314status_t
9315_user_lock_node(int fd)
9316{
9317	return common_lock_node(fd, false);
9318}
9319
9320
9321status_t
9322_user_unlock_node(int fd)
9323{
9324	return common_unlock_node(fd, false);
9325}
9326
9327
9328status_t
9329_user_preallocate(int fd, off_t offset, off_t length)
9330{
9331	return common_preallocate(fd, offset, length, false);
9332}
9333
9334
9335status_t
9336_user_create_dir_entry_ref(dev_t device, ino_t inode, const char* userName,
9337	int perms)
9338{
9339	char name[B_FILE_NAME_LENGTH];
9340	status_t status;
9341
9342	if (!IS_USER_ADDRESS(userName))
9343		return B_BAD_ADDRESS;
9344
9345	status = user_copy_name(name, userName, sizeof(name));
9346	if (status != B_OK)
9347		return status;
9348
9349	return dir_create_entry_ref(device, inode, name, perms, false);
9350}
9351
9352
9353status_t
9354_user_create_dir(int fd, const char* userPath, int perms)
9355{
9356	KPath pathBuffer;
9357	if (pathBuffer.InitCheck() != B_OK)
9358		return B_NO_MEMORY;
9359
9360	char* path = pathBuffer.LockBuffer();
9361
9362	if (!IS_USER_ADDRESS(userPath))
9363		return B_BAD_ADDRESS;
9364	status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
9365	if (status != B_OK)
9366		return status;
9367
9368	return dir_create(fd, path, perms, false);
9369}
9370
9371
9372status_t
9373_user_remove_dir(int fd, const char* userPath)
9374{
9375	KPath pathBuffer;
9376	if (pathBuffer.InitCheck() != B_OK)
9377		return B_NO_MEMORY;
9378
9379	char* path = pathBuffer.LockBuffer();
9380
9381	if (userPath != NULL) {
9382		if (!IS_USER_ADDRESS(userPath))
9383			return B_BAD_ADDRESS;
9384		status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
9385		if (status != B_OK)
9386			return status;
9387	}
9388
9389	return dir_remove(fd, userPath ? path : NULL, false);
9390}
9391
9392
9393status_t
9394_user_read_link(int fd, const char* userPath, char* userBuffer,
9395	size_t* userBufferSize)
9396{
9397	KPath pathBuffer, linkBuffer;
9398	if (pathBuffer.InitCheck() != B_OK || linkBuffer.InitCheck() != B_OK)
9399		return B_NO_MEMORY;
9400
9401	size_t bufferSize;
9402
9403	if (!IS_USER_ADDRESS(userBuffer) || !IS_USER_ADDRESS(userBufferSize)
9404		|| user_memcpy(&bufferSize, userBufferSize, sizeof(size_t)) != B_OK)
9405		return B_BAD_ADDRESS;
9406
9407	char* path = pathBuffer.LockBuffer();
9408	char* buffer = linkBuffer.LockBuffer();
9409
9410	if (userPath) {
9411		if (!IS_USER_ADDRESS(userPath))
9412			return B_BAD_ADDRESS;
9413		status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
9414		if (status != B_OK)
9415			return status;
9416
9417		if (bufferSize > B_PATH_NAME_LENGTH)
9418			bufferSize = B_PATH_NAME_LENGTH;
9419	}
9420
9421	size_t newBufferSize = bufferSize;
9422	status_t status = common_read_link(fd, userPath ? path : NULL, buffer,
9423		&newBufferSize, false);
9424
9425	// we also update the bufferSize in case of errors
9426	// (the real length will be returned in case of B_BUFFER_OVERFLOW)
9427	if (user_memcpy(userBufferSize, &newBufferSize, sizeof(size_t)) != B_OK)
9428		return B_BAD_ADDRESS;
9429
9430	if (status != B_OK)
9431		return status;
9432
9433	bufferSize = min_c(newBufferSize, bufferSize);
9434	if (user_memcpy(userBuffer, buffer, bufferSize) != B_OK)
9435		return B_BAD_ADDRESS;
9436
9437	return B_OK;
9438}
9439
9440
9441status_t
9442_user_create_symlink(int fd, const char* userPath, const char* userToPath,
9443	int mode)
9444{
9445	KPath pathBuffer;
9446	KPath toPathBuffer;
9447	if (pathBuffer.InitCheck() != B_OK || toPathBuffer.InitCheck() != B_OK)
9448		return B_NO_MEMORY;
9449
9450	char* path = pathBuffer.LockBuffer();
9451	char* toPath = toPathBuffer.LockBuffer();
9452
9453	if (!IS_USER_ADDRESS(userPath) || !IS_USER_ADDRESS(userToPath))
9454		return B_BAD_ADDRESS;
9455	status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
9456	if (status != B_OK)
9457		return status;
9458	status = user_copy_name(toPath, userToPath, B_PATH_NAME_LENGTH);
9459	if (status != B_OK)
9460		return status;
9461
9462	return common_create_symlink(fd, path, toPath, mode, false);
9463}
9464
9465
9466status_t
9467_user_create_link(int pathFD, const char* userPath, int toFD,
9468	const char* userToPath, bool traverseLeafLink)
9469{
9470	KPath pathBuffer;
9471	KPath toPathBuffer;
9472	if (pathBuffer.InitCheck() != B_OK || toPathBuffer.InitCheck() != B_OK)
9473		return B_NO_MEMORY;
9474
9475	char* path = pathBuffer.LockBuffer();
9476	char* toPath = toPathBuffer.LockBuffer();
9477
9478	if (!IS_USER_ADDRESS(userPath) || !IS_USER_ADDRESS(userToPath))
9479		return B_BAD_ADDRESS;
9480	status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
9481	if (status != B_OK)
9482		return status;
9483	status = user_copy_name(toPath, userToPath, B_PATH_NAME_LENGTH);
9484	if (status != B_OK)
9485		return status;
9486
9487	status = check_path(toPath);
9488	if (status != B_OK)
9489		return status;
9490
9491	return common_create_link(pathFD, path, toFD, toPath, traverseLeafLink,
9492		false);
9493}
9494
9495
9496status_t
9497_user_unlink(int fd, const char* userPath)
9498{
9499	KPath pathBuffer;
9500	if (pathBuffer.InitCheck() != B_OK)
9501		return B_NO_MEMORY;
9502
9503	char* path = pathBuffer.LockBuffer();
9504
9505	if (!IS_USER_ADDRESS(userPath))
9506		return B_BAD_ADDRESS;
9507	status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
9508	if (status != B_OK)
9509		return status;
9510
9511	return common_unlink(fd, path, false);
9512}
9513
9514
9515status_t
9516_user_rename(int oldFD, const char* userOldPath, int newFD,
9517	const char* userNewPath)
9518{
9519	KPath oldPathBuffer;
9520	KPath newPathBuffer;
9521	if (oldPathBuffer.InitCheck() != B_OK || newPathBuffer.InitCheck() != B_OK)
9522		return B_NO_MEMORY;
9523
9524	char* oldPath = oldPathBuffer.LockBuffer();
9525	char* newPath = newPathBuffer.LockBuffer();
9526
9527	if (!IS_USER_ADDRESS(userOldPath) || !IS_USER_ADDRESS(userNewPath))
9528		return B_BAD_ADDRESS;
9529	status_t status = user_copy_name(oldPath, userOldPath, B_PATH_NAME_LENGTH);
9530	if (status != B_OK)
9531		return status;
9532	status = user_copy_name(newPath, userNewPath, B_PATH_NAME_LENGTH);
9533	if (status != B_OK)
9534		return status;
9535
9536	return common_rename(oldFD, oldPath, newFD, newPath, false);
9537}
9538
9539
9540status_t
9541_user_create_fifo(int fd, const char* userPath, mode_t perms)
9542{
9543	KPath pathBuffer;
9544	if (pathBuffer.InitCheck() != B_OK)
9545		return B_NO_MEMORY;
9546
9547	char* path = pathBuffer.LockBuffer();
9548
9549	if (!IS_USER_ADDRESS(userPath))
9550		return B_BAD_ADDRESS;
9551	status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
9552	if (status != B_OK)
9553		return status;
9554
9555	// split into directory vnode and filename path
9556	char filename[B_FILE_NAME_LENGTH];
9557	VnodePutter dir;
9558	status = fd_and_path_to_dir_vnode(fd, path, dir, filename, false);
9559	if (status != B_OK)
9560		return status;
9561
9562	// the underlying FS needs to support creating FIFOs
9563	if (!HAS_FS_CALL(dir, create_special_node))
9564		return B_UNSUPPORTED;
9565
9566	// create the entry	-- the FIFO sub node is set up automatically
9567	fs_vnode superVnode;
9568	ino_t nodeID;
9569	status = FS_CALL(dir.Get(), create_special_node, filename, NULL,
9570		S_IFIFO | (perms & S_IUMSK), 0, &superVnode, &nodeID);
9571
9572	// create_special_node() acquired a reference for us that we don't need.
9573	if (status == B_OK)
9574		put_vnode(dir->mount->volume, nodeID);
9575
9576	return status;
9577}
9578
9579
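/*!	\brief Creates an anonymous FIFO and returns two FDs referring to it.

	The FIFO node is created in the root file system. On success
	\a userFDs receives the read end at index 0 and the write end at
	index 1, mirroring the POSIX pipe() convention.
*/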
9580status_t
9581_user_create_pipe(int* userFDs)
9582{
9583	// rootfs should support creating FIFOs, but let's be sure
9584	if (!HAS_FS_CALL(sRoot, create_special_node))
9585		return B_UNSUPPORTED;
9586
9587	// create the node	-- the FIFO sub node is set up automatically
9588	fs_vnode superVnode;
9589	ino_t nodeID;
9590	status_t status = FS_CALL(sRoot, create_special_node, NULL, NULL,
9591		S_IFIFO | S_IRUSR | S_IWUSR, 0, &superVnode, &nodeID);
9592	if (status != B_OK)
9593		return status;
9594
9595	// We've got one reference to the node and need another one.
9596	struct vnode* vnode;
9597	status = get_vnode(sRoot->mount->id, nodeID, &vnode, true, false);
9598	if (status != B_OK) {
9599		// that should not happen
		dprintf("_user_create_pipe(): Failed to lookup vnode (%" B_PRIdDEV ", "
			"%" B_PRIdINO ")\n", sRoot->mount->id, nodeID);
9602		return status;
9603	}
9604
	// Everything looks good so far. Open two FDs, one for reading and one
	// for writing.
9607	int fds[2];
9608	fds[0] = open_vnode(vnode, O_RDONLY, false);
9609	fds[1] = open_vnode(vnode, O_WRONLY, false);
9610
9611	FDCloser closer0(fds[0], false);
9612	FDCloser closer1(fds[1], false);
9613
9614	status = (fds[0] >= 0 ? (fds[1] >= 0 ? B_OK : fds[1]) : fds[0]);
9615
9616	// copy FDs to userland
9617	if (status == B_OK) {
9618		if (!IS_USER_ADDRESS(userFDs)
9619			|| user_memcpy(userFDs, fds, sizeof(fds)) != B_OK) {
9620			status = B_BAD_ADDRESS;
9621		}
9622	}
9623
9624	// keep FDs, if everything went fine
9625	if (status == B_OK) {
9626		closer0.Detach();
9627		closer1.Detach();
9628	}
9629
9630	return status;
9631}
9632
9633
9634status_t
9635_user_access(int fd, const char* userPath, int mode, bool effectiveUserGroup)
9636{
9637	KPath pathBuffer;
9638	if (pathBuffer.InitCheck() != B_OK)
9639		return B_NO_MEMORY;
9640
9641	char* path = pathBuffer.LockBuffer();
9642
9643	if (!IS_USER_ADDRESS(userPath))
9644		return B_BAD_ADDRESS;
9645	status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
9646	if (status != B_OK)
9647		return status;
9648
9649	return common_access(fd, path, mode, effectiveUserGroup, false);
9650}
9651
9652
9653status_t
9654_user_read_stat(int fd, const char* userPath, bool traverseLink,
9655	struct stat* userStat, size_t statSize)
9656{
9657	struct stat stat = {0};
9658	status_t status;
9659
9660	if (statSize > sizeof(struct stat))
9661		return B_BAD_VALUE;
9662
9663	if (!IS_USER_ADDRESS(userStat))
9664		return B_BAD_ADDRESS;
9665
9666	if (userPath != NULL) {
9667		// path given: get the stat of the node referred to by (fd, path)
9668		if (!IS_USER_ADDRESS(userPath))
9669			return B_BAD_ADDRESS;
9670
9671		KPath pathBuffer;
9672		if (pathBuffer.InitCheck() != B_OK)
9673			return B_NO_MEMORY;
9674
9675		char* path = pathBuffer.LockBuffer();
9676
9677		status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
9678		if (status != B_OK)
9679			return status;
9680
9681		status = common_path_read_stat(fd, path, traverseLink, &stat, false);
9682	} else {
9683		// no path given: get the FD and use the FD operation
9684		FileDescriptorPutter descriptor
9685			(get_fd(get_current_io_context(false), fd));
9686		if (!descriptor.IsSet())
9687			return B_FILE_ERROR;
9688
9689		if (descriptor->ops->fd_read_stat)
9690			status = descriptor->ops->fd_read_stat(descriptor.Get(), &stat);
9691		else
9692			status = B_UNSUPPORTED;
9693	}
9694
9695	if (status != B_OK)
9696		return status;
9697
9698	return user_memcpy(userStat, &stat, statSize);
9699}
9700
9701
9702status_t
9703_user_write_stat(int fd, const char* userPath, bool traverseLeafLink,
9704	const struct stat* userStat, size_t statSize, int statMask)
9705{
9706	if (statSize > sizeof(struct stat))
9707		return B_BAD_VALUE;
9708
9709	struct stat stat;
9710
9711	if (!IS_USER_ADDRESS(userStat)
		|| user_memcpy(&stat, userStat, statSize) != B_OK)
9713		return B_BAD_ADDRESS;
9714
9715	// clear additional stat fields
9716	if (statSize < sizeof(struct stat))
9717		memset((uint8*)&stat + statSize, 0, sizeof(struct stat) - statSize);
9718
9719	status_t status;
9720
9721	if (userPath != NULL) {
9722		// path given: write the stat of the node referred to by (fd, path)
9723		if (!IS_USER_ADDRESS(userPath))
9724			return B_BAD_ADDRESS;
9725
9726		KPath pathBuffer;
9727		if (pathBuffer.InitCheck() != B_OK)
9728			return B_NO_MEMORY;
9729
9730		char* path = pathBuffer.LockBuffer();
9731
9732		status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
9733		if (status != B_OK)
9734			return status;
9735
9736		status = common_path_write_stat(fd, path, traverseLeafLink, &stat,
9737			statMask, false);
9738	} else {
9739		// no path given: get the FD and use the FD operation
9740		FileDescriptorPutter descriptor
9741			(get_fd(get_current_io_context(false), fd));
9742		if (!descriptor.IsSet())
9743			return B_FILE_ERROR;
9744
9745		if (descriptor->ops->fd_write_stat) {
9746			status = descriptor->ops->fd_write_stat(descriptor.Get(), &stat,
9747				statMask);
9748		} else
9749			status = B_UNSUPPORTED;
9750	}
9751
9752	return status;
9753}
9754
9755
9756int
9757_user_open_attr_dir(int fd, const char* userPath, bool traverseLeafLink)
9758{
9759	KPath pathBuffer;
9760	if (pathBuffer.InitCheck() != B_OK)
9761		return B_NO_MEMORY;
9762
9763	char* path = pathBuffer.LockBuffer();
9764
9765	if (userPath != NULL) {
9766		if (!IS_USER_ADDRESS(userPath))
9767			return B_BAD_ADDRESS;
9768		status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
9769		if (status != B_OK)
9770			return status;
9771	}
9772
9773	return attr_dir_open(fd, userPath ? path : NULL, traverseLeafLink, false);
9774}
9775
9776
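/*!	Reads up to \a readBytes from attribute \a userAttribute of the node the
	FD refers to, starting at \a pos. Implemented by temporarily opening the
	attribute and delegating to _user_read().
*/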
ssize_t
_user_read_attr(int fd, const char* userAttribute, off_t pos, void* userBuffer,
	size_t readBytes)
{
	char attribute[B_FILE_NAME_LENGTH];

	if (userAttribute == NULL)
		return B_BAD_VALUE;
	if (!IS_USER_ADDRESS(userAttribute))
		return B_BAD_ADDRESS;
	status_t status = user_copy_name(attribute, userAttribute,
		sizeof(attribute));
	if (status != B_OK)
		return status;

	int attr = attr_open(fd, NULL, attribute, O_RDONLY, false);
	if (attr < 0)
		return attr;

	ssize_t bytes = _user_read(attr, pos, userBuffer, readBytes);
	_user_close(attr);

	return bytes;
}


ssize_t
_user_write_attr(int fd, const char* userAttribute, uint32 type, off_t pos,
	const void* buffer, size_t writeBytes)
{
	char attribute[B_FILE_NAME_LENGTH];

	if (userAttribute == NULL)
		return B_BAD_VALUE;
	if (!IS_USER_ADDRESS(userAttribute))
		return B_BAD_ADDRESS;
	status_t status = user_copy_name(attribute, userAttribute,
		sizeof(attribute));
	if (status != B_OK)
		return status;

	// Mimic the BeOS semantics: a write at position 0 replaces (truncates)
	// the attribute contents, while a write at any other position patches
	// them in place.
	int attr = attr_create(fd, NULL, attribute, type,
		O_CREAT | O_WRONLY | (pos != 0 ? 0 : O_TRUNC), false);
	if (attr < 0)
		return attr;

	ssize_t bytes = _user_write(attr, pos, buffer, writeBytes);
	_user_close(attr);

	return bytes;
}
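
/*	Illustration of the resulting semantics via the public fs_attr API
	(sketch only; "demo:attr" is an arbitrary example name):

		// pos == 0: the attribute is truncated, then fully rewritten
		fs_write_attr(fd, "demo:attr", B_STRING_TYPE, 0, "hello", 6);

		// pos != 0: the bytes at offset 2 are overwritten, no truncation
		fs_write_attr(fd, "demo:attr", B_STRING_TYPE, 2, "XY", 2);
*/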


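/*!	Returns type and size of attribute \a userAttribute of the node \a fd
	refers to, by opening the attribute and reading its stat.
*/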
status_t
_user_stat_attr(int fd, const char* userAttribute,
	struct attr_info* userAttrInfo)
{
	char attribute[B_FILE_NAME_LENGTH];

	if (userAttribute == NULL || userAttrInfo == NULL)
		return B_BAD_VALUE;
	if (!IS_USER_ADDRESS(userAttribute) || !IS_USER_ADDRESS(userAttrInfo))
		return B_BAD_ADDRESS;
	status_t status = user_copy_name(attribute, userAttribute,
		sizeof(attribute));
	if (status != B_OK)
		return status;

	int attr = attr_open(fd, NULL, attribute, O_RDONLY, false);
	if (attr < 0)
		return attr;

	struct file_descriptor* descriptor
		= get_fd(get_current_io_context(false), attr);
	if (descriptor == NULL) {
		_user_close(attr);
		return B_FILE_ERROR;
	}

	struct stat stat;
	if (descriptor->ops->fd_read_stat)
		status = descriptor->ops->fd_read_stat(descriptor, &stat);
	else
		status = B_UNSUPPORTED;

	put_fd(descriptor);
	_user_close(attr);

	if (status == B_OK) {
		attr_info info;
		info.type = stat.st_type;
		info.size = stat.st_size;

		if (user_memcpy(userAttrInfo, &info, sizeof(struct attr_info)) != B_OK)
			return B_BAD_ADDRESS;
	}

	return status;
}


int
_user_open_attr(int fd, const char* userPath, const char* userName,
	uint32 type, int openMode)
{
	char name[B_FILE_NAME_LENGTH];

	if (!IS_USER_ADDRESS(userName))
		return B_BAD_ADDRESS;
	status_t status = user_copy_name(name, userName, B_FILE_NAME_LENGTH);
	if (status != B_OK)
		return status;

	KPath pathBuffer;
	if (pathBuffer.InitCheck() != B_OK)
		return B_NO_MEMORY;

	char* path = pathBuffer.LockBuffer();

	if (userPath != NULL) {
		if (!IS_USER_ADDRESS(userPath))
			return B_BAD_ADDRESS;
		status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
		if (status != B_OK)
			return status;
	}

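	// O_CREAT needs the attribute type, hence the separate attr_create() path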
	if ((openMode & O_CREAT) != 0) {
		return attr_create(fd, userPath ? path : NULL, name, type, openMode,
			false);
	}

	return attr_open(fd, userPath ? path : NULL, name, openMode, false);
}


status_t
_user_remove_attr(int fd, const char* userName)
{
	char name[B_FILE_NAME_LENGTH];

	if (!IS_USER_ADDRESS(userName))
		return B_BAD_ADDRESS;
	status_t status = user_copy_name(name, userName, B_FILE_NAME_LENGTH);
	if (status != B_OK)
		return status;

	return attr_remove(fd, name, false);
}


status_t
_user_rename_attr(int fromFile, const char* userFromName, int toFile,
	const char* userToName)
{
	if (!IS_USER_ADDRESS(userFromName)
		|| !IS_USER_ADDRESS(userToName))
		return B_BAD_ADDRESS;

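	// Use KPath buffers, presumably to keep the two name buffers off the
	// limited kernel stack.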
	KPath fromNameBuffer(B_FILE_NAME_LENGTH);
	KPath toNameBuffer(B_FILE_NAME_LENGTH);
	if (fromNameBuffer.InitCheck() != B_OK || toNameBuffer.InitCheck() != B_OK)
		return B_NO_MEMORY;

	char* fromName = fromNameBuffer.LockBuffer();
	char* toName = toNameBuffer.LockBuffer();

	status_t status = user_copy_name(fromName, userFromName,
		B_FILE_NAME_LENGTH);
	if (status != B_OK)
		return status;
	status = user_copy_name(toName, userToName, B_FILE_NAME_LENGTH);
	if (status != B_OK)
		return status;

	return attr_rename(fromFile, fromName, toFile, toName, false);
}


int
_user_open_index_dir(dev_t device)
{
	return index_dir_open(device, false);
}


status_t
_user_create_index(dev_t device, const char* userName, uint32 type,
	uint32 flags)
{
	char name[B_FILE_NAME_LENGTH];

	if (!IS_USER_ADDRESS(userName))
		return B_BAD_ADDRESS;
	status_t status = user_copy_name(name, userName, B_FILE_NAME_LENGTH);
	if (status != B_OK)
		return status;

	return index_create(device, name, type, flags, false);
}


status_t
_user_read_index_stat(dev_t device, const char* userName, struct stat* userStat)
{
	char name[B_FILE_NAME_LENGTH];
	struct stat stat = {0};
	status_t status;

	if (!IS_USER_ADDRESS(userName) || !IS_USER_ADDRESS(userStat))
		return B_BAD_ADDRESS;
	status = user_copy_name(name, userName, B_FILE_NAME_LENGTH);
	if (status != B_OK)
		return status;

	status = index_name_read_stat(device, name, &stat, false);
	if (status == B_OK) {
		if (user_memcpy(userStat, &stat, sizeof(stat)) != B_OK)
			return B_BAD_ADDRESS;
	}

	return status;
}


status_t
_user_remove_index(dev_t device, const char* userName)
{
	char name[B_FILE_NAME_LENGTH];

	if (!IS_USER_ADDRESS(userName))
		return B_BAD_ADDRESS;
	status_t status = user_copy_name(name, userName, B_FILE_NAME_LENGTH);
	if (status != B_OK)
		return status;

	return index_remove(device, name, false);
}


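/*!	Copies the current working directory of the calling team into
	\a userBuffer. Since \a size is only clamped to kMaxPathLength, getcwd()
	is not limited to B_PATH_NAME_LENGTH.
*/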
status_t
_user_getcwd(char* userBuffer, size_t size)
{
	if (size == 0)
		return B_BAD_VALUE;
	if (!IS_USER_ADDRESS(userBuffer))
		return B_BAD_ADDRESS;

	if (size > kMaxPathLength)
		size = kMaxPathLength;

	KPath pathBuffer(size);
	if (pathBuffer.InitCheck() != B_OK)
		return B_NO_MEMORY;

	TRACE(("user_getcwd: buf %p, %" B_PRIuSIZE "\n", userBuffer, size));

	char* path = pathBuffer.LockBuffer();

	status_t status = get_cwd(path, size, false);
	if (status != B_OK)
		return status;

	// copy the result back to userland
	if (user_strlcpy(userBuffer, path, size) < B_OK)
		return B_BAD_ADDRESS;

	return status;
}


status_t
_user_setcwd(int fd, const char* userPath)
{
	TRACE(("user_setcwd: path = %p\n", userPath));

	KPath pathBuffer;
	if (pathBuffer.InitCheck() != B_OK)
		return B_NO_MEMORY;

	char* path = pathBuffer.LockBuffer();

	if (userPath != NULL) {
		if (!IS_USER_ADDRESS(userPath))
			return B_BAD_ADDRESS;
		status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
		if (status != B_OK)
			return status;
	}

	return set_cwd(fd, userPath != NULL ? path : NULL, false);
}


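/*!	Implements chroot(): makes the vnode \a userPath resolves to the root of
	the calling team's I/O context. Restricted to the root user.
*/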
status_t
_user_change_root(const char* userPath)
{
	// only root is allowed to chroot()
	if (geteuid() != 0)
		return B_NOT_ALLOWED;

	// allocate a path buffer
	KPath pathBuffer;
	if (pathBuffer.InitCheck() != B_OK)
		return B_NO_MEMORY;

	// copy the userland path to the kernel
	char* path = pathBuffer.LockBuffer();
	if (userPath != NULL) {
		if (!IS_USER_ADDRESS(userPath))
			return B_BAD_ADDRESS;
		status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
		if (status != B_OK)
			return status;
	}

	// get the vnode
	VnodePutter vnode;
	status_t status = path_to_vnode(path, true, vnode, NULL, false);
	if (status != B_OK)
		return status;

	// set the new root; sIOContextRootLock serializes accesses to
	// context->root
	struct io_context* context = get_current_io_context(false);
	mutex_lock(&sIOContextRootLock);
	struct vnode* oldRoot = context->root;
	context->root = vnode.Detach();
	mutex_unlock(&sIOContextRootLock);

	put_vnode(oldRoot);

	return B_OK;
}


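/*!	Opens a query on \a device. \a port and \a token identify the target for
	live query updates (if requested via \a flags).
*/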
int
_user_open_query(dev_t device, const char* userQuery, size_t queryLength,
	uint32 flags, port_id port, int32 token)
{
	if (device < 0 || userQuery == NULL || queryLength == 0)
		return B_BAD_VALUE;

	if (!IS_USER_ADDRESS(userQuery))
		return B_BAD_ADDRESS;

	// this is a safety restriction against unbounded kernel allocations
	if (queryLength >= 65536)
		return B_NAME_TOO_LONG;

	BStackOrHeapArray<char, 128> query(queryLength + 1);
	if (!query.IsValid())
		return B_NO_MEMORY;

	if (user_strlcpy(query, userQuery, queryLength + 1) < B_OK)
		return B_BAD_ADDRESS;

	return query_open(device, query, flags, port, token, false);
}


#include "vfs_request_io.cpp"