/*
 * Copyright 2005-2013, Ingo Weinhold, ingo_weinhold@gmx.de.
 * Copyright 2002-2018, Axel Dörfler, axeld@pinc-software.de.
 * Distributed under the terms of the MIT License.
 *
 * Copyright 2001-2002, Travis Geiselbrecht. All rights reserved.
 * Distributed under the terms of the NewOS License.
 */


/*! Virtual File System and File System Interface Layer */


#include <ctype.h>
#include <fcntl.h>
#include <limits.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>
#include <sys/file.h>
#include <sys/ioctl.h>
#include <sys/resource.h>
#include <sys/stat.h>
#include <unistd.h>

#include <fs_attr.h>
#include <fs_info.h>
#include <fs_interface.h>
#include <fs_volume.h>
#include <NodeMonitor.h>
#include <OS.h>
#include <StorageDefs.h>

#include <AutoDeleter.h>
#include <AutoDeleterDrivers.h>
#include <block_cache.h>
#include <boot/kernel_args.h>
#include <debug_heap.h>
#include <disk_device_manager/KDiskDevice.h>
#include <disk_device_manager/KDiskDeviceManager.h>
#include <disk_device_manager/KDiskDeviceUtils.h>
#include <disk_device_manager/KDiskSystem.h>
#include <fd.h>
#include <file_cache.h>
#include <fs/node_monitor.h>
#include <KPath.h>
#include <lock.h>
#include <low_resource_manager.h>
#include <slab/Slab.h>
#include <StackOrHeapArray.h>
#include <syscalls.h>
#include <syscall_restart.h>
#include <tracing.h>
#include <util/atomic.h>
#include <util/AutoLock.h>
#include <util/ThreadAutoLock.h>
#include <util/DoublyLinkedList.h>
#include <vfs.h>
#include <vm/vm.h>
#include <vm/VMCache.h>
#include <wait_for_objects.h>

#include "EntryCache.h"
#include "fifo.h"
#include "IORequest.h"
#include "unused_vnodes.h"
#include "vfs_tracing.h"
#include "Vnode.h"
#include "../cache/vnode_store.h"


//#define TRACE_VFS
#ifdef TRACE_VFS
#	define TRACE(x) dprintf x
#	define FUNCTION(x) dprintf x
#else
#	define TRACE(x) ;
#	define FUNCTION(x) ;
#endif

#define ADD_DEBUGGER_COMMANDS


#define HAS_FS_CALL(vnode, op)			(vnode->ops->op != NULL)
#define HAS_FS_MOUNT_CALL(mount, op)	(mount->volume->ops->op != NULL)

#if KDEBUG
#	define FS_CALL(vnode, op, params...) \
		( HAS_FS_CALL(vnode, op) ? \
			vnode->ops->op(vnode->mount->volume, vnode, params) \
			: (panic("FS_CALL: vnode %p op " #op " is NULL", vnode), 0))
#	define FS_CALL_NO_PARAMS(vnode, op) \
		( HAS_FS_CALL(vnode, op) ? \
			vnode->ops->op(vnode->mount->volume, vnode) \
			: (panic("FS_CALL_NO_PARAMS: vnode %p op " #op " is NULL", vnode), 0))
#	define FS_MOUNT_CALL(mount, op, params...) \
		( HAS_FS_MOUNT_CALL(mount, op) ? \
			mount->volume->ops->op(mount->volume, params) \
			: (panic("FS_MOUNT_CALL: mount %p op " #op " is NULL", mount), 0))
#	define FS_MOUNT_CALL_NO_PARAMS(mount, op) \
		( HAS_FS_MOUNT_CALL(mount, op) ? \
			mount->volume->ops->op(mount->volume) \
			: (panic("FS_MOUNT_CALL_NO_PARAMS: mount %p op " #op " is NULL", mount), 0))
#else
#	define FS_CALL(vnode, op, params...) \
			vnode->ops->op(vnode->mount->volume, vnode, params)
#	define FS_CALL_NO_PARAMS(vnode, op) \
			vnode->ops->op(vnode->mount->volume, vnode)
#	define FS_MOUNT_CALL(mount, op, params...) \
			mount->volume->ops->op(mount->volume, params)
#	define FS_MOUNT_CALL_NO_PARAMS(mount, op) \
			mount->volume->ops->op(mount->volume)
#endif


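/*!	Illustrative sketch (not compiled): FS_CALL() dispatches to the hook table
	of the file system backing a vnode. A read through the VFS essentially
	boils down to the following (the cookie/pos/buffer/_length names are
	placeholders, not the exact variables used below):

		size_t length = *_length;
		status_t status = B_UNSUPPORTED;
		if (HAS_FS_CALL(vnode, read))
			status = FS_CALL(vnode, read, cookie, pos, buffer, &length);

	Since the KDEBUG variants panic on a NULL hook, optional hooks are
	guarded with HAS_FS_CALL()/HAS_FS_MOUNT_CALL() before dispatching.
*/

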
const static size_t kMaxPathLength = 65536;
	// The absolute maximum path length (for getcwd(); this does not depend
	// on PATH_MAX)


typedef DoublyLinkedList<vnode> VnodeList;

/*!	\brief Structure to manage a mounted file system

	Note: The root_vnode and root_vnode->covers fields (what others?) are
	initialized in fs_mount() and not changed afterwards. That is, as soon
	as the mount is mounted, and as long as it is made sure it won't be
	unmounted (e.g. by holding a reference to a vnode of that mount), (read)
	access to those fields is always safe, even without additional locking.
	Moreover, while mounted the mount holds a reference to the
	root_vnode->covers vnode, thus making the access path
	vnode->mount->root_vnode->covers->mount->... safe if a reference to vnode
	is held (note that for the root mount root_vnode->covers is NULL, though).
*/
struct fs_mount {
	fs_mount()
		:
		volume(NULL),
		device_name(NULL)
	{
		mutex_init(&lock, "mount lock");
	}

	~fs_mount()
	{
		mutex_destroy(&lock);
		free(device_name);

		while (volume) {
			fs_volume* superVolume = volume->super_volume;

			if (volume->file_system != NULL)
				put_module(volume->file_system->info.name);

			free(volume->file_system_name);
			free(volume);
			volume = superVolume;
		}
	}

	struct fs_mount* next;
	dev_t			id;
	fs_volume*		volume;
	char*			device_name;
	mutex			lock;	// guards the vnodes list
	struct vnode*	root_vnode;
	struct vnode*	covers_vnode;	// immutable
	KPartition*		partition;
	VnodeList		vnodes;
	EntryCache		entry_cache;
	bool			unmounting;
	bool			owns_file_device;
};


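/*!	Illustrative sketch (not compiled): per the note above, holding a vnode
	reference keeps its mount alive, so a chained access like

		struct vnode* covered = vnode->mount->root_vnode->covers;
		dev_t coveredDevice = covered != NULL ? covered->mount->id : -1;

	is safe without additional locking. For the root mount,
	root_vnode->covers is NULL, hence the check.
*/

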
namespace {

struct advisory_lock : public DoublyLinkedListLinkImpl<advisory_lock> {
	list_link		link;
	void*			bound_to;
	team_id			team;
	pid_t			session;
	off_t			start;
	off_t			end;
	bool			shared;
};

typedef DoublyLinkedList<advisory_lock> LockList;

} // namespace


struct advisory_locking {
	sem_id			lock;
	sem_id			wait_sem;
	LockList		locks;

	advisory_locking()
		:
		lock(-1),
		wait_sem(-1)
	{
	}

	~advisory_locking()
	{
		if (lock >= 0)
			delete_sem(lock);
		if (wait_sem >= 0)
			delete_sem(wait_sem);
	}
};

/*!	\brief Guards sMountsTable.

	The holder is allowed read/write access to sMountsTable.
	Manipulation of the fs_mount structures themselves
	(and their destruction) requires different locks though.
*/
static rw_lock sMountLock = RW_LOCK_INITIALIZER("vfs_mount_lock");

/*!	\brief Guards mount/unmount operations.

	fs_mount() and fs_unmount() hold the lock for the duration of their
	operation. That is, locking it ensures that no FS is mounted or
	unmounted. In particular this means that
	- sMountsTable will not be modified,
	- the fields of the fs_mount structures in sMountsTable that are
	  immutable after initialization will not be modified.

	The thread trying to lock the lock must not hold sVnodeLock or
	sMountLock.
*/
static recursive_lock sMountOpLock;

/*!	\brief Guards sVnodeTable.

	The holder is allowed read/write access to sVnodeTable and to
	any unbusy vnode in that table, save for the immutable fields (device, id,
	private_node, mount) to which only read-only access is allowed.
	The mutable fields advisory_locking, mandatory_locked_by, and ref_count, as
	well as the busy, removed, unused flags, and the vnode's type can also be
	write accessed when holding a read lock to sVnodeLock *and* having the vnode
	locked. Write access to covered_by and covers requires write locking
	sVnodeLock.

	The thread trying to acquire the lock must not hold sMountLock.
	You must not hold this lock when calling create_sem(), as this might call
	vfs_free_unused_vnodes() and thus cause a deadlock.
*/
static rw_lock sVnodeLock = RW_LOCK_INITIALIZER("vfs_vnode_lock");

/*!	\brief Guards io_context::root.

	Must be held when setting or getting the io_context::root field.
	The only operation allowed while holding this lock besides getting or
	setting the field is inc_vnode_ref_count() on io_context::root.
*/
static mutex sIOContextRootLock = MUTEX_INITIALIZER("io_context::root lock");


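/*!	Illustrative sketch (not compiled): the lock ordering implied by the
	restrictions above. sVnodeLock may be read locked first with sMountLock
	nested inside it (cf. get_mount()), never the other way around, and
	sMountOpLock must be acquired before either of them:

		ReadLocker nodeLocker(sVnodeLock);
		ReadLocker mountLocker(sMountLock);
		// ... look up a mount and inspect its root vnode ...
*/

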
namespace {

struct vnode_hash_key {
	dev_t	device;
	ino_t	vnode;
};

struct VnodeHash {
	typedef vnode_hash_key	KeyType;
	typedef	struct vnode	ValueType;

#define VHASH(mountid, vnodeid) \
	(((uint32)((vnodeid) >> 32) + (uint32)(vnodeid)) ^ (uint32)(mountid))

	size_t HashKey(KeyType key) const
	{
		return VHASH(key.device, key.vnode);
	}

	size_t Hash(ValueType* vnode) const
	{
		return VHASH(vnode->device, vnode->id);
	}

#undef VHASH

	bool Compare(KeyType key, ValueType* vnode) const
	{
		return vnode->device == key.device && vnode->id == key.vnode;
	}

	ValueType*& GetLink(ValueType* value) const
	{
		return value->next;
	}
};

typedef BOpenHashTable<VnodeHash> VnodeTable;


struct MountHash {
	typedef dev_t			KeyType;
	typedef	struct fs_mount	ValueType;

	size_t HashKey(KeyType key) const
	{
		return key;
	}

	size_t Hash(ValueType* mount) const
	{
		return mount->id;
	}

	bool Compare(KeyType key, ValueType* mount) const
	{
		return mount->id == key;
	}

	ValueType*& GetLink(ValueType* value) const
	{
		return value->next;
	}
};

typedef BOpenHashTable<MountHash> MountTable;

} // namespace


object_cache* sPathNameCache;
object_cache* sVnodeCache;
object_cache* sFileDescriptorCache;

#define VNODE_HASH_TABLE_SIZE 1024
static VnodeTable* sVnodeTable;
static struct vnode* sRoot;

#define MOUNTS_HASH_TABLE_SIZE 16
static MountTable* sMountsTable;
static dev_t sNextMountID = 1;

#define MAX_TEMP_IO_VECS 8

// How long to wait for busy vnodes: 2000 retries with a 5000 µs delay each,
// i.e. 10 seconds in total
#define BUSY_VNODE_RETRIES 2000
#define BUSY_VNODE_DELAY 5000

mode_t __gUmask = 022;

/* function declarations */

static void free_unused_vnodes();

// file descriptor operation prototypes
static status_t file_read(struct file_descriptor* descriptor, off_t pos,
	void* buffer, size_t* _bytes);
static status_t file_write(struct file_descriptor* descriptor, off_t pos,
	const void* buffer, size_t* _bytes);
static off_t file_seek(struct file_descriptor* descriptor, off_t pos,
	int seekType);
static void file_free_fd(struct file_descriptor* descriptor);
static status_t file_close(struct file_descriptor* descriptor);
static status_t file_select(struct file_descriptor* descriptor, uint8 event,
	struct selectsync* sync);
static status_t file_deselect(struct file_descriptor* descriptor, uint8 event,
	struct selectsync* sync);
static status_t dir_read(struct io_context* context,
	struct file_descriptor* descriptor, struct dirent* buffer,
	size_t bufferSize, uint32* _count);
static status_t dir_read(struct io_context* ioContext, struct vnode* vnode,
	void* cookie, struct dirent* buffer, size_t bufferSize, uint32* _count);
static status_t dir_rewind(struct file_descriptor* descriptor);
static void dir_free_fd(struct file_descriptor* descriptor);
static status_t dir_close(struct file_descriptor* descriptor);
static status_t attr_dir_read(struct io_context* context,
	struct file_descriptor* descriptor, struct dirent* buffer,
	size_t bufferSize, uint32* _count);
static status_t attr_dir_rewind(struct file_descriptor* descriptor);
static void attr_dir_free_fd(struct file_descriptor* descriptor);
static status_t attr_dir_close(struct file_descriptor* descriptor);
static status_t attr_read(struct file_descriptor* descriptor, off_t pos,
	void* buffer, size_t* _bytes);
static status_t attr_write(struct file_descriptor* descriptor, off_t pos,
	const void* buffer, size_t* _bytes);
static off_t attr_seek(struct file_descriptor* descriptor, off_t pos,
	int seekType);
static void attr_free_fd(struct file_descriptor* descriptor);
static status_t attr_close(struct file_descriptor* descriptor);
static status_t attr_read_stat(struct file_descriptor* descriptor,
	struct stat* statData);
static status_t attr_write_stat(struct file_descriptor* descriptor,
	const struct stat* stat, int statMask);
static status_t index_dir_read(struct io_context* context,
	struct file_descriptor* descriptor, struct dirent* buffer,
	size_t bufferSize, uint32* _count);
static status_t index_dir_rewind(struct file_descriptor* descriptor);
static void index_dir_free_fd(struct file_descriptor* descriptor);
static status_t index_dir_close(struct file_descriptor* descriptor);
static status_t query_read(struct io_context* context,
	struct file_descriptor* descriptor, struct dirent* buffer,
	size_t bufferSize, uint32* _count);
static status_t query_rewind(struct file_descriptor* descriptor);
static void query_free_fd(struct file_descriptor* descriptor);
static status_t query_close(struct file_descriptor* descriptor);

static status_t common_ioctl(struct file_descriptor* descriptor, ulong op,
	void* buffer, size_t length);
static status_t common_read_stat(struct file_descriptor* descriptor,
	struct stat* statData);
static status_t common_write_stat(struct file_descriptor* descriptor,
	const struct stat* statData, int statMask);
static status_t common_path_read_stat(int fd, char* path, bool traverseLeafLink,
	struct stat* stat, bool kernel);

static status_t vnode_path_to_vnode(struct vnode* vnode, char* path,
	bool traverseLeafLink, bool kernel,
	VnodePutter& _vnode, ino_t* _parentID, char* leafName = NULL);
static status_t dir_vnode_to_path(struct vnode* vnode, char* buffer,
	size_t bufferSize, bool kernel);
static status_t fd_and_path_to_vnode(int fd, char* path, bool traverseLeafLink,
	VnodePutter& _vnode, ino_t* _parentID, bool kernel);
static void inc_vnode_ref_count(struct vnode* vnode);
static status_t dec_vnode_ref_count(struct vnode* vnode, bool alwaysFree,
	bool reenter);
static inline void put_vnode(struct vnode* vnode);
static status_t fs_unmount(char* path, dev_t mountID, uint32 flags,
	bool kernel);
static int open_vnode(struct vnode* vnode, int openMode, bool kernel);


static struct fd_ops sFileOps = {
	file_read,
	file_write,
	file_seek,
	common_ioctl,
	NULL,		// set_flags
	file_select,
	file_deselect,
	NULL,		// read_dir()
	NULL,		// rewind_dir()
	common_read_stat,
	common_write_stat,
	file_close,
	file_free_fd
};

static struct fd_ops sDirectoryOps = {
	NULL,		// read()
	NULL,		// write()
	NULL,		// seek()
	common_ioctl,
	NULL,		// set_flags
	NULL,		// select()
	NULL,		// deselect()
	dir_read,
	dir_rewind,
	common_read_stat,
	common_write_stat,
	dir_close,
	dir_free_fd
};

static struct fd_ops sAttributeDirectoryOps = {
	NULL,		// read()
	NULL,		// write()
	NULL,		// seek()
	common_ioctl,
	NULL,		// set_flags
	NULL,		// select()
	NULL,		// deselect()
	attr_dir_read,
	attr_dir_rewind,
	common_read_stat,
	common_write_stat,
	attr_dir_close,
	attr_dir_free_fd
};

static struct fd_ops sAttributeOps = {
	attr_read,
	attr_write,
	attr_seek,
	common_ioctl,
	NULL,		// set_flags
	NULL,		// select()
	NULL,		// deselect()
	NULL,		// read_dir()
	NULL,		// rewind_dir()
	attr_read_stat,
	attr_write_stat,
	attr_close,
	attr_free_fd
};

static struct fd_ops sIndexDirectoryOps = {
	NULL,		// read()
	NULL,		// write()
	NULL,		// seek()
	NULL,		// ioctl()
	NULL,		// set_flags
	NULL,		// select()
	NULL,		// deselect()
	index_dir_read,
	index_dir_rewind,
	NULL,		// read_stat()
	NULL,		// write_stat()
	index_dir_close,
	index_dir_free_fd
};

#if 0
static struct fd_ops sIndexOps = {
	NULL,		// read()
	NULL,		// write()
	NULL,		// seek()
	NULL,		// ioctl()
	NULL,		// set_flags
	NULL,		// select()
	NULL,		// deselect()
	NULL,		// dir_read()
	NULL,		// dir_rewind()
	index_read_stat,	// read_stat()
	NULL,		// write_stat()
	NULL,		// dir_close()
	NULL		// free_fd()
};
#endif

static struct fd_ops sQueryOps = {
	NULL,		// read()
	NULL,		// write()
	NULL,		// seek()
	NULL,		// ioctl()
	NULL,		// set_flags
	NULL,		// select()
	NULL,		// deselect()
	query_read,
	query_rewind,
	NULL,		// read_stat()
	NULL,		// write_stat()
	query_close,
	query_free_fd
};


namespace {

class FDCloser {
public:
	FDCloser() : fFD(-1), fKernel(true) {}

	FDCloser(int fd, bool kernel) : fFD(fd), fKernel(kernel) {}

	~FDCloser()
	{
		Close();
	}

	void SetTo(int fd, bool kernel)
	{
		Close();
		fFD = fd;
		fKernel = kernel;
	}

	void Close()
	{
		if (fFD >= 0) {
			if (fKernel)
				_kern_close(fFD);
			else
				_user_close(fFD);
			fFD = -1;
		}
	}

	int Detach()
	{
		int fd = fFD;
		fFD = -1;
		return fd;
	}

private:
	int		fFD;
	bool	fKernel;
};

} // namespace


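/*!	Illustrative sketch (not compiled): typical FDCloser usage. The descriptor
	is closed automatically on every error path; Detach() hands ownership to
	the caller on success. open_directory_fd() and do_something_with() are
	made-up placeholders for any fd-producing/fd-consuming calls:

		int fd = open_directory_fd(vnode, kernel);
		if (fd < 0)
			return fd;
		FDCloser fdCloser(fd, kernel);

		status_t status = do_something_with(fd);
		if (status != B_OK)
			return status;	// fdCloser closes fd here

		return fdCloser.Detach();
*/

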
#if VFS_PAGES_IO_TRACING

namespace VFSPagesIOTracing {

class PagesIOTraceEntry : public AbstractTraceEntry {
protected:
	PagesIOTraceEntry(struct vnode* vnode, void* cookie, off_t pos,
		const generic_io_vec* vecs, uint32 count, uint32 flags,
		generic_size_t bytesRequested, status_t status,
		generic_size_t bytesTransferred)
		:
		fVnode(vnode),
		fMountID(vnode->mount->id),
		fNodeID(vnode->id),
		fCookie(cookie),
		fPos(pos),
		fCount(count),
		fFlags(flags),
		fBytesRequested(bytesRequested),
		fStatus(status),
		fBytesTransferred(bytesTransferred)
	{
		fVecs = (generic_io_vec*)alloc_tracing_buffer_memcpy(vecs,
			sizeof(generic_io_vec) * count, false);
	}

	void AddDump(TraceOutput& out, const char* mode)
	{
		out.Print("vfs pages io %5s: vnode: %p (%" B_PRId32 ", %" B_PRId64 "), "
			"cookie: %p, pos: %" B_PRIdOFF ", size: %" B_PRIu64 ", vecs: {",
			mode, fVnode, fMountID, fNodeID, fCookie, fPos,
			(uint64)fBytesRequested);

		if (fVecs != NULL) {
			for (uint32 i = 0; i < fCount; i++) {
				if (i > 0)
					out.Print(", ");
				out.Print("(%" B_PRIx64 ", %" B_PRIu64 ")", (uint64)fVecs[i].base,
					(uint64)fVecs[i].length);
			}
		}

		out.Print("}, flags: %#" B_PRIx32 " -> status: %#" B_PRIx32 ", "
			"transferred: %" B_PRIu64, fFlags, fStatus,
			(uint64)fBytesTransferred);
	}

protected:
	struct vnode*	fVnode;
	dev_t			fMountID;
	ino_t			fNodeID;
	void*			fCookie;
	off_t			fPos;
	generic_io_vec*	fVecs;
	uint32			fCount;
	uint32			fFlags;
	generic_size_t	fBytesRequested;
	status_t		fStatus;
	generic_size_t	fBytesTransferred;
};


class ReadPages : public PagesIOTraceEntry {
public:
	ReadPages(struct vnode* vnode, void* cookie, off_t pos,
		const generic_io_vec* vecs, uint32 count, uint32 flags,
		generic_size_t bytesRequested, status_t status,
		generic_size_t bytesTransferred)
		:
		PagesIOTraceEntry(vnode, cookie, pos, vecs, count, flags,
			bytesRequested, status, bytesTransferred)
	{
		Initialized();
	}

	virtual void AddDump(TraceOutput& out)
	{
		PagesIOTraceEntry::AddDump(out, "read");
	}
};


class WritePages : public PagesIOTraceEntry {
public:
	WritePages(struct vnode* vnode, void* cookie, off_t pos,
		const generic_io_vec* vecs, uint32 count, uint32 flags,
		generic_size_t bytesRequested, status_t status,
		generic_size_t bytesTransferred)
		:
		PagesIOTraceEntry(vnode, cookie, pos, vecs, count, flags,
			bytesRequested, status, bytesTransferred)
	{
		Initialized();
	}

	virtual void AddDump(TraceOutput& out)
	{
		PagesIOTraceEntry::AddDump(out, "write");
	}
};

}	// namespace VFSPagesIOTracing

#	define TPIO(x) new(std::nothrow) VFSPagesIOTracing::x;
#else
#	define TPIO(x) ;
#endif	// VFS_PAGES_IO_TRACING


/*! Finds the mounted device (the fs_mount structure) with the given ID.
	Note: you must hold sMountLock when calling this function.
*/
static struct fs_mount*
find_mount(dev_t id)
{
	ASSERT_READ_LOCKED_RW_LOCK(&sMountLock);

	return sMountsTable->Lookup(id);
}


static status_t
get_mount(dev_t id, struct fs_mount** _mount)
{
	struct fs_mount* mount;

	ReadLocker nodeLocker(sVnodeLock);
	ReadLocker mountLocker(sMountLock);

	mount = find_mount(id);
	if (mount == NULL)
		return B_BAD_VALUE;

	struct vnode* rootNode = mount->root_vnode;
	if (mount->unmounting || rootNode == NULL || rootNode->IsBusy()
		|| rootNode->ref_count == 0) {
		// might have been called during a mount/unmount operation
		return B_BUSY;
	}

	inc_vnode_ref_count(rootNode);
	*_mount = mount;
	return B_OK;
}


static void
put_mount(struct fs_mount* mount)
{
	if (mount)
		put_vnode(mount->root_vnode);
}


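/*!	Illustrative sketch (not compiled): get_mount()/put_mount() bracket
	accesses to a mount. get_mount() pins the mount by referencing its root
	vnode; put_mount() drops that reference again:

		struct fs_mount* mount;
		status_t status = get_mount(id, &mount);
		if (status != B_OK)
			return status;
		// ... use mount ...
		put_mount(mount);
*/

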
/*!	Tries to open the specified file system module.
	Accepts a file system name of the form "bfs" or "file_systems/bfs/v1".
	Returns a pointer to the file system module interface, or NULL if it
	could not open the module.
*/
static file_system_module_info*
get_file_system(const char* fsName)
{
	char name[B_FILE_NAME_LENGTH];
	if (strncmp(fsName, "file_systems/", strlen("file_systems/")) != 0) {
		// construct module name if we didn't get one
		// (we currently support only one API)
		snprintf(name, sizeof(name), "file_systems/%s/v1", fsName);
		fsName = NULL;
	}

	file_system_module_info* info;
	if (get_module(fsName ? fsName : name, (module_info**)&info) != B_OK)
		return NULL;

	return info;
}


/*!	Accepts a file system name of the form "bfs" or "file_systems/bfs/v1"
	and returns a compatible fs_info.fsh_name name ("bfs" in both cases).
	The name is allocated for you, and you have to free() it when you're
	done with it.
	Returns NULL if the required memory is not available.
*/
static char*
get_file_system_name(const char* fsName)
{
	const size_t length = strlen("file_systems/");

	if (strncmp(fsName, "file_systems/", length) != 0) {
		// the name already seems to be the module's file name
		return strdup(fsName);
	}

	fsName += length;
	const char* end = strchr(fsName, '/');
	if (end == NULL) {
		// this doesn't seem to be a valid name, but well...
		return strdup(fsName);
	}

	// cut off the trailing /v1

	char* name = (char*)malloc(end + 1 - fsName);
	if (name == NULL)
		return NULL;

	strlcpy(name, fsName, end + 1 - fsName);
	return name;
}


/*!	Accepts a list of file system names separated by colons, one for each
	layer, and returns the file system name for the specified layer.
	The name is allocated for you, and you have to free() it when you're
	done with it.
	Returns NULL if the required memory is not available or if there is no
	name for the specified layer.
*/
static char*
get_file_system_name_for_layer(const char* fsNames, int32 layer)
{
	while (layer >= 0) {
		const char* end = strchr(fsNames, ':');
		if (end == NULL) {
			if (layer == 0)
				return strdup(fsNames);
			return NULL;
		}

		if (layer == 0) {
			size_t length = end - fsNames + 1;
			char* result = (char*)malloc(length);
			if (result == NULL)
				return NULL;

			strlcpy(result, fsNames, length);
			return result;
		}

		fsNames = end + 1;
		layer--;
	}

	return NULL;
}


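/*!	Illustrative sketch (not compiled): for a hypothetical layered mount
	specification "bfs:write_overlay", get_file_system_name_for_layer()
	yields

		layer 0 -> "bfs"
		layer 1 -> "write_overlay"
		layer 2 -> NULL (no such layer)
*/

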
static void
add_vnode_to_mount_list(struct vnode* vnode, struct fs_mount* mount)
{
	MutexLocker _(mount->lock);
	mount->vnodes.Add(vnode);
}


static void
remove_vnode_from_mount_list(struct vnode* vnode, struct fs_mount* mount)
{
	MutexLocker _(mount->lock);
	mount->vnodes.Remove(vnode);
}


/*!	\brief Looks up a vnode by mount and node ID in the sVnodeTable.

	The caller must hold the sVnodeLock (read lock at least).

	\param mountID the mount ID.
	\param vnodeID the node ID.

	\return The vnode structure, if it was found in the hash table, \c NULL
			otherwise.
*/
static struct vnode*
lookup_vnode(dev_t mountID, ino_t vnodeID)
{
	ASSERT_READ_LOCKED_RW_LOCK(&sVnodeLock);

	struct vnode_hash_key key;

	key.device = mountID;
	key.vnode = vnodeID;

	return sVnodeTable->Lookup(key);
}

/*!	\brief Checks whether a busy vnode should be waited for (again).

	If one should keep waiting for the vnode to become unbusy, this function
	also snoozes for BUSY_VNODE_DELAY before returning.

	\return \c true if one should retry, \c false if not.
*/
static bool
retry_busy_vnode(int32& tries, dev_t mountID, ino_t vnodeID)
{
	if (--tries < 0) {
		// vnode doesn't seem to become unbusy
		dprintf("vnode %" B_PRIdDEV ":%" B_PRIdINO
			" is not becoming unbusy!\n", mountID, vnodeID);
		return false;
	}
	snooze(BUSY_VNODE_DELAY);
	return true;
}


/*!	Creates a new vnode with the given mount and node ID.
	If the node already exists, it is returned instead and no new node is
	created. In either case (but not if an error occurs) the function write
	locks \c sVnodeLock and keeps it locked for the caller when returning. On
	error the lock is not held on return.

	\param mountID The mount ID.
	\param vnodeID The vnode ID.
	\param _vnode Will be set to the new vnode on success.
	\param _nodeCreated Will be set to \c true when the returned vnode has
		been newly created, \c false when it already existed. Will not be
		changed on error.
	\return \c B_OK, when the vnode was successfully created and inserted or
		a node with the given ID was found, \c B_NO_MEMORY or
		\c B_ENTRY_NOT_FOUND on error.
*/
static status_t
create_new_vnode_and_lock(dev_t mountID, ino_t vnodeID, struct vnode*& _vnode,
	bool& _nodeCreated)
{
	FUNCTION(("create_new_vnode_and_lock()\n"));

	struct vnode* vnode = (struct vnode*)object_cache_alloc(sVnodeCache, 0);
	if (vnode == NULL)
		return B_NO_MEMORY;

	// initialize basic values
	memset(vnode, 0, sizeof(struct vnode));
	vnode->device = mountID;
	vnode->id = vnodeID;
	vnode->ref_count = 1;
	vnode->SetBusy(true);

	// look up the node -- it might have been added by someone else in the
	// meantime
	rw_lock_write_lock(&sVnodeLock);
	struct vnode* existingVnode = lookup_vnode(mountID, vnodeID);
	if (existingVnode != NULL) {
		object_cache_free(sVnodeCache, vnode, 0);
		_vnode = existingVnode;
		_nodeCreated = false;
		return B_OK;
	}

	// get the mount structure
	rw_lock_read_lock(&sMountLock);
	vnode->mount = find_mount(mountID);
	if (!vnode->mount || vnode->mount->unmounting) {
		rw_lock_read_unlock(&sMountLock);
		rw_lock_write_unlock(&sVnodeLock);
		object_cache_free(sVnodeCache, vnode, 0);
		return B_ENTRY_NOT_FOUND;
	}

	// add the vnode to the mount's node list and the hash table
	sVnodeTable->Insert(vnode);
	add_vnode_to_mount_list(vnode, vnode->mount);

	rw_lock_read_unlock(&sMountLock);

	_vnode = vnode;
	_nodeCreated = true;

	// keep the vnode lock locked
	return B_OK;
}


/*!	Frees the vnode and all resources it has acquired, and removes
	it from the vnode hash as well as from its mount structure.
	Will also make sure that any cache modifications are written back.
*/
static void
free_vnode(struct vnode* vnode, bool reenter)
{
	ASSERT_PRINT(vnode->ref_count == 0 && vnode->IsBusy(), "vnode: %p\n",
		vnode);
	ASSERT_PRINT(vnode->advisory_locking == NULL, "vnode: %p\n", vnode);

	// write back any changes in this vnode's cache -- but only
	// if the vnode won't be deleted, in which case the changes
	// will be discarded

	if (!vnode->IsRemoved() && HAS_FS_CALL(vnode, fsync))
		FS_CALL_NO_PARAMS(vnode, fsync);

	// Note: If this vnode has a cache attached, there will still be two
	// references to that cache at this point. The last one belongs to the vnode
	// itself (cf. vfs_get_vnode_cache()) and one belongs to the node's file
	// cache. Each but the last reference to a cache also includes a reference
	// to the vnode. The file cache, however, released its reference (cf.
	// file_cache_create()), so that this vnode's ref count has the chance to
	// ever drop to 0. Deleting the file cache now will cause the next to last
	// cache reference to be released, which will also release a (no longer
	// existing) vnode reference. To avoid problems, we set the vnode's ref
	// count to a value that will neither become negative nor 0.
	vnode->ref_count = 2;

	if (!vnode->IsUnpublished()) {
		if (vnode->IsRemoved())
			FS_CALL(vnode, remove_vnode, reenter);
		else
			FS_CALL(vnode, put_vnode, reenter);
	}

	// If the vnode has a VMCache attached, make sure that it won't try to get
	// another reference via VMVnodeCache::AcquireUnreferencedStoreRef(). As
	// long as the vnode is busy and in the hash, that won't happen, but as
	// soon as we've removed it from the hash, it could reload the vnode -- with
	// a new cache attached!
	if (vnode->cache != NULL && vnode->cache->type == CACHE_TYPE_VNODE)
		((VMVnodeCache*)vnode->cache)->VnodeDeleted();

	// The file system has removed the resources of the vnode now, so we can
	// make it available again (by removing the busy vnode from the hash).
	rw_lock_write_lock(&sVnodeLock);
	sVnodeTable->Remove(vnode);
	rw_lock_write_unlock(&sVnodeLock);

	// if we have a VMCache attached, remove it
	if (vnode->cache)
		vnode->cache->ReleaseRef();

	vnode->cache = NULL;

	remove_vnode_from_mount_list(vnode, vnode->mount);

	object_cache_free(sVnodeCache, vnode, 0);
}


/*!	\brief Decrements the reference counter of the given vnode and deletes it,
	if the counter dropped to 0.

	The caller must, of course, own a reference to the vnode to call this
	function.
	The caller must not hold the sVnodeLock or the sMountLock.

	\param vnode the vnode.
	\param alwaysFree don't move this vnode into the unused list, but really
		   delete it if possible.
	\param reenter \c true, if this function is called (indirectly) from within
		   a file system. This will be passed to file system hooks only.
	\return \c B_OK, if everything went fine, an error code otherwise.
*/
static status_t
dec_vnode_ref_count(struct vnode* vnode, bool alwaysFree, bool reenter)
{
	ReadLocker locker(sVnodeLock);
	AutoLocker<Vnode> nodeLocker(vnode);

	int32 oldRefCount = atomic_add(&vnode->ref_count, -1);

	ASSERT_PRINT(oldRefCount > 0, "vnode %p\n", vnode);

	TRACE(("dec_vnode_ref_count: vnode %p, ref now %" B_PRId32 "\n", vnode,
		vnode->ref_count));

	if (oldRefCount != 1)
		return B_OK;

	if (vnode->IsBusy())
		panic("dec_vnode_ref_count: called on busy vnode %p\n", vnode);

	bool freeNode = false;
	bool freeUnusedNodes = false;

	// Just insert the vnode into an unused list if we don't need
	// to delete it
	if (vnode->IsRemoved() || alwaysFree) {
		vnode_to_be_freed(vnode);
		vnode->SetBusy(true);
		freeNode = true;
	} else
		freeUnusedNodes = vnode_unused(vnode);

	nodeLocker.Unlock();
	locker.Unlock();

	if (freeNode)
		free_vnode(vnode, reenter);
	else if (freeUnusedNodes)
		free_unused_vnodes();

	return B_OK;
}


/*!	\brief Increments the reference counter of the given vnode.

	The caller must make sure that the node isn't deleted while this function
	is called. This can be done either:
	- by ensuring that a reference to the node exists and remains in existence,
	  or
	- by holding the vnode's lock (which also requires read locking sVnodeLock)
	  or by holding sVnodeLock write locked.

	In the second case the caller is responsible for dealing with the ref count
	0 -> 1 transition. That is: 1. this function must not be invoked when the
	node is busy in the first place, and 2. vnode_used() must be called for the
	node.

	\param vnode the vnode.
*/
static void
inc_vnode_ref_count(struct vnode* vnode)
{
	atomic_add(&vnode->ref_count, 1);
	TRACE(("inc_vnode_ref_count: vnode %p, ref now %" B_PRId32 "\n", vnode,
		vnode->ref_count));
}


static bool
is_special_node_type(int type)
{
	// at the moment only FIFOs are supported
	return S_ISFIFO(type);
}


static status_t
create_special_sub_node(struct vnode* vnode, uint32 flags)
{
	if (S_ISFIFO(vnode->Type()))
		return create_fifo_vnode(vnode->mount->volume, vnode);

	return B_BAD_VALUE;
}


/*!	\brief Retrieves a vnode for a given mount ID, node ID pair.

	If the node is not yet in memory, it will be loaded.

	The caller must not hold the sVnodeLock or the sMountLock.

	\param mountID the mount ID.
	\param vnodeID the node ID.
	\param _vnode Pointer to a vnode* variable into which the pointer to the
		   retrieved vnode structure shall be written.
	\param reenter \c true, if this function is called (indirectly) from within
		   a file system.
	\return \c B_OK, if everything went fine, an error code otherwise.
*/
static status_t
get_vnode(dev_t mountID, ino_t vnodeID, struct vnode** _vnode, bool canWait,
	int reenter)
{
	FUNCTION(("get_vnode: mountid %" B_PRId32 " vnid 0x%" B_PRIx64 " %p\n",
		mountID, vnodeID, _vnode));

	rw_lock_read_lock(&sVnodeLock);

	int32 tries = BUSY_VNODE_RETRIES;
restart:
	struct vnode* vnode = lookup_vnode(mountID, vnodeID);
	AutoLocker<Vnode> nodeLocker(vnode);

	if (vnode && vnode->IsBusy()) {
		// vnodes in the Removed state (except ones still Unpublished)
		// which are also Busy will disappear soon, so we do not wait for them.
		const bool doNotWait = vnode->IsRemoved() && !vnode->IsUnpublished();

		nodeLocker.Unlock();
		rw_lock_read_unlock(&sVnodeLock);
		if (!canWait) {
			dprintf("vnode %" B_PRIdDEV ":%" B_PRIdINO " is busy!\n",
				mountID, vnodeID);
			return B_BUSY;
		}
		if (doNotWait || !retry_busy_vnode(tries, mountID, vnodeID))
			return B_BUSY;

		rw_lock_read_lock(&sVnodeLock);
		goto restart;
	}

	TRACE(("get_vnode: tried to lookup vnode, got %p\n", vnode));

	status_t status;

	if (vnode) {
		if (vnode->ref_count == 0) {
			// this vnode has been unused before
			vnode_used(vnode);
		}
		inc_vnode_ref_count(vnode);

		nodeLocker.Unlock();
		rw_lock_read_unlock(&sVnodeLock);
	} else {
		// we need to create a new vnode and read it in
		rw_lock_read_unlock(&sVnodeLock);
			// unlock -- create_new_vnode_and_lock() write-locks on success
		bool nodeCreated;
		status = create_new_vnode_and_lock(mountID, vnodeID, vnode,
			nodeCreated);
		if (status != B_OK)
			return status;

		if (!nodeCreated) {
			rw_lock_read_lock(&sVnodeLock);
			rw_lock_write_unlock(&sVnodeLock);
			goto restart;
		}

		rw_lock_write_unlock(&sVnodeLock);

		int type;
		uint32 flags;
		status = FS_MOUNT_CALL(vnode->mount, get_vnode, vnodeID, vnode, &type,
			&flags, reenter);
		if (status == B_OK && vnode->private_node == NULL)
			status = B_BAD_VALUE;

		bool gotNode = status == B_OK;
		bool publishSpecialSubNode = false;
		if (gotNode) {
			vnode->SetType(type);
			publishSpecialSubNode = is_special_node_type(type)
				&& (flags & B_VNODE_DONT_CREATE_SPECIAL_SUB_NODE) == 0;
		}

		if (gotNode && publishSpecialSubNode)
			status = create_special_sub_node(vnode, flags);

		if (status != B_OK) {
			if (gotNode)
				FS_CALL(vnode, put_vnode, reenter);

			rw_lock_write_lock(&sVnodeLock);
			sVnodeTable->Remove(vnode);
			remove_vnode_from_mount_list(vnode, vnode->mount);
			rw_lock_write_unlock(&sVnodeLock);

			object_cache_free(sVnodeCache, vnode, 0);
			return status;
		}

		rw_lock_read_lock(&sVnodeLock);
		vnode->Lock();

		vnode->SetRemoved((flags & B_VNODE_PUBLISH_REMOVED) != 0);
		vnode->SetBusy(false);

		vnode->Unlock();
		rw_lock_read_unlock(&sVnodeLock);
	}

	TRACE(("get_vnode: returning %p\n", vnode));

	*_vnode = vnode;
	return B_OK;
}


/*!	\brief Decrements the reference counter of the given vnode and deletes it,
	if the counter dropped to 0.

	The caller must, of course, own a reference to the vnode to call this
	function.
	The caller must not hold the sVnodeLock or the sMountLock.

	\param vnode the vnode.
*/
static inline void
put_vnode(struct vnode* vnode)
{
	dec_vnode_ref_count(vnode, false, false);
}


static void
free_unused_vnodes(int32 level)
{
	unused_vnodes_check_started();

	if (level == B_NO_LOW_RESOURCE) {
		unused_vnodes_check_done();
		return;
	}

	flush_hot_vnodes();

	// determine how many nodes to free
	uint32 count = 1;
	{
		MutexLocker unusedVnodesLocker(sUnusedVnodesLock);

		switch (level) {
			case B_LOW_RESOURCE_NOTE:
				count = sUnusedVnodes / 100;
				break;
			case B_LOW_RESOURCE_WARNING:
				count = sUnusedVnodes / 10;
				break;
			case B_LOW_RESOURCE_CRITICAL:
				count = sUnusedVnodes;
				break;
		}

		if (count > sUnusedVnodes)
			count = sUnusedVnodes;
	}

	// Write back the modified pages of some unused vnodes and free them.

	for (uint32 i = 0; i < count; i++) {
		ReadLocker vnodesReadLocker(sVnodeLock);

		// get the first node
		MutexLocker unusedVnodesLocker(sUnusedVnodesLock);
		struct vnode* vnode = (struct vnode*)list_get_first_item(
			&sUnusedVnodeList);
		unusedVnodesLocker.Unlock();

		if (vnode == NULL)
			break;

		// lock the node
		AutoLocker<Vnode> nodeLocker(vnode);

		// Check whether the node is still unused -- since we only append to the
		// tail of the unused queue, the vnode should still be at its head.
		// Alternatively we could check its ref count for 0 and its busy flag,
		// but if the node is no longer at the head of the queue, it means it
		// has been touched in the meantime, i.e. it is no longer the least
		// recently used unused vnode, so we'd rather not free it.
		unusedVnodesLocker.Lock();
		if (vnode != list_get_first_item(&sUnusedVnodeList))
			continue;
		unusedVnodesLocker.Unlock();

		ASSERT(!vnode->IsBusy());

		// grab a reference
		inc_vnode_ref_count(vnode);
		vnode_used(vnode);

		// write back changes and free the node
		nodeLocker.Unlock();
		vnodesReadLocker.Unlock();

		if (vnode->cache != NULL)
			vnode->cache->WriteModified();

		dec_vnode_ref_count(vnode, true, false);
			// this should free the vnode when it's still unused
	}

	unused_vnodes_check_done();
}


/*!	Gets the vnode the given vnode is covering.

	The caller must have \c sVnodeLock read-locked at least.

	The function returns a reference to the retrieved vnode (if any); the
	caller is responsible for releasing it.

	\param vnode The vnode whose covered node shall be returned.
	\return The covered vnode, or \c NULL if the given vnode doesn't cover any
		vnode.
*/
static inline Vnode*
get_covered_vnode_locked(Vnode* vnode)
{
	if (Vnode* coveredNode = vnode->covers) {
		while (coveredNode->covers != NULL)
			coveredNode = coveredNode->covers;

		inc_vnode_ref_count(coveredNode);
		return coveredNode;
	}

	return NULL;
}


/*!	Gets the vnode the given vnode is covering.

	The caller must not hold \c sVnodeLock. Note that this implies a race
	condition, since the situation can change at any time.

	The function returns a reference to the retrieved vnode (if any); the
	caller is responsible for releasing it.

	\param vnode The vnode whose covered node shall be returned.
	\return The covered vnode, or \c NULL if the given vnode doesn't cover any
		vnode.
*/
static inline Vnode*
get_covered_vnode(Vnode* vnode)
{
	if (!vnode->IsCovering())
		return NULL;

	ReadLocker vnodeReadLocker(sVnodeLock);
	return get_covered_vnode_locked(vnode);
}


/*!	Gets the vnode the given vnode is covered by.

	The caller must have \c sVnodeLock read-locked at least.

	The function returns a reference to the retrieved vnode (if any); the
	caller is responsible for releasing it.

	\param vnode The vnode whose covering node shall be returned.
	\return The covering vnode, or \c NULL if the given vnode isn't covered by
		any vnode.
*/
static Vnode*
get_covering_vnode_locked(Vnode* vnode)
{
	if (Vnode* coveringNode = vnode->covered_by) {
		while (coveringNode->covered_by != NULL)
			coveringNode = coveringNode->covered_by;

		inc_vnode_ref_count(coveringNode);
		return coveringNode;
	}

	return NULL;
}


/*!	Gets the vnode the given vnode is covered by.

	The caller must not hold \c sVnodeLock. Note that this implies a race
	condition, since the situation can change at any time.

	The function returns a reference to the retrieved vnode (if any); the
	caller is responsible for releasing it.

	\param vnode The vnode whose covering node shall be returned.
	\return The covering vnode, or \c NULL if the given vnode isn't covered by
		any vnode.
*/
static inline Vnode*
get_covering_vnode(Vnode* vnode)
{
	if (!vnode->IsCovered())
		return NULL;

	ReadLocker vnodeReadLocker(sVnodeLock);
	return get_covering_vnode_locked(vnode);
}


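/*!	Illustrative sketch (not compiled): with file system B mounted on a
	directory of A, and C mounted on top of B again, the chains walked by the
	functions above look like this:

		A->covered_by == B, B->covered_by == C, C->covered_by == NULL
		C->covers == B, B->covers == A, A->covers == NULL

	get_covering_vnode(A) walks up to C; get_covered_vnode(C) walks down to
	A. Both return a reference that the caller must release with put_vnode().
*/

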
static void
free_unused_vnodes()
{
	free_unused_vnodes(
		low_resource_state(B_KERNEL_RESOURCE_PAGES | B_KERNEL_RESOURCE_MEMORY
			| B_KERNEL_RESOURCE_ADDRESS_SPACE));
}


static void
vnode_low_resource_handler(void* /*data*/, uint32 resources, int32 level)
{
	TRACE(("vnode_low_resource_handler(level = %" B_PRId32 ")\n", level));

	free_unused_vnodes(level);
}


static inline void
put_advisory_locking(struct advisory_locking* locking)
{
	release_sem(locking->lock);
}


/*!	Returns the advisory_locking object of the \a vnode in case it
	has one, and locks it.
	You have to call put_advisory_locking() when you're done with it.
	Note: you must not hold the vnode's lock when calling this function.
*/
static struct advisory_locking*
get_advisory_locking(struct vnode* vnode)
{
	rw_lock_read_lock(&sVnodeLock);
	vnode->Lock();

	struct advisory_locking* locking = vnode->advisory_locking;
	sem_id lock = locking != NULL ? locking->lock : B_ERROR;

	vnode->Unlock();
	rw_lock_read_unlock(&sVnodeLock);

	if (lock >= 0)
		lock = acquire_sem(lock);
	if (lock < 0) {
		// This means the locking has been deleted in the meantime
		// or had never existed in the first place - otherwise, we
		// would get the lock at some point.
		return NULL;
	}

	return locking;
}


/*!	Creates a locked advisory_locking object, and attaches it to the
	given \a vnode.
	Returns B_OK in case of success - also if the vnode got such an
	object from someone else in the meantime; you'll still get this
	one locked then.
*/
static status_t
create_advisory_locking(struct vnode* vnode)
{
	if (vnode == NULL)
		return B_FILE_ERROR;

	ObjectDeleter<advisory_locking> lockingDeleter;
	struct advisory_locking* locking = NULL;

	while (get_advisory_locking(vnode) == NULL) {
		// no locking object set on the vnode yet, create one
		if (locking == NULL) {
			locking = new(std::nothrow) advisory_locking;
			if (locking == NULL)
				return B_NO_MEMORY;
			lockingDeleter.SetTo(locking);

			locking->wait_sem = create_sem(0, "advisory lock");
			if (locking->wait_sem < 0)
				return locking->wait_sem;

			locking->lock = create_sem(0, "advisory locking");
			if (locking->lock < 0)
				return locking->lock;
		}

		// set our newly created locking object
		ReadLocker _(sVnodeLock);
		AutoLocker<Vnode> nodeLocker(vnode);
		if (vnode->advisory_locking == NULL) {
			vnode->advisory_locking = locking;
			lockingDeleter.Detach();
			return B_OK;
		}
	}

	// The vnode already had a locking object. That's just as well.

	return B_OK;
}


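/*!	Illustrative sketch (not compiled): create_advisory_locking() implements
	a create-then-publish pattern, so callers can simply rely on the locking
	object existing, and being locked, afterwards (cf.
	acquire_advisory_lock()):

		status_t status = create_advisory_locking(vnode);
		if (status != B_OK)
			return status;

		struct advisory_locking* locking = vnode->advisory_locking;
		// ... inspect or modify locking->locks ...
		put_advisory_locking(locking);
*/

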
/*! Returns \c true when either \a flock is \c NULL or \a flock intersects
	with the advisory_lock \a lock.
*/
static bool
advisory_lock_intersects(struct advisory_lock* lock, struct flock* flock)
{
	if (flock == NULL)
		return true;

	return lock->start <= flock->l_start - 1 + flock->l_len
		&& lock->end >= flock->l_start;
}


/*!	Tests whether acquiring a lock would block.
*/
static status_t
test_advisory_lock(struct vnode* vnode, struct flock* flock)
{
	// save the requested lock type -- l_type is reused for the result below
	const int32 type = flock->l_type;
	flock->l_type = F_UNLCK;

	struct advisory_locking* locking = get_advisory_locking(vnode);
	if (locking == NULL)
		return B_OK;

	team_id team = team_get_current_team_id();

	LockList::Iterator iterator = locking->locks.GetIterator();
	while (iterator.HasNext()) {
		struct advisory_lock* lock = iterator.Next();

		if (lock->team != team && advisory_lock_intersects(lock, flock)) {
			// locks do overlap
			if (type != F_RDLCK || !lock->shared) {
				// collision
				flock->l_type = lock->shared ? F_RDLCK : F_WRLCK;
				flock->l_whence = SEEK_SET;
				flock->l_start = lock->start;
				flock->l_len = lock->end - lock->start + 1;
				flock->l_pid = lock->team;
				break;
			}
		}
	}

	put_advisory_locking(locking);
	return B_OK;
}


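/*!	Illustrative sketch (not compiled): test_advisory_lock() backs
	fcntl(F_GETLK). The caller describes the region to probe; on return
	l_type is F_UNLCK if the region could be locked, otherwise the fields
	describe a conflicting lock:

		struct flock flock = {};
		flock.l_type = F_WRLCK;
		flock.l_whence = SEEK_SET;
		flock.l_start = 0;
		flock.l_len = 100;
		// normalize_flock(descriptor, &flock);
		// test_advisory_lock(vnode, &flock);
		// -> flock.l_type == F_UNLCK, or details of the colliding lock
*/

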
/*!	Removes the specified lock, or all locks of the calling team
	if \a flock is NULL.
*/
static status_t
release_advisory_lock(struct vnode* vnode, struct io_context* context,
	struct file_descriptor* descriptor, struct flock* flock)
{
	FUNCTION(("release_advisory_lock(vnode = %p, flock = %p)\n", vnode, flock));

	struct advisory_locking* locking = get_advisory_locking(vnode);
	if (locking == NULL)
		return B_OK;

	// find matching lock entries

	LockList::Iterator iterator = locking->locks.GetIterator();
	while (iterator.HasNext()) {
		struct advisory_lock* lock = iterator.Next();
		bool removeLock = false;

		if (descriptor != NULL && lock->bound_to == descriptor) {
			// Remove flock() locks
			removeLock = true;
		} else if (lock->bound_to == context
				&& advisory_lock_intersects(lock, flock)) {
			// Remove POSIX locks
			bool endsBeyond = false;
			bool startsBefore = false;
			if (flock != NULL) {
				startsBefore = lock->start < flock->l_start;
				endsBeyond = lock->end > flock->l_start - 1 + flock->l_len;
			}

			if (!startsBefore && !endsBeyond) {
				// lock is completely contained in flock
				removeLock = true;
			} else if (startsBefore && !endsBeyond) {
				// cut the end of the lock
				lock->end = flock->l_start - 1;
			} else if (!startsBefore && endsBeyond) {
				// cut the start of the lock
				lock->start = flock->l_start + flock->l_len;
			} else {
				// divide the lock into two locks
				struct advisory_lock* secondLock
					= new(std::nothrow) advisory_lock;
				if (secondLock == NULL) {
					// TODO: we should probably revert the locks we already
					// changed... (ie. allocate upfront)
					put_advisory_locking(locking);
					return B_NO_MEMORY;
				}

				secondLock->bound_to = context;
				secondLock->team = lock->team;
				secondLock->session = lock->session;
				// values must already be normalized when getting here
				secondLock->start = flock->l_start + flock->l_len;
				secondLock->end = lock->end;
					// i.e. the original end; the first lock is cut below
				secondLock->shared = lock->shared;

				lock->end = flock->l_start - 1;

				locking->locks.Add(secondLock);
			}
		}

		if (removeLock) {
			// this lock is no longer used
			iterator.Remove();
			delete lock;
		}
	}

	bool removeLocking = locking->locks.IsEmpty();
	release_sem_etc(locking->wait_sem, 1, B_RELEASE_ALL);

	put_advisory_locking(locking);

	if (removeLocking) {
		// We can remove the whole advisory locking structure; it's no
		// longer used
		locking = get_advisory_locking(vnode);
		if (locking != NULL) {
			ReadLocker locker(sVnodeLock);
			AutoLocker<Vnode> nodeLocker(vnode);

			// the locking could have been changed in the meantime
			if (locking->locks.IsEmpty()) {
				vnode->advisory_locking = NULL;
				nodeLocker.Unlock();
				locker.Unlock();

				// we've detached the locking from the vnode, so we can
				// safely delete it
				delete locking;
			} else {
				// the locking is in use again
				nodeLocker.Unlock();
				locker.Unlock();
				release_sem_etc(locking->lock, 1, B_DO_NOT_RESCHEDULE);
			}
		}
	}

	return B_OK;
}


/*!	Acquires an advisory lock for the \a vnode. If \a wait is \c true, it
	will wait for the lock to become available if there are any collisions
	(if \a wait is \c false, it returns \c B_WOULD_BLOCK for flock() style
	locks and \c B_PERMISSION_DENIED for POSIX locks in that case).

	If \a descriptor is NULL, POSIX semantics are used for this lock.
	Otherwise, BSD flock() semantics are used, that is, all children can
	unlock the file in question (we even allow parents to remove the lock,
	which seems to be in line with what the BSDs do).
*/
static status_t
acquire_advisory_lock(struct vnode* vnode, io_context* context,
	struct file_descriptor* descriptor, struct flock* flock, bool wait)
{
	FUNCTION(("acquire_advisory_lock(vnode = %p, flock = %p, wait = %s)\n",
		vnode, flock, wait ? "yes" : "no"));

	bool shared = flock->l_type == F_RDLCK;
	void* boundTo = descriptor != NULL ? (void*)descriptor : (void*)context;
	status_t status = B_OK;

	// TODO: do deadlock detection!

	struct advisory_locking* locking;

	while (true) {
		// if this vnode has an advisory_locking structure attached,
		// lock that one and search for any colliding file lock
		status = create_advisory_locking(vnode);
		if (status != B_OK)
			return status;

		locking = vnode->advisory_locking;
		team_id team = team_get_current_team_id();
		sem_id waitForLock = -1;

		// test for collisions
		LockList::Iterator iterator = locking->locks.GetIterator();
		while (iterator.HasNext()) {
			struct advisory_lock* lock = iterator.Next();

			// TODO: locks from the same team might be joinable!
			if ((lock->team != team || lock->bound_to != boundTo)
					&& advisory_lock_intersects(lock, flock)) {
				// locks do overlap
				if (!shared || !lock->shared) {
					// we need to wait
					waitForLock = locking->wait_sem;
					break;
				}
			}
		}

		if (waitForLock < 0)
			break;

		// We need to wait. Do that or fail now, if we've been asked not to.

		if (!wait) {
			put_advisory_locking(locking);
			return descriptor != NULL ? B_WOULD_BLOCK : B_PERMISSION_DENIED;
		}

		status = switch_sem_etc(locking->lock, waitForLock, 1,
			B_CAN_INTERRUPT, 0);
		if (status != B_OK && status != B_BAD_SEM_ID)
			return status;

		// We have been notified, but we need to re-lock the locking object. So
		// go another round...
	}

	// install new lock

	struct advisory_lock* lock = new(std::nothrow) advisory_lock;
	if (lock == NULL) {
		put_advisory_locking(locking);
		return B_NO_MEMORY;
	}

	lock->bound_to = boundTo;
	lock->team = team_get_current_team_id();
	lock->session = thread_get_current_thread()->team->session_id;
	// values must already be normalized when getting here
	lock->start = flock->l_start;
	lock->end = flock->l_start - 1 + flock->l_len;
	lock->shared = shared;

	locking->locks.Add(lock);
	put_advisory_locking(locking);

	return status;
}


/*!	Normalizes the \a flock structure to make it easier to compare the
	structure with others. The l_start and l_len fields are set to absolute
	values according to the l_whence field.
*/
static status_t
normalize_flock(struct file_descriptor* descriptor, struct flock* flock)
{
	switch (flock->l_whence) {
		case SEEK_SET:
			break;
		case SEEK_CUR:
			flock->l_start += descriptor->pos;
			break;
		case SEEK_END:
		{
			struct vnode* vnode = descriptor->u.vnode;
			struct stat stat;
			status_t status;

			if (!HAS_FS_CALL(vnode, read_stat))
				return B_UNSUPPORTED;

			status = FS_CALL(vnode, read_stat, &stat);
			if (status != B_OK)
				return status;

			flock->l_start += stat.st_size;
			break;
		}
		default:
			return B_BAD_VALUE;
	}

	if (flock->l_start < 0)
		flock->l_start = 0;
	if (flock->l_len == 0)
		flock->l_len = OFF_MAX;

	// don't let the offset and length overflow
	if (flock->l_start > 0 && OFF_MAX - flock->l_start < flock->l_len)
		flock->l_len = OFF_MAX - flock->l_start;

	if (flock->l_len < 0) {
		// a negative length reverses the region
		flock->l_start += flock->l_len;
		flock->l_len = -flock->l_len;
	}

	return B_OK;
}


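/*!	Illustrative sketch (not compiled): normalize_flock() at work. With
	descriptor->pos == 1000, a region given as

		l_whence == SEEK_CUR, l_start == 100, l_len == -10

	first becomes l_start == 1100 (pos added), and the negative length then
	reverses the region, yielding the absolute

		l_start == 1090, l_len == 10

	A zero l_len is expanded to OFF_MAX, i.e. "to the end of the file".
*/

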
static void
replace_vnode_if_disconnected(struct fs_mount* mount,
	struct vnode* vnodeToDisconnect, struct vnode*& vnode,
	struct vnode* fallBack, bool lockRootLock)
{
	struct vnode* givenVnode = vnode;
	bool vnodeReplaced = false;

	ReadLocker vnodeReadLocker(sVnodeLock);

	if (lockRootLock)
		mutex_lock(&sIOContextRootLock);

	while (vnode != NULL && vnode->mount == mount
		&& (vnodeToDisconnect == NULL || vnodeToDisconnect == vnode)) {
		if (vnode->covers != NULL) {
			// redirect the vnode to the covered vnode
			vnode = vnode->covers;
		} else
			vnode = fallBack;

		vnodeReplaced = true;
	}

	// If we've replaced the node, grab a reference for the new one.
	if (vnodeReplaced && vnode != NULL)
		inc_vnode_ref_count(vnode);

	if (lockRootLock)
		mutex_unlock(&sIOContextRootLock);

	vnodeReadLocker.Unlock();

	if (vnodeReplaced)
		put_vnode(givenVnode);
}

1899
1900/*!	Disconnects all file descriptors that are associated with the
1901	\a vnodeToDisconnect, or if this is NULL, all vnodes of the specified
1902	\a mount object.
1903
1904	Note, after you've called this function, there might still be ongoing
1905	accesses - they won't be interrupted if they already happened before.
1906	However, any subsequent access will fail.
1907
1908	This is not a cheap function and should be used with care and rarely.
1909	TODO: there is currently no means to stop a blocking read/write!
1910*/
1911static void
1912disconnect_mount_or_vnode_fds(struct fs_mount* mount,
1913	struct vnode* vnodeToDisconnect)
1914{
1915	// iterate over all teams and peek into their file descriptors
1916	TeamListIterator teamIterator;
1917	while (Team* team = teamIterator.Next()) {
1918		BReference<Team> teamReference(team, true);
1919		TeamLocker teamLocker(team);
1920
1921		// lock the I/O context
1922		io_context* context = team->io_context;
1923		if (context == NULL)
1924			continue;
1925		MutexLocker contextLocker(context->io_mutex);
1926
1927		teamLocker.Unlock();
1928
1929		replace_vnode_if_disconnected(mount, vnodeToDisconnect, context->root,
1930			sRoot, true);
1931		replace_vnode_if_disconnected(mount, vnodeToDisconnect, context->cwd,
1932			sRoot, false);
1933
1934		for (uint32 i = 0; i < context->table_size; i++) {
1935			struct file_descriptor* descriptor = context->fds[i];
1936			if (descriptor == NULL || (descriptor->open_mode & O_DISCONNECTED) != 0)
1937				continue;
1938
1939			inc_fd_ref_count(descriptor);
1940
1941			// if this descriptor points at this mount, we
1942			// need to disconnect it to be able to unmount
1943			struct vnode* vnode = fd_vnode(descriptor);
1944			if (vnodeToDisconnect != NULL) {
1945				if (vnode == vnodeToDisconnect)
1946					disconnect_fd(descriptor);
1947			} else if ((vnode != NULL && vnode->mount == mount)
1948				|| (vnode == NULL && descriptor->u.mount == mount))
1949				disconnect_fd(descriptor);
1950
1951			put_fd(descriptor);
1952		}
1953	}
1954}
1955
1956
1957/*!	\brief Gets the root node of the current IO context.
1958	If \a kernel is \c true, the kernel IO context will be used.
1959	The caller obtains a reference to the returned node.
1960*/
1961struct vnode*
1962get_root_vnode(bool kernel)
1963{
1964	if (!kernel) {
		// Get the root directory from the current team's IO context
1966		struct io_context* context = get_current_io_context(kernel);
1967
1968		mutex_lock(&sIOContextRootLock);
1969
1970		struct vnode* root = context->root;
1971		if (root != NULL)
1972			inc_vnode_ref_count(root);
1973
1974		mutex_unlock(&sIOContextRootLock);
1975
1976		if (root != NULL)
1977			return root;
1978
1979		// That should never happen.
1980		dprintf("get_root_vnode(): IO context for team %" B_PRId32 " doesn't "
1981			"have a root\n", team_get_current_team_id());
1982	}
1983
1984	inc_vnode_ref_count(sRoot);
1985	return sRoot;
1986}
1987
1988
1989/*!	\brief Gets the directory path and leaf name for a given path.
1990
1991	The supplied \a path is transformed to refer to the directory part of
1992	the entry identified by the original path, and into the buffer \a filename
1993	the leaf name of the original entry is written.
1994	Neither the returned path nor the leaf name can be expected to be
1995	canonical.
1996
1997	\param path The path to be analyzed. Must be able to store at least one
1998		   additional character.
1999	\param filename The buffer into which the leaf name will be written.
2000		   Must be of size B_FILE_NAME_LENGTH at least.
	\return \c B_OK, if everything went fine, \c B_NAME_TOO_LONG, if the leaf
		   name does not fit into a buffer of size \c B_FILE_NAME_LENGTH,
		   or \c B_ENTRY_NOT_FOUND, if the given path name is empty.
2004*/
2005static status_t
2006get_dir_path_and_leaf(char* path, char* filename)
2007{
2008	if (*path == '\0')
2009		return B_ENTRY_NOT_FOUND;
2010
2011	char* last = strrchr(path, '/');
		// '/' is not allowed in file names!
2013
2014	FUNCTION(("get_dir_path_and_leaf(path = %s)\n", path));
2015
2016	if (last == NULL) {
		// this path is a single segment with no '/' in it
2018		// ex. "foo"
2019		if (strlcpy(filename, path, B_FILE_NAME_LENGTH) >= B_FILE_NAME_LENGTH)
2020			return B_NAME_TOO_LONG;
2021
2022		strcpy(path, ".");
2023	} else {
2024		last++;
2025		if (last[0] == '\0') {
2026			// special case: the path ends in one or more '/' - remove them
2027			while (*--last == '/' && last != path);
2028			last[1] = '\0';
2029
2030			if (last == path && last[0] == '/') {
2031				// This path points to the root of the file system
2032				strcpy(filename, ".");
2033				return B_OK;
2034			}
2035			for (; last != path && *(last - 1) != '/'; last--);
2036				// rewind to the start of the leaf before the '/'
2037		}
2038
2039		// normal leaf: replace the leaf portion of the path with a '.'
2040		if (strlcpy(filename, last, B_FILE_NAME_LENGTH) >= B_FILE_NAME_LENGTH)
2041			return B_NAME_TOO_LONG;
2042
2043		last[0] = '.';
2044		last[1] = '\0';
2045	}
2046	return B_OK;
2047}
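

// A few illustrative transformations performed by get_dir_path_and_leaf()
// (all paths hypothetical): "a/b/c" becomes path "a/b/." with filename "c";
// "foo" becomes path "." with filename "foo"; "a/b///" becomes path "a/."
// with filename "b"; and "/" keeps the path as the root while the filename
// is set to ".".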
2048
2049
2050static status_t
2051entry_ref_to_vnode(dev_t mountID, ino_t directoryID, const char* name,
2052	bool traverse, bool kernel, VnodePutter& _vnode)
2053{
2054	char clonedName[B_FILE_NAME_LENGTH + 1];
2055	if (strlcpy(clonedName, name, B_FILE_NAME_LENGTH) >= B_FILE_NAME_LENGTH)
2056		return B_NAME_TOO_LONG;
2057
2058	// get the directory vnode and let vnode_path_to_vnode() do the rest
2059	struct vnode* directory;
2060
2061	status_t status = get_vnode(mountID, directoryID, &directory, true, false);
2062	if (status < 0)
2063		return status;
2064
2065	return vnode_path_to_vnode(directory, clonedName, traverse, kernel,
2066		_vnode, NULL);
2067}
2068
2069
2070/*!	Looks up the entry with name \a name in the directory represented by \a dir
2071	and returns the respective vnode.
2072	On success a reference to the vnode is acquired for the caller.
2073*/
2074static status_t
2075lookup_dir_entry(struct vnode* dir, const char* name, struct vnode** _vnode)
2076{
2077	ino_t id;
2078	bool missing;
2079
2080	if (dir->mount->entry_cache.Lookup(dir->id, name, id, missing)) {
2081		return missing ? B_ENTRY_NOT_FOUND
2082			: get_vnode(dir->device, id, _vnode, true, false);
2083	}
2084
2085	status_t status = FS_CALL(dir, lookup, name, &id);
2086	if (status != B_OK)
2087		return status;
2088
	// The lookup() hook calls get_vnode() or publish_vnode(), so we already
	// have a reference and just need to look the node up.
2091	rw_lock_read_lock(&sVnodeLock);
2092	*_vnode = lookup_vnode(dir->device, id);
2093	rw_lock_read_unlock(&sVnodeLock);
2094
2095	if (*_vnode == NULL) {
2096		panic("lookup_dir_entry(): could not lookup vnode (mountid 0x%" B_PRIx32
2097			" vnid 0x%" B_PRIx64 ")\n", dir->device, id);
2098		return B_ENTRY_NOT_FOUND;
2099	}
2100
2101//	ktrace_printf("lookup_dir_entry(): dir: %p (%ld, %lld), name: \"%s\" -> "
2102//		"%p (%ld, %lld)", dir, dir->mount->id, dir->id, name, *_vnode,
2103//		(*_vnode)->mount->id, (*_vnode)->id);
2104
2105	return B_OK;
2106}
2107
2108
2109/*!	Returns the vnode for the relative \a path starting at the specified \a vnode.
2110
	\param[in,out] path The relative path being searched. Must not be NULL.
	If the function returns successfully, \a path contains the name of the
	last path component. This function clobbers the buffer pointed to by
	\a path only if it contains more than one component.

	Note that this function reduces the ref_count of the starting \a vnode,
	no matter whether it is successful or not!

	\param[out] _vnode If the function returns \c B_OK: points to the found
		node.
	\param[out] _vnode If the function fails and \a leafName is not \c NULL:
		set to the last existing directory in the path. The caller is
		responsible for releasing it using put_vnode().
	\param[out] _vnode If the function fails and \a leafName is \c NULL: not
		used.
2127*/
2128static status_t
2129vnode_path_to_vnode(struct vnode* start, char* path, bool traverseLeafLink,
2130	int count, struct io_context* ioContext, VnodePutter& _vnode,
2131	ino_t* _parentID, char* leafName)
2132{
2133	FUNCTION(("vnode_path_to_vnode(vnode = %p, path = %s)\n", vnode, path));
2134	ASSERT(!_vnode.IsSet());
2135
2136	VnodePutter vnode(start);
2137
2138	if (path == NULL)
2139		return B_BAD_VALUE;
2140	if (*path == '\0')
2141		return B_ENTRY_NOT_FOUND;
2142
2143	status_t status = B_OK;
2144	ino_t lastParentID = vnode->id;
2145	while (true) {
2146		char* nextPath;
2147
2148		TRACE(("vnode_path_to_vnode: top of loop. p = %p, p = '%s'\n", path,
2149			path));
2150
2151		// done?
2152		if (path[0] == '\0')
2153			break;
2154
2155		// walk to find the next path component ("path" will point to a single
2156		// path component), and filter out multiple slashes
2157		for (nextPath = path + 1; *nextPath != '\0' && *nextPath != '/';
2158				nextPath++);
2159
2160		bool directoryFound = false;
2161		if (*nextPath == '/') {
2162			directoryFound = true;
2163			*nextPath = '\0';
2164			do
2165				nextPath++;
2166			while (*nextPath == '/');
2167		}
2168
		// If the '..' is at a covering vnode, move to the covered vnode so
		// we pass the '..' path to the underlying file system.
		// Also prevent escaping the root of the IO context.
2172		if (strcmp("..", path) == 0) {
2173			if (vnode.Get() == ioContext->root) {
2174				// Attempted prison break! Keep it contained.
2175				path = nextPath;
2176				continue;
2177			}
2178
2179			if (Vnode* coveredVnode = get_covered_vnode(vnode.Get()))
2180				vnode.SetTo(coveredVnode);
2181		}
2182
2183		// check if vnode is really a directory
2184		if (status == B_OK && !S_ISDIR(vnode->Type()))
2185			status = B_NOT_A_DIRECTORY;
2186
2187		// Check if we have the right to search the current directory vnode.
2188		// If a file system doesn't have the access() function, we assume that
2189		// searching a directory is always allowed
2190		if (status == B_OK && HAS_FS_CALL(vnode, access))
2191			status = FS_CALL(vnode.Get(), access, X_OK);
2192
2193		// Tell the filesystem to get the vnode of this path component (if we
2194		// got the permission from the call above)
2195		VnodePutter nextVnode;
2196		if (status == B_OK) {
2197			struct vnode* temp = NULL;
2198			status = lookup_dir_entry(vnode.Get(), path, &temp);
2199			nextVnode.SetTo(temp);
2200		}
2201
2202		if (status != B_OK) {
2203			if (leafName != NULL) {
2204				strlcpy(leafName, path, B_FILE_NAME_LENGTH);
2205				_vnode.SetTo(vnode.Detach());
2206			}
2207			return status;
2208		}
2209
2210		// If the new node is a symbolic link, resolve it (if we've been told
2211		// to do it)
2212		if (S_ISLNK(nextVnode->Type())
2213			&& (traverseLeafLink || directoryFound)) {
2214			size_t bufferSize;
2215			char* buffer;
2216
2217			TRACE(("traverse link\n"));
2218
2219			if (count + 1 > B_MAX_SYMLINKS)
2220				return B_LINK_LIMIT;
2221
2222			bufferSize = B_PATH_NAME_LENGTH;
2223			buffer = (char*)object_cache_alloc(sPathNameCache, 0);
2224			if (buffer == NULL)
2225				return B_NO_MEMORY;
2226
2227			if (HAS_FS_CALL(nextVnode, read_symlink)) {
2228				bufferSize--;
2229				status = FS_CALL(nextVnode.Get(), read_symlink, buffer, &bufferSize);
2230				// null-terminate
2231				if (status >= 0 && bufferSize < B_PATH_NAME_LENGTH)
2232					buffer[bufferSize] = '\0';
2233			} else
2234				status = B_BAD_VALUE;
2235
2236			if (status != B_OK) {
				object_cache_free(sPathNameCache, buffer, 0);
2238				return status;
2239			}
2240			nextVnode.Unset();
2241
2242			// Check if we start from the root directory or the current
2243			// directory ("vnode" still points to that one).
2244			// Cut off all leading slashes if it's the root directory
2245			path = buffer;
2246			bool absoluteSymlink = false;
2247			if (path[0] == '/') {
2248				// we don't need the old directory anymore
2249				vnode.Unset();
2250
2251				while (*++path == '/')
2252					;
2253
2254				mutex_lock(&sIOContextRootLock);
2255				vnode.SetTo(ioContext->root);
2256				inc_vnode_ref_count(vnode.Get());
2257				mutex_unlock(&sIOContextRootLock);
2258
2259				absoluteSymlink = true;
2260			}
2261
2262			inc_vnode_ref_count(vnode.Get());
2263				// balance the next recursion - we will decrement the
2264				// ref_count of the vnode, no matter if we succeeded or not
2265
2266			if (absoluteSymlink && *path == '\0') {
2267				// symlink was just "/"
2268				nextVnode.SetTo(vnode.Get());
2269			} else {
2270				status = vnode_path_to_vnode(vnode.Get(), path, true, count + 1,
2271					ioContext, nextVnode, &lastParentID, leafName);
2272			}
2273
2274			object_cache_free(sPathNameCache, buffer, 0);
2275
2276			if (status != B_OK) {
2277				if (leafName != NULL)
2278					_vnode.SetTo(nextVnode.Detach());
2279				return status;
2280			}
2281		} else
2282			lastParentID = vnode->id;
2283
2284		// decrease the ref count on the old dir we just looked up into
2285		vnode.Unset();
2286
2287		path = nextPath;
2288		vnode.SetTo(nextVnode.Detach());
2289
2290		// see if we hit a covered node
2291		if (Vnode* coveringNode = get_covering_vnode(vnode.Get()))
2292			vnode.SetTo(coveringNode);
2293	}
2294
2295	_vnode.SetTo(vnode.Detach());
2296	if (_parentID)
2297		*_parentID = lastParentID;
2298
2299	return B_OK;
2300}
2301
2302
2303static status_t
2304vnode_path_to_vnode(struct vnode* vnode, char* path, bool traverseLeafLink,
2305	bool kernel, VnodePutter& _vnode, ino_t* _parentID, char* leafName)
2306{
2307	return vnode_path_to_vnode(vnode, path, traverseLeafLink, 0,
2308		get_current_io_context(kernel), _vnode, _parentID, leafName);
2309}
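

// Hedged usage sketch (not one of the real call sites; 'dir' and
// 'pathBuffer' are hypothetical): the function consumes one reference to
// the starting vnode whether it succeeds or not, so a caller that wants to
// keep its own reference must acquire an extra one first:
//
//	VnodePutter resolved;
//	inc_vnode_ref_count(dir);
//		// consumed by vnode_path_to_vnode(), success or not
//	status_t error = vnode_path_to_vnode(dir, pathBuffer, true, false,
//		resolved, NULL);
//	if (error == B_OK)
//		; // use resolved.Get() -- released when 'resolved' goes out of scope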
2310
2311
2312static status_t
2313path_to_vnode(char* path, bool traverseLink, VnodePutter& _vnode,
2314	ino_t* _parentID, bool kernel)
2315{
2316	struct vnode* start = NULL;
2317
2318	FUNCTION(("path_to_vnode(path = \"%s\")\n", path));
2319
2320	if (!path)
2321		return B_BAD_VALUE;
2322
2323	if (*path == '\0')
2324		return B_ENTRY_NOT_FOUND;
2325
2326	// figure out if we need to start at root or at cwd
2327	if (*path == '/') {
2328		if (sRoot == NULL) {
2329			// we're a bit early, aren't we?
2330			return B_ERROR;
2331		}
2332
2333		while (*++path == '/')
2334			;
2335		start = get_root_vnode(kernel);
2336
2337		if (*path == '\0') {
2338			_vnode.SetTo(start);
2339			return B_OK;
2340		}
2342	} else {
2343		struct io_context* context = get_current_io_context(kernel);
2344
2345		mutex_lock(&context->io_mutex);
2346		start = context->cwd;
2347		if (start != NULL)
2348			inc_vnode_ref_count(start);
2349		mutex_unlock(&context->io_mutex);
2350
2351		if (start == NULL)
2352			return B_ERROR;
2353	}
2354
2355	return vnode_path_to_vnode(start, path, traverseLink, kernel, _vnode,
2356		_parentID);
2357}
2358
2359
/*! Returns the vnode for the next to last segment of the path, and returns
	the last portion (the leaf name) in \a filename.
	The path buffer must be able to store at least one additional character.
2363*/
2364static status_t
2365path_to_dir_vnode(char* path, VnodePutter& _vnode, char* filename,
2366	bool kernel)
2367{
2368	status_t status = get_dir_path_and_leaf(path, filename);
2369	if (status != B_OK)
2370		return status;
2371
2372	return path_to_vnode(path, true, _vnode, NULL, kernel);
2373}
2374
2375
2376/*!	\brief Retrieves the directory vnode and the leaf name of an entry referred
2377		   to by a FD + path pair.
2378
2379	\a path must be given in either case. \a fd might be omitted, in which
2380	case \a path is either an absolute path or one relative to the current
	directory. If both are supplied and \a path is relative, it is reckoned off
2382	of the directory referred to by \a fd. If \a path is absolute \a fd is
2383	ignored.
2384
2385	The caller has the responsibility to call put_vnode() on the returned
2386	directory vnode.
2387
2388	\param fd The FD. May be < 0.
2389	\param path The absolute or relative path. Must not be \c NULL. The buffer
2390	       is modified by this function. It must have at least room for a
2391	       string one character longer than the path it contains.
2392	\param _vnode A pointer to a variable the directory vnode shall be written
2393		   into.
2394	\param filename A buffer of size B_FILE_NAME_LENGTH or larger into which
2395		   the leaf name of the specified entry will be written.
2396	\param kernel \c true, if invoked from inside the kernel, \c false if
2397		   invoked from userland.
2398	\return \c B_OK, if everything went fine, another error code otherwise.
2399*/
2400static status_t
2401fd_and_path_to_dir_vnode(int fd, char* path, VnodePutter& _vnode,
2402	char* filename, bool kernel)
2403{
2404	if (!path)
2405		return B_BAD_VALUE;
2406	if (*path == '\0')
2407		return B_ENTRY_NOT_FOUND;
2408	if (fd < 0)
2409		return path_to_dir_vnode(path, _vnode, filename, kernel);
2410
2411	status_t status = get_dir_path_and_leaf(path, filename);
2412	if (status != B_OK)
2413		return status;
2414
2415	return fd_and_path_to_vnode(fd, path, true, _vnode, NULL, kernel);
2416}
2417
2418
2419/*!	\brief Retrieves the directory vnode and the leaf name of an entry referred
2420		   to by a vnode + path pair.
2421
2422	\a path must be given in either case. \a vnode might be omitted, in which
2423	case \a path is either an absolute path or one relative to the current
	directory. If both are supplied and \a path is relative, it is reckoned off
2425	of the directory referred to by \a vnode. If \a path is absolute \a vnode is
2426	ignored.
2427
2428	The caller has the responsibility to call put_vnode() on the returned
2429	directory vnode.
2430
2431	Note, this reduces the ref_count of the starting \a vnode, no matter if
2432	it is successful or not.
2433
2434	\param vnode The vnode. May be \c NULL.
2435	\param path The absolute or relative path. Must not be \c NULL. The buffer
2436	       is modified by this function. It must have at least room for a
2437	       string one character longer than the path it contains.
2438	\param _vnode A pointer to a variable the directory vnode shall be written
2439		   into.
2440	\param filename A buffer of size B_FILE_NAME_LENGTH or larger into which
2441		   the leaf name of the specified entry will be written.
2442	\param kernel \c true, if invoked from inside the kernel, \c false if
2443		   invoked from userland.
2444	\return \c B_OK, if everything went fine, another error code otherwise.
2445*/
2446static status_t
2447vnode_and_path_to_dir_vnode(struct vnode* vnode, char* path,
2448	VnodePutter& _vnode, char* filename, bool kernel)
2449{
2450	VnodePutter vnodePutter(vnode);
2451
2452	if (!path)
2453		return B_BAD_VALUE;
2454	if (*path == '\0')
2455		return B_ENTRY_NOT_FOUND;
2456	if (vnode == NULL || path[0] == '/')
2457		return path_to_dir_vnode(path, _vnode, filename, kernel);
2458
2459	status_t status = get_dir_path_and_leaf(path, filename);
2460	if (status != B_OK)
2461		return status;
2462
2463	vnodePutter.Detach();
2464	return vnode_path_to_vnode(vnode, path, true, kernel, _vnode, NULL);
2465}
2466
2467
2468/*! Returns a vnode's name in the d_name field of a supplied dirent buffer.
2469*/
2470static status_t
2471get_vnode_name(struct vnode* vnode, struct vnode* parent, struct dirent* buffer,
2472	size_t bufferSize, struct io_context* ioContext)
2473{
2474	if (bufferSize < sizeof(struct dirent))
2475		return B_BAD_VALUE;
2476
2477	// See if the vnode is covering another vnode and move to the covered
2478	// vnode so we get the underlying file system
2479	VnodePutter vnodePutter;
2480	if (Vnode* coveredVnode = get_covered_vnode(vnode)) {
2481		vnode = coveredVnode;
2482		vnodePutter.SetTo(vnode);
2483	}
2484
2485	if (HAS_FS_CALL(vnode, get_vnode_name)) {
2486		// The FS supports getting the name of a vnode.
2487		if (FS_CALL(vnode, get_vnode_name, buffer->d_name,
2488			(char*)buffer + bufferSize - buffer->d_name) == B_OK)
2489			return B_OK;
2490	}
2491
2492	// The FS doesn't support getting the name of a vnode. So we search the
2493	// parent directory for the vnode, if the caller let us.
2494
2495	if (parent == NULL || !HAS_FS_CALL(parent, read_dir))
2496		return B_UNSUPPORTED;
2497
2498	void* cookie;
2499
2500	status_t status = FS_CALL(parent, open_dir, &cookie);
2501	if (status >= B_OK) {
2502		while (true) {
2503			uint32 num = 1;
2504			// We use the FS hook directly instead of dir_read(), since we don't
2505			// want the entries to be fixed. We have already resolved vnode to
2506			// the covered node.
2507			status = FS_CALL(parent, read_dir, cookie, buffer, bufferSize,
2508				&num);
2509			if (status != B_OK)
2510				break;
2511			if (num == 0) {
2512				status = B_ENTRY_NOT_FOUND;
2513				break;
2514			}
2515
2516			if (vnode->id == buffer->d_ino) {
2517				// found correct entry!
2518				break;
2519			}
2520		}
2521
2522		FS_CALL(parent, close_dir, cookie);
2523		FS_CALL(parent, free_dir_cookie, cookie);
2524	}
2525	return status;
2526}
2527
2528
2529static status_t
2530get_vnode_name(struct vnode* vnode, struct vnode* parent, char* name,
2531	size_t nameSize, bool kernel)
2532{
2533	char buffer[offsetof(struct dirent, d_name) + B_FILE_NAME_LENGTH + 1];
2534	struct dirent* dirent = (struct dirent*)buffer;
2535
2536	status_t status = get_vnode_name(vnode, parent, dirent, sizeof(buffer),
2537		get_current_io_context(kernel));
2538	if (status != B_OK)
2539		return status;
2540
2541	if (strlcpy(name, dirent->d_name, nameSize) >= nameSize)
2542		return B_BUFFER_OVERFLOW;
2543
2544	return B_OK;
2545}
2546
2547
2548/*!	Gets the full path to a given directory vnode.
2549	It uses the fs_get_vnode_name() call to get the name of a vnode; if a
2550	file system doesn't support this call, it will fall back to iterating
2551	through the parent directory to get the name of the child.
2552
2553	To protect against circular loops, it supports a maximum tree depth
2554	of 256 levels.
2555
	Note that the path may no longer be correct by the time this function
	returns! It doesn't use any locking to guarantee a consistent result, as
	paths aren't stable anyway: the path to a file can change at any time.

	It might be a good idea, though, for the calling function to check
	whether the returned path exists (this isn't done here for efficiency).
2562*/
2563static status_t
2564dir_vnode_to_path(struct vnode* vnode, char* buffer, size_t bufferSize,
2565	bool kernel)
2566{
2567	FUNCTION(("dir_vnode_to_path(%p, %p, %lu)\n", vnode, buffer, bufferSize));
2568
2569	if (vnode == NULL || buffer == NULL || bufferSize == 0)
2570		return B_BAD_VALUE;
2571
2572	if (!S_ISDIR(vnode->Type()))
2573		return B_NOT_A_DIRECTORY;
2574
2575	char* path = buffer;
2576	int32 insert = bufferSize;
2577	int32 maxLevel = 256;
2578	int32 length;
2579	status_t status = B_OK;
2580	struct io_context* ioContext = get_current_io_context(kernel);
2581
2582	// we don't use get_vnode() here because this call is more
2583	// efficient and does all we need from get_vnode()
2584	inc_vnode_ref_count(vnode);
2585
2586	path[--insert] = '\0';
2587		// the path is filled right to left
2588
2589	while (true) {
2590		// If the node is the context's root, bail out. Otherwise resolve mount
2591		// points.
2592		if (vnode == ioContext->root)
2593			break;
2594
2595		if (Vnode* coveredVnode = get_covered_vnode(vnode)) {
2596			put_vnode(vnode);
2597			vnode = coveredVnode;
2598		}
2599
2600		// lookup the parent vnode
2601		struct vnode* parentVnode;
2602		status = lookup_dir_entry(vnode, "..", &parentVnode);
2603		if (status != B_OK)
2604			goto out;
2605
2606		if (parentVnode == vnode) {
2607			// The caller apparently got their hands on a node outside of their
2608			// context's root. Now we've hit the global root.
2609			put_vnode(parentVnode);
2610			break;
2611		}
2612
2613		// get the node's name
2614		char nameBuffer[offsetof(struct dirent, d_name) + B_FILE_NAME_LENGTH + 1];
2615			// also used for fs_read_dir()
2616		char* name = &((struct dirent*)nameBuffer)->d_name[0];
2617		status = get_vnode_name(vnode, parentVnode, (struct dirent*)nameBuffer,
2618			sizeof(nameBuffer), ioContext);
2619
2620		// release the current vnode, we only need its parent from now on
2621		put_vnode(vnode);
2622		vnode = parentVnode;
2623
2624		if (status != B_OK)
2625			goto out;
2626
2627		// TODO: add an explicit check for loops in about 10 levels to do
2628		// real loop detection
2629
		// don't go deeper than 'maxLevel' to prevent circular loops
2631		if (maxLevel-- < 0) {
2632			status = B_LINK_LIMIT;
2633			goto out;
2634		}
2635
2636		// add the name in front of the current path
2637		name[B_FILE_NAME_LENGTH - 1] = '\0';
2638		length = strlen(name);
2639		insert -= length;
2640		if (insert <= 0) {
2641			status = B_RESULT_NOT_REPRESENTABLE;
2642			goto out;
2643		}
2644		memcpy(path + insert, name, length);
2645		path[--insert] = '/';
2646	}
2647
2648	// the root dir will result in an empty path: fix it
2649	if (path[insert] == '\0')
2650		path[--insert] = '/';
2651
2652	TRACE(("  path is: %s\n", path + insert));
2653
2654	// move the path to the start of the buffer
2655	length = bufferSize - insert;
2656	memmove(buffer, path + insert, length);
2657
2658out:
2659	put_vnode(vnode);
2660	return status;
2661}
2662
2663
2664/*!	Checks the length of every path component, and adds a '.'
2665	if the path ends in a slash.
2666	The given path buffer must be able to store at least one
2667	additional character.
2668*/
2669static status_t
2670check_path(char* to)
2671{
2672	int32 length = 0;
2673
2674	// check length of every path component
2675
2676	while (*to) {
2677		char* begin;
2678		if (*to == '/')
2679			to++, length++;
2680
2681		begin = to;
2682		while (*to != '/' && *to)
2683			to++, length++;
2684
2685		if (to - begin > B_FILE_NAME_LENGTH)
2686			return B_NAME_TOO_LONG;
2687	}
2688
2689	if (length == 0)
2690		return B_ENTRY_NOT_FOUND;
2691
2692	// complete path if there is a slash at the end
2693
2694	if (*(to - 1) == '/') {
2695		if (length > B_PATH_NAME_LENGTH - 2)
2696			return B_NAME_TOO_LONG;
2697
2698		to[0] = '.';
2699		to[1] = '\0';
2700	}
2701
2702	return B_OK;
2703}
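

// Illustrative behavior of check_path() (hypothetical paths): "/boot/home/"
// is completed in place to "/boot/home/.", "foo/bar" is left untouched, and
// any component longer than B_FILE_NAME_LENGTH characters is rejected with
// B_NAME_TOO_LONG.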
2704
2705
2706static struct file_descriptor*
2707get_fd_and_vnode(int fd, struct vnode** _vnode, bool kernel)
2708{
2709	struct file_descriptor* descriptor
2710		= get_fd(get_current_io_context(kernel), fd);
2711	if (descriptor == NULL)
2712		return NULL;
2713
2714	struct vnode* vnode = fd_vnode(descriptor);
2715	if (vnode == NULL) {
2716		put_fd(descriptor);
2717		return NULL;
2718	}
2719
2720	// ToDo: when we can close a file descriptor at any point, investigate
2721	//	if this is still valid to do (accessing the vnode without ref_count
2722	//	or locking)
2723	*_vnode = vnode;
2724	return descriptor;
2725}
2726
2727
2728static struct vnode*
2729get_vnode_from_fd(int fd, bool kernel)
2730{
2731	struct file_descriptor* descriptor;
2732	struct vnode* vnode;
2733
2734	descriptor = get_fd(get_current_io_context(kernel), fd);
2735	if (descriptor == NULL)
2736		return NULL;
2737
2738	vnode = fd_vnode(descriptor);
2739	if (vnode != NULL)
2740		inc_vnode_ref_count(vnode);
2741
2742	put_fd(descriptor);
2743	return vnode;
2744}
2745
2746
2747/*!	Gets the vnode from an FD + path combination. If \a fd is lower than zero,
2748	only the path will be considered. In this case, the \a path must not be
2749	NULL.
2750	If \a fd is a valid file descriptor, \a path may be NULL for directories,
2751	and should be NULL for files.
2752*/
2753static status_t
2754fd_and_path_to_vnode(int fd, char* path, bool traverseLeafLink,
2755	VnodePutter& _vnode, ino_t* _parentID, bool kernel)
2756{
2757	if (fd < 0 && !path)
2758		return B_BAD_VALUE;
2759
2760	if (path != NULL && *path == '\0')
2761		return B_ENTRY_NOT_FOUND;
2762
2763	if (fd < 0 || (path != NULL && path[0] == '/')) {
2764		// no FD or absolute path
2765		return path_to_vnode(path, traverseLeafLink, _vnode, _parentID, kernel);
2766	}
2767
2768	// FD only, or FD + relative path
2769	struct vnode* vnode = get_vnode_from_fd(fd, kernel);
2770	if (vnode == NULL)
2771		return B_FILE_ERROR;
2772
2773	if (path != NULL) {
2774		return vnode_path_to_vnode(vnode, path, traverseLeafLink, kernel,
2775			_vnode, _parentID);
2776	}
2777
2778	// there is no relative path to take into account
2779
2780	_vnode.SetTo(vnode);
2781	if (_parentID)
2782		*_parentID = -1;
2783
2784	return B_OK;
2785}
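

// Sketch of the combinations fd_and_path_to_vnode() accepts, in the spirit
// of POSIX openat() (names are hypothetical):
//
//	fd_and_path_to_vnode(-1, "/tmp/a", ...)		// absolute path, FD unused
//	fd_and_path_to_vnode(dirFD, "a/b", ...)		// resolved relative to dirFD
//	fd_and_path_to_vnode(dirFD, NULL, ...)		// the vnode behind dirFD itself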
2786
2787
2788static int
2789get_new_fd(int type, struct fs_mount* mount, struct vnode* vnode,
2790	void* cookie, int openMode, bool kernel)
2791{
2792	struct file_descriptor* descriptor;
2793	int fd;
2794
	// If the vnode is locked, we don't allow creating a new file or directory
	// file_descriptor for it
2797	if (vnode && vnode->mandatory_locked_by != NULL
2798		&& (type == FDTYPE_FILE || type == FDTYPE_DIR))
2799		return B_BUSY;
2800
2801	if ((openMode & O_RDWR) != 0 && (openMode & O_WRONLY) != 0)
2802		return B_BAD_VALUE;
2803
2804	descriptor = alloc_fd();
2805	if (!descriptor)
2806		return B_NO_MEMORY;
2807
2808	if (vnode)
2809		descriptor->u.vnode = vnode;
2810	else
2811		descriptor->u.mount = mount;
2812	descriptor->cookie = cookie;
2813
2814	switch (type) {
2815		// vnode types
2816		case FDTYPE_FILE:
2817			descriptor->ops = &sFileOps;
2818			break;
2819		case FDTYPE_DIR:
2820			descriptor->ops = &sDirectoryOps;
2821			break;
2822		case FDTYPE_ATTR:
2823			descriptor->ops = &sAttributeOps;
2824			break;
2825		case FDTYPE_ATTR_DIR:
2826			descriptor->ops = &sAttributeDirectoryOps;
2827			break;
2828
2829		// mount types
2830		case FDTYPE_INDEX_DIR:
2831			descriptor->ops = &sIndexDirectoryOps;
2832			break;
2833		case FDTYPE_QUERY:
2834			descriptor->ops = &sQueryOps;
2835			break;
2836
2837		default:
2838			panic("get_new_fd() called with unknown type %d\n", type);
2839			break;
2840	}
2841	descriptor->type = type;
2842	descriptor->open_mode = openMode;
2843
2844	if (descriptor->ops->fd_seek != NULL) {
2845		// some kinds of files are not seekable
2846		switch (vnode->Type() & S_IFMT) {
2847			case S_IFIFO:
2848			case S_IFSOCK:
2849				ASSERT(descriptor->pos == -1);
2850				break;
2851
			// The Open Group Base Specs single out only pipes, FIFOs, and
			// sockets as non-seekable, so we allow seeking for all other types.
2854			default:
2855				descriptor->pos = 0;
2856				break;
2857		}
2858	}
2859
2860	io_context* context = get_current_io_context(kernel);
2861	fd = new_fd(context, descriptor);
2862	if (fd < 0) {
2863		descriptor->ops = NULL;
2864		put_fd(descriptor);
2865		return B_NO_MORE_FDS;
2866	}
2867
2868	mutex_lock(&context->io_mutex);
2869	fd_set_close_on_exec(context, fd, (openMode & O_CLOEXEC) != 0);
2870	mutex_unlock(&context->io_mutex);
2871
2872	return fd;
2873}
2874
2875
2876/*!	In-place normalizes \a path. It's otherwise semantically equivalent to
2877	vfs_normalize_path(). See there for more documentation.
2878*/
2879static status_t
2880normalize_path(char* path, size_t pathSize, bool traverseLink, bool kernel)
2881{
2882	VnodePutter dir;
2883	status_t error;
2884
2885	for (int i = 0; i < B_MAX_SYMLINKS; i++) {
2886		// get dir vnode + leaf name
2887		char leaf[B_FILE_NAME_LENGTH];
2888		error = vnode_and_path_to_dir_vnode(dir.Detach(), path, dir, leaf, kernel);
2889		if (error != B_OK)
2890			return error;
2891		strcpy(path, leaf);
2892
2893		// get file vnode, if we shall resolve links
2894		bool fileExists = false;
2895		VnodePutter fileVnode;
2896		if (traverseLink) {
2897			inc_vnode_ref_count(dir.Get());
2898			if (vnode_path_to_vnode(dir.Get(), path, false, kernel, fileVnode,
2899					NULL) == B_OK) {
2900				fileExists = true;
2901			}
2902		}
2903
2904		if (!fileExists || !traverseLink || !S_ISLNK(fileVnode->Type())) {
2905			// we're done -- construct the path
2906			bool hasLeaf = true;
2907			if (strcmp(leaf, ".") == 0 || strcmp(leaf, "..") == 0) {
2908				// special cases "." and ".." -- get the dir, forget the leaf
2909				error = vnode_path_to_vnode(dir.Detach(), leaf, false, kernel,
2910					dir, NULL);
2911				if (error != B_OK)
2912					return error;
2913				hasLeaf = false;
2914			}
2915
2916			// get the directory path
2917			error = dir_vnode_to_path(dir.Get(), path, B_PATH_NAME_LENGTH, kernel);
2918			if (error != B_OK)
2919				return error;
2920
2921			// append the leaf name
2922			if (hasLeaf) {
2923				// insert a directory separator if this is not the file system
2924				// root
2925				if ((strcmp(path, "/") != 0
2926					&& strlcat(path, "/", pathSize) >= pathSize)
2927					|| strlcat(path, leaf, pathSize) >= pathSize) {
2928					return B_NAME_TOO_LONG;
2929				}
2930			}
2931
2932			return B_OK;
2933		}
2934
2935		// read link
2936		if (HAS_FS_CALL(fileVnode, read_symlink)) {
2937			size_t bufferSize = B_PATH_NAME_LENGTH - 1;
2938			error = FS_CALL(fileVnode.Get(), read_symlink, path, &bufferSize);
2939			if (error != B_OK)
2940				return error;
2941			if (bufferSize < B_PATH_NAME_LENGTH)
2942				path[bufferSize] = '\0';
2943		} else
2944			return B_BAD_VALUE;
2945	}
2946
2947	return B_LINK_LIMIT;
2948}
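

// Illustrative effect of normalize_path() (hypothetical input): a buffer
// holding "/boot/home//Desktop/./data" would be rewritten in place to
// "/boot/home/Desktop/data"; if the leaf is a symlink and \a traverseLink
// is true, it is replaced by its (normalized) target, and after
// B_MAX_SYMLINKS nested links the function gives up with B_LINK_LIMIT.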
2949
2950
2951static status_t
2952resolve_covered_parent(struct vnode* parent, dev_t* _device, ino_t* _node,
2953	struct io_context* ioContext)
2954{
2955	// Make sure the IO context root is not bypassed.
2956	if (parent == ioContext->root) {
2957		*_device = parent->device;
2958		*_node = parent->id;
2959		return B_OK;
2960	}
2961
2962	inc_vnode_ref_count(parent);
2963		// vnode_path_to_vnode() puts the node
2964
2965	// ".." is guaranteed not to be clobbered by this call
2966	VnodePutter vnode;
2967	status_t status = vnode_path_to_vnode(parent, (char*)"..", false,
2968		ioContext, vnode, NULL);
2969	if (status == B_OK) {
2970		*_device = vnode->device;
2971		*_node = vnode->id;
2972	}
2973
2974	return status;
2975}
2976
2977
2978#ifdef ADD_DEBUGGER_COMMANDS
2979
2980
2981static void
2982_dump_advisory_locking(advisory_locking* locking)
2983{
2984	if (locking == NULL)
2985		return;
2986
2987	kprintf("   lock:        %" B_PRId32, locking->lock);
2988	kprintf("   wait_sem:    %" B_PRId32, locking->wait_sem);
2989
2990	int32 index = 0;
2991	LockList::Iterator iterator = locking->locks.GetIterator();
2992	while (iterator.HasNext()) {
2993		struct advisory_lock* lock = iterator.Next();
2994
2995		kprintf("   [%2" B_PRId32 "] team:   %" B_PRId32 "\n", index++, lock->team);
2996		kprintf("        start:  %" B_PRIdOFF "\n", lock->start);
2997		kprintf("        end:    %" B_PRIdOFF "\n", lock->end);
2998		kprintf("        shared? %s\n", lock->shared ? "yes" : "no");
2999	}
3000}
3001
3002
3003static void
3004_dump_mount(struct fs_mount* mount)
3005{
3006	kprintf("MOUNT: %p\n", mount);
3007	kprintf(" id:            %" B_PRIdDEV "\n", mount->id);
3008	kprintf(" device_name:   %s\n", mount->device_name);
3009	kprintf(" root_vnode:    %p\n", mount->root_vnode);
3010	kprintf(" covers:        %p\n", mount->root_vnode->covers);
3011	kprintf(" partition:     %p\n", mount->partition);
3012	kprintf(" lock:          %p\n", &mount->lock);
3013	kprintf(" flags:        %s%s\n", mount->unmounting ? " unmounting" : "",
3014		mount->owns_file_device ? " owns_file_device" : "");
3015
3016	fs_volume* volume = mount->volume;
3017	while (volume != NULL) {
3018		kprintf(" volume %p:\n", volume);
3019		kprintf("  layer:            %" B_PRId32 "\n", volume->layer);
3020		kprintf("  private_volume:   %p\n", volume->private_volume);
3021		kprintf("  ops:              %p\n", volume->ops);
3022		kprintf("  file_system:      %p\n", volume->file_system);
3023		kprintf("  file_system_name: %s\n", volume->file_system_name);
3024		volume = volume->super_volume;
3025	}
3026
3027	set_debug_variable("_volume", (addr_t)mount->volume->private_volume);
3028	set_debug_variable("_root", (addr_t)mount->root_vnode);
3029	set_debug_variable("_covers", (addr_t)mount->root_vnode->covers);
3030	set_debug_variable("_partition", (addr_t)mount->partition);
3031}
3032
3033
3034static bool
3035debug_prepend_vnode_name_to_path(char* buffer, size_t& bufferSize,
3036	const char* name)
3037{
3038	bool insertSlash = buffer[bufferSize] != '\0';
3039	size_t nameLength = strlen(name);
3040
3041	if (bufferSize < nameLength + (insertSlash ? 1 : 0))
3042		return false;
3043
3044	if (insertSlash)
3045		buffer[--bufferSize] = '/';
3046
3047	bufferSize -= nameLength;
3048	memcpy(buffer + bufferSize, name, nameLength);
3049
3050	return true;
3051}
3052
3053
3054static bool
3055debug_prepend_vnode_id_to_path(char* buffer, size_t& bufferSize, dev_t devID,
3056	ino_t nodeID)
3057{
3058	if (bufferSize == 0)
3059		return false;
3060
3061	bool insertSlash = buffer[bufferSize] != '\0';
3062	if (insertSlash)
3063		buffer[--bufferSize] = '/';
3064
3065	size_t size = snprintf(buffer, bufferSize,
3066		"<%" B_PRIdDEV ",%" B_PRIdINO ">", devID, nodeID);
3067	if (size > bufferSize) {
3068		if (insertSlash)
3069			bufferSize++;
3070		return false;
3071	}
3072
3073	if (size < bufferSize)
3074		memmove(buffer + bufferSize - size, buffer, size);
3075
3076	bufferSize -= size;
3077	return true;
3078}
3079
3080
3081static char*
3082debug_resolve_vnode_path(struct vnode* vnode, char* buffer, size_t bufferSize,
3083	bool& _truncated)
3084{
3085	// null-terminate the path
3086	buffer[--bufferSize] = '\0';
3087
3088	while (true) {
3089		while (vnode->covers != NULL)
3090			vnode = vnode->covers;
3091
3092		if (vnode == sRoot) {
3093			_truncated = bufferSize == 0;
3094			if (!_truncated)
3095				buffer[--bufferSize] = '/';
3096			return buffer + bufferSize;
3097		}
3098
3099		// resolve the name
3100		ino_t dirID;
3101		const char* name = vnode->mount->entry_cache.DebugReverseLookup(
3102			vnode->id, dirID);
3103		if (name == NULL) {
3104			// Failed to resolve the name -- prepend "<dev,node>/".
3105			_truncated = !debug_prepend_vnode_id_to_path(buffer, bufferSize,
3106				vnode->mount->id, vnode->id);
3107			return buffer + bufferSize;
3108		}
3109
3110		// prepend the name
3111		if (!debug_prepend_vnode_name_to_path(buffer, bufferSize, name)) {
3112			_truncated = true;
3113			return buffer + bufferSize;
3114		}
3115
3116		// resolve the directory node
3117		struct vnode* nextVnode = lookup_vnode(vnode->mount->id, dirID);
3118		if (nextVnode == NULL) {
3119			_truncated = !debug_prepend_vnode_id_to_path(buffer, bufferSize,
3120				vnode->mount->id, dirID);
3121			return buffer + bufferSize;
3122		}
3123
3124		vnode = nextVnode;
3125	}
3126}
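

// Example of the output debug_resolve_vnode_path() produces (values are
// hypothetical): a fully resolved node yields "/boot/home/Desktop"; when a
// name is missing from the entry cache, the unresolved ancestor is shown by
// ID instead, e.g. "<2,581>/home/Desktop", and _truncated is set if the
// buffer runs out.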
3127
3128
3129static void
3130_dump_vnode(struct vnode* vnode, bool printPath)
3131{
3132	kprintf("VNODE: %p\n", vnode);
3133	kprintf(" device:        %" B_PRIdDEV "\n", vnode->device);
3134	kprintf(" id:            %" B_PRIdINO "\n", vnode->id);
3135	kprintf(" ref_count:     %" B_PRId32 "\n", vnode->ref_count);
3136	kprintf(" private_node:  %p\n", vnode->private_node);
3137	kprintf(" mount:         %p\n", vnode->mount);
3138	kprintf(" covered_by:    %p\n", vnode->covered_by);
3139	kprintf(" covers:        %p\n", vnode->covers);
3140	kprintf(" cache:         %p\n", vnode->cache);
3141	kprintf(" type:          %#" B_PRIx32 "\n", vnode->Type());
3142	kprintf(" flags:         %s%s%s\n", vnode->IsRemoved() ? "r" : "-",
3143		vnode->IsBusy() ? "b" : "-", vnode->IsUnpublished() ? "u" : "-");
3144	kprintf(" advisory_lock: %p\n", vnode->advisory_locking);
3145
3146	_dump_advisory_locking(vnode->advisory_locking);
3147
3148	if (printPath) {
3149		void* buffer = debug_malloc(B_PATH_NAME_LENGTH);
3150		if (buffer != NULL) {
3151			bool truncated;
3152			char* path = debug_resolve_vnode_path(vnode, (char*)buffer,
3153				B_PATH_NAME_LENGTH, truncated);
3154			if (path != NULL) {
3155				kprintf(" path:          ");
3156				if (truncated)
3157					kputs("<truncated>/");
3158				kputs(path);
3159				kputs("\n");
3160			} else
3161				kprintf("Failed to resolve vnode path.\n");
3162
3163			debug_free(buffer);
3164		} else
3165			kprintf("Failed to allocate memory for constructing the path.\n");
3166	}
3167
3168	set_debug_variable("_node", (addr_t)vnode->private_node);
3169	set_debug_variable("_mount", (addr_t)vnode->mount);
3170	set_debug_variable("_covered_by", (addr_t)vnode->covered_by);
3171	set_debug_variable("_covers", (addr_t)vnode->covers);
3172	set_debug_variable("_adv_lock", (addr_t)vnode->advisory_locking);
3173}
3174
3175
3176static int
3177dump_mount(int argc, char** argv)
3178{
3179	if (argc != 2 || !strcmp(argv[1], "--help")) {
3180		kprintf("usage: %s [id|address]\n", argv[0]);
3181		return 0;
3182	}
3183
3184	ulong val = parse_expression(argv[1]);
3185	uint32 id = val;
3186
3187	struct fs_mount* mount = sMountsTable->Lookup(id);
3188	if (mount == NULL) {
		if (IS_USER_ADDRESS(val)) {
3190			kprintf("fs_mount not found\n");
3191			return 0;
3192		}
3193		mount = (fs_mount*)val;
3194	}
3195
3196	_dump_mount(mount);
3197	return 0;
3198}
3199
3200
3201static int
3202dump_mounts(int argc, char** argv)
3203{
3204	if (argc != 1) {
3205		kprintf("usage: %s\n", argv[0]);
3206		return 0;
3207	}
3208
3209	kprintf("%-*s    id %-*s   %-*s   %-*s   fs_name\n",
3210		B_PRINTF_POINTER_WIDTH, "address", B_PRINTF_POINTER_WIDTH, "root",
3211		B_PRINTF_POINTER_WIDTH, "covers", B_PRINTF_POINTER_WIDTH, "cookie");
3212
3213	struct fs_mount* mount;
3214
3215	MountTable::Iterator iterator(sMountsTable);
3216	while (iterator.HasNext()) {
3217		mount = iterator.Next();
3218		kprintf("%p%4" B_PRIdDEV " %p %p %p %s\n", mount, mount->id, mount->root_vnode,
3219			mount->root_vnode->covers, mount->volume->private_volume,
3220			mount->volume->file_system_name);
3221
3222		fs_volume* volume = mount->volume;
3223		while (volume->super_volume != NULL) {
3224			volume = volume->super_volume;
3225			kprintf("                                     %p %s\n",
3226				volume->private_volume, volume->file_system_name);
3227		}
3228	}
3229
3230	return 0;
3231}
3232
3233
3234static int
3235dump_vnode(int argc, char** argv)
3236{
3237	bool printPath = false;
3238	int argi = 1;
3239	if (argc >= 2 && strcmp(argv[argi], "-p") == 0) {
3240		printPath = true;
3241		argi++;
3242	}
3243
3244	if (argi >= argc || argi + 2 < argc) {
3245		print_debugger_command_usage(argv[0]);
3246		return 0;
3247	}
3248
3249	struct vnode* vnode = NULL;
3250
3251	if (argi + 1 == argc) {
3252		vnode = (struct vnode*)parse_expression(argv[argi]);
3253		if (IS_USER_ADDRESS(vnode)) {
3254			kprintf("invalid vnode address\n");
3255			return 0;
3256		}
3257		_dump_vnode(vnode, printPath);
3258		return 0;
3259	}
3260
3261	dev_t device = parse_expression(argv[argi]);
3262	ino_t id = parse_expression(argv[argi + 1]);
3263
3264	VnodeTable::Iterator iterator(sVnodeTable);
3265	while (iterator.HasNext()) {
3266		vnode = iterator.Next();
3267		if (vnode->id != id || vnode->device != device)
3268			continue;
3269
3270		_dump_vnode(vnode, printPath);
3271	}
3272
3273	return 0;
3274}
3275
3276
3277static int
3278dump_vnodes(int argc, char** argv)
3279{
3280	if (argc != 2 || !strcmp(argv[1], "--help")) {
3281		kprintf("usage: %s [device]\n", argv[0]);
3282		return 0;
3283	}
3284
3285	// restrict dumped nodes to a certain device if requested
3286	dev_t device = parse_expression(argv[1]);
3287
3288	struct vnode* vnode;
3289
3290	kprintf("%-*s   dev     inode  ref %-*s   %-*s   %-*s   flags\n",
3291		B_PRINTF_POINTER_WIDTH, "address", B_PRINTF_POINTER_WIDTH, "cache",
3292		B_PRINTF_POINTER_WIDTH, "fs-node", B_PRINTF_POINTER_WIDTH, "locking");
3293
3294	VnodeTable::Iterator iterator(sVnodeTable);
3295	while (iterator.HasNext()) {
3296		vnode = iterator.Next();
3297		if (vnode->device != device)
3298			continue;
3299
3300		kprintf("%p%4" B_PRIdDEV "%10" B_PRIdINO "%5" B_PRId32 " %p %p %p %s%s%s\n",
3301			vnode, vnode->device, vnode->id, vnode->ref_count, vnode->cache,
3302			vnode->private_node, vnode->advisory_locking,
3303			vnode->IsRemoved() ? "r" : "-", vnode->IsBusy() ? "b" : "-",
3304			vnode->IsUnpublished() ? "u" : "-");
3305	}
3306
3307	return 0;
3308}
3309
3310
3311static int
3312dump_vnode_caches(int argc, char** argv)
3313{
3314	struct vnode* vnode;
3315
	if (argc > 2 || (argc == 2 && !strcmp(argv[1], "--help"))) {
3317		kprintf("usage: %s [device]\n", argv[0]);
3318		return 0;
3319	}
3320
3321	// restrict dumped nodes to a certain device if requested
3322	dev_t device = -1;
3323	if (argc > 1)
3324		device = parse_expression(argv[1]);
3325
3326	kprintf("%-*s   dev     inode %-*s       size   pages\n",
3327		B_PRINTF_POINTER_WIDTH, "address", B_PRINTF_POINTER_WIDTH, "cache");
3328
3329	VnodeTable::Iterator iterator(sVnodeTable);
3330	while (iterator.HasNext()) {
3331		vnode = iterator.Next();
3332		if (vnode->cache == NULL)
3333			continue;
3334		if (device != -1 && vnode->device != device)
3335			continue;
3336
3337		kprintf("%p%4" B_PRIdDEV "%10" B_PRIdINO " %p %8" B_PRIdOFF "%8" B_PRId32 "\n",
3338			vnode, vnode->device, vnode->id, vnode->cache,
3339			(vnode->cache->virtual_end + B_PAGE_SIZE - 1) / B_PAGE_SIZE,
3340			vnode->cache->page_count);
3341	}
3342
3343	return 0;
3344}
3345
3346
3347int
3348dump_io_context(int argc, char** argv)
3349{
	if (argc > 2 || (argc == 2 && !strcmp(argv[1], "--help"))) {
3351		kprintf("usage: %s [team-id|address]\n", argv[0]);
3352		return 0;
3353	}
3354
3355	struct io_context* context = NULL;
3356
3357	if (argc > 1) {
3358		ulong num = parse_expression(argv[1]);
3359		if (IS_KERNEL_ADDRESS(num))
3360			context = (struct io_context*)num;
3361		else {
3362			Team* team = team_get_team_struct_locked(num);
3363			if (team == NULL) {
3364				kprintf("could not find team with ID %lu\n", num);
3365				return 0;
3366			}
3367			context = (struct io_context*)team->io_context;
3368		}
3369	} else
3370		context = get_current_io_context(true);
3371
3372	kprintf("I/O CONTEXT: %p\n", context);
3373	kprintf(" root vnode:\t%p\n", context->root);
3374	kprintf(" cwd vnode:\t%p\n", context->cwd);
3375	kprintf(" used fds:\t%" B_PRIu32 "\n", context->num_used_fds);
3376	kprintf(" max fds:\t%" B_PRIu32 "\n", context->table_size);
3377
3378	if (context->num_used_fds) {
3379		kprintf("   no.  type    %*s  ref  open  mode         pos    %*s\n",
3380			B_PRINTF_POINTER_WIDTH, "ops", B_PRINTF_POINTER_WIDTH, "cookie");
3381	}
3382
3383	for (uint32 i = 0; i < context->table_size; i++) {
3384		struct file_descriptor* fd = context->fds[i];
3385		if (fd == NULL)
3386			continue;
3387
3388		kprintf("  %3" B_PRIu32 ":  %4" B_PRId32 "  %p  %3" B_PRId32 "  %4"
3389			B_PRIu32 "  %4" B_PRIx32 "  %10" B_PRIdOFF "  %p  %s %p\n", i,
3390			fd->type, fd->ops, fd->ref_count, fd->open_count, fd->open_mode,
3391			fd->pos, fd->cookie,
3392			fd->type >= FDTYPE_INDEX && fd->type <= FDTYPE_QUERY
3393				? "mount" : "vnode",
3394			fd->u.vnode);
3395	}
3396
3397	kprintf(" used monitors:\t%" B_PRIu32 "\n", context->num_monitors);
3398	kprintf(" max monitors:\t%" B_PRIu32 "\n", context->max_monitors);
3399
3400	set_debug_variable("_cwd", (addr_t)context->cwd);
3401
3402	return 0;
3403}
3404
3405
3406int
3407dump_vnode_usage(int argc, char** argv)
3408{
3409	if (argc != 1) {
3410		kprintf("usage: %s\n", argv[0]);
3411		return 0;
3412	}
3413
3414	kprintf("Unused vnodes: %" B_PRIu32 " (max unused %" B_PRIu32 ")\n",
3415		sUnusedVnodes, kMaxUnusedVnodes);
3416
3417	uint32 count = sVnodeTable->CountElements();
3418
3419	kprintf("%" B_PRIu32 " vnodes total (%" B_PRIu32 " in use).\n", count,
3420		count - sUnusedVnodes);
3421	return 0;
3422}
3423
3424#endif	// ADD_DEBUGGER_COMMANDS
3425
3426
3427/*!	Clears memory specified by an iovec array.
3428*/
3429static void
3430zero_iovecs(const iovec* vecs, size_t vecCount, size_t bytes)
3431{
3432	for (size_t i = 0; i < vecCount && bytes > 0; i++) {
3433		size_t length = std::min(vecs[i].iov_len, bytes);
3434		memset(vecs[i].iov_base, 0, length);
3435		bytes -= length;
3436	}
3437}
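

// For instance (illustrative values): with vecs of lengths { 100, 100 } and
// bytes = 150, zero_iovecs() clears all of the first vec but only the first
// 50 bytes of the second, leaving the remainder untouched.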
3438
3439
3440/*!	Does the dirty work of combining the file_io_vecs with the iovecs
3441	and calls the file system hooks to read/write the request to disk.
3442*/
3443static status_t
3444common_file_io_vec_pages(struct vnode* vnode, void* cookie,
3445	const file_io_vec* fileVecs, size_t fileVecCount, const iovec* vecs,
3446	size_t vecCount, uint32* _vecIndex, size_t* _vecOffset, size_t* _numBytes,
3447	bool doWrite)
3448{
3449	if (fileVecCount == 0) {
3450		// There are no file vecs at this offset, so we're obviously trying
3451		// to access the file outside of its bounds
3452		return B_BAD_VALUE;
3453	}
3454
3455	size_t numBytes = *_numBytes;
3456	uint32 fileVecIndex;
3457	size_t vecOffset = *_vecOffset;
3458	uint32 vecIndex = *_vecIndex;
3459	status_t status;
3460	size_t size;
3461
3462	if (!doWrite && vecOffset == 0) {
3463		// now directly read the data from the device
3464		// the first file_io_vec can be read directly
3465		// TODO: we could also write directly
3466
3467		if (fileVecs[0].length < (off_t)numBytes)
3468			size = fileVecs[0].length;
3469		else
3470			size = numBytes;
3471
3472		if (fileVecs[0].offset >= 0) {
3473			status = FS_CALL(vnode, read_pages, cookie, fileVecs[0].offset,
3474				&vecs[vecIndex], vecCount - vecIndex, &size);
3475		} else {
3476			// sparse read
3477			zero_iovecs(&vecs[vecIndex], vecCount - vecIndex, size);
3478			status = B_OK;
3479		}
3480		if (status != B_OK)
3481			return status;
3482
3483		ASSERT((off_t)size <= fileVecs[0].length);
3484
3485		// If the file portion was contiguous, we're already done now
3486		if (size == numBytes)
3487			return B_OK;
3488
3489		// if we reached the end of the file, we can return as well
3490		if ((off_t)size != fileVecs[0].length) {
3491			*_numBytes = size;
3492			return B_OK;
3493		}
3494
3495		fileVecIndex = 1;
3496
3497		// first, find out where we have to continue in our iovecs
3498		for (; vecIndex < vecCount; vecIndex++) {
3499			if (size < vecs[vecIndex].iov_len)
3500				break;
3501
3502			size -= vecs[vecIndex].iov_len;
3503		}
3504
3505		vecOffset = size;
3506	} else {
3507		fileVecIndex = 0;
3508		size = 0;
3509	}
3510
3511	// Too bad, let's process the rest of the file_io_vecs
3512
3513	size_t totalSize = size;
3514	size_t bytesLeft = numBytes - size;
3515
3516	for (; fileVecIndex < fileVecCount; fileVecIndex++) {
3517		const file_io_vec &fileVec = fileVecs[fileVecIndex];
3518		off_t fileOffset = fileVec.offset;
3519		off_t fileLeft = min_c(fileVec.length, (off_t)bytesLeft);
3520
3521		TRACE(("FILE VEC [%" B_PRIu32 "] length %" B_PRIdOFF "\n", fileVecIndex,
3522			fileLeft));
3523
3524		// process the complete fileVec
3525		while (fileLeft > 0) {
3526			iovec tempVecs[MAX_TEMP_IO_VECS];
3527			uint32 tempCount = 0;
3528
3529			// size tracks how much of what is left of the current fileVec
3530			// (fileLeft) has been assigned to tempVecs
3531			size = 0;
3532
3533			// assign what is left of the current fileVec to the tempVecs
3534			for (size = 0; (off_t)size < fileLeft && vecIndex < vecCount
3535					&& tempCount < MAX_TEMP_IO_VECS;) {
3536				// try to satisfy one iovec per iteration (or as much as
3537				// possible)
3538
3539				// bytes left of the current iovec
3540				size_t vecLeft = vecs[vecIndex].iov_len - vecOffset;
3541				if (vecLeft == 0) {
3542					vecOffset = 0;
3543					vecIndex++;
3544					continue;
3545				}
3546
3547				TRACE(("fill vec %" B_PRIu32 ", offset = %lu, size = %lu\n",
3548					vecIndex, vecOffset, size));
3549
3550				// actually available bytes
3551				size_t tempVecSize = min_c(vecLeft, fileLeft - size);
3552
3553				tempVecs[tempCount].iov_base
3554					= (void*)((addr_t)vecs[vecIndex].iov_base + vecOffset);
3555				tempVecs[tempCount].iov_len = tempVecSize;
3556				tempCount++;
3557
3558				size += tempVecSize;
3559				vecOffset += tempVecSize;
3560			}
3561
3562			size_t bytes = size;
3563
3564			if (fileOffset == -1) {
3565				if (doWrite) {
3566					panic("sparse write attempt: vnode %p", vnode);
3567					status = B_IO_ERROR;
3568				} else {
3569					// sparse read
3570					zero_iovecs(tempVecs, tempCount, bytes);
3571					status = B_OK;
3572				}
3573			} else if (doWrite) {
3574				status = FS_CALL(vnode, write_pages, cookie, fileOffset,
3575					tempVecs, tempCount, &bytes);
3576			} else {
3577				status = FS_CALL(vnode, read_pages, cookie, fileOffset,
3578					tempVecs, tempCount, &bytes);
3579			}
3580			if (status != B_OK)
3581				return status;
3582
3583			totalSize += bytes;
3584			bytesLeft -= size;
3585			if (fileOffset >= 0)
3586				fileOffset += size;
3587			fileLeft -= size;
3588			//dprintf("-> file left = %Lu\n", fileLeft);
3589
3590			if (size != bytes || vecIndex >= vecCount) {
3591				// there are no more bytes or iovecs, let's bail out
3592				*_numBytes = totalSize;
3593				return B_OK;
3594			}
3595		}
3596	}
3597
3598	*_vecIndex = vecIndex;
3599	*_vecOffset = vecOffset;
3600	*_numBytes = totalSize;
3601	return B_OK;
3602}
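

// Worked example for a read (all numbers hypothetical): with
// fileVecs = { { offset 1000, length 300 }, { offset 5000, length 200 } }
// and iovecs of lengths { 400, 100 }, the first fileVec is read directly
// into the iovecs (300 bytes into the first one). The remaining 200 bytes
// are gathered into tempVecs -- 100 bytes at iovec[0] + 300 plus all 100
// bytes of iovec[1] -- and read from file offset 5000 in a single
// read_pages() call. A fileVec offset of -1 denotes a sparse extent, which
// is zero-filled instead of read.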
3603
3604
3605static bool
3606is_user_in_group(gid_t gid)
3607{
3608	if (gid == getegid())
3609		return true;
3610
3611	gid_t groups[NGROUPS_MAX];
3612	int groupCount = getgroups(NGROUPS_MAX, groups);
3613	for (int i = 0; i < groupCount; i++) {
3614		if (gid == groups[i])
3615			return true;
3616	}
3617
3618	return false;
3619}
3620
3621
3622static status_t
3623free_io_context(io_context* context)
3624{
3625	uint32 i;
3626
3627	TIOC(FreeIOContext(context));
3628
3629	if (context->root)
3630		put_vnode(context->root);
3631
3632	if (context->cwd)
3633		put_vnode(context->cwd);
3634
3635	mutex_lock(&context->io_mutex);
3636
3637	for (i = 0; i < context->table_size; i++) {
3638		if (struct file_descriptor* descriptor = context->fds[i]) {
3639			close_fd(context, descriptor);
3640			put_fd(descriptor);
3641		}
3642	}
3643
3644	mutex_destroy(&context->io_mutex);
3645
3646	remove_node_monitors(context);
3647	free(context->fds);
3648	free(context);
3649
3650	return B_OK;
3651}
3652
3653
3654static status_t
3655resize_monitor_table(struct io_context* context, const int newSize)
3656{
	status_t status = B_OK;
3658
3659	if (newSize <= 0 || newSize > MAX_NODE_MONITORS)
3660		return B_BAD_VALUE;
3661
3662	mutex_lock(&context->io_mutex);
3663
3664	if ((size_t)newSize < context->num_monitors) {
3665		status = B_BUSY;
3666		goto out;
3667	}
3668	context->max_monitors = newSize;
3669
3670out:
3671	mutex_unlock(&context->io_mutex);
3672	return status;
3673}
3674
3675
3676//	#pragma mark - public API for file systems
3677
3678
3679extern "C" status_t
3680new_vnode(fs_volume* volume, ino_t vnodeID, void* privateNode,
3681	fs_vnode_ops* ops)
3682{
3683	FUNCTION(("new_vnode(volume = %p (%" B_PRId32 "), vnodeID = %" B_PRId64
3684		", node = %p)\n", volume, volume->id, vnodeID, privateNode));
3685
3686	if (privateNode == NULL)
3687		return B_BAD_VALUE;
3688
3689	int32 tries = BUSY_VNODE_RETRIES;
3690restart:
3691	// create the node
3692	bool nodeCreated;
3693	struct vnode* vnode;
3694	status_t status = create_new_vnode_and_lock(volume->id, vnodeID, vnode,
3695		nodeCreated);
3696	if (status != B_OK)
3697		return status;
3698
3699	WriteLocker nodeLocker(sVnodeLock, true);
3700		// create_new_vnode_and_lock() has locked for us
3701
3702	if (!nodeCreated && vnode->IsBusy()) {
3703		nodeLocker.Unlock();
3704		if (!retry_busy_vnode(tries, volume->id, vnodeID))
3705			return B_BUSY;
3706		goto restart;
3707	}
3708
3709	// file system integrity check:
3710	// test if the vnode already exists and bail out if this is the case!
3711	if (!nodeCreated) {
3712		panic("vnode %" B_PRIdDEV ":%" B_PRIdINO " already exists (node = %p, "
3713			"vnode->node = %p)!", volume->id, vnodeID, privateNode,
3714			vnode->private_node);
3715		return B_ERROR;
3716	}
3717
3718	vnode->private_node = privateNode;
3719	vnode->ops = ops;
3720	vnode->SetUnpublished(true);
3721
3722	TRACE(("returns: %s\n", strerror(status)));
3723
3724	return status;
3725}
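

// Hedged usage sketch for a file system implementation (gMyFSVnodeOps and
// myInode are hypothetical): a node that needs further setup before it
// becomes visible is first registered unpublished via new_vnode(), then
// made visible via publish_vnode():
//
//	status_t status = new_vnode(volume, inodeID, myInode, &gMyFSVnodeOps);
//	if (status == B_OK) {
//		// ... initialize; the node is still invisible to lookups ...
//		status = publish_vnode(volume, inodeID, myInode, &gMyFSVnodeOps,
//			S_IFREG, 0);
//	}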
3726
3727
3728extern "C" status_t
3729publish_vnode(fs_volume* volume, ino_t vnodeID, void* privateNode,
3730	fs_vnode_ops* ops, int type, uint32 flags)
3731{
3732	FUNCTION(("publish_vnode()\n"));
3733
3734	int32 tries = BUSY_VNODE_RETRIES;
3735restart:
3736	WriteLocker locker(sVnodeLock);
3737
3738	struct vnode* vnode = lookup_vnode(volume->id, vnodeID);
3739
3740	bool nodeCreated = false;
3741	if (vnode == NULL) {
3742		if (privateNode == NULL)
3743			return B_BAD_VALUE;
3744
3745		// create the node
3746		locker.Unlock();
3747			// create_new_vnode_and_lock() will re-lock for us on success
3748		status_t status = create_new_vnode_and_lock(volume->id, vnodeID, vnode,
3749			nodeCreated);
3750		if (status != B_OK)
3751			return status;
3752
3753		locker.SetTo(sVnodeLock, true);
3754	}
3755
3756	if (nodeCreated) {
3757		vnode->private_node = privateNode;
3758		vnode->ops = ops;
3759		vnode->SetUnpublished(true);
3760	} else if (vnode->IsBusy() && vnode->IsUnpublished()
3761		&& vnode->private_node == privateNode && vnode->ops == ops) {
3762		// already known, but not published
3763	} else if (vnode->IsBusy()) {
3764		locker.Unlock();
3765		if (!retry_busy_vnode(tries, volume->id, vnodeID))
3766			return B_BUSY;
3767		goto restart;
3768	} else
3769		return B_BAD_VALUE;
3770
3771	bool publishSpecialSubNode = false;
3772
3773	vnode->SetType(type);
3774	vnode->SetRemoved((flags & B_VNODE_PUBLISH_REMOVED) != 0);
3775	publishSpecialSubNode = is_special_node_type(type)
3776		&& (flags & B_VNODE_DONT_CREATE_SPECIAL_SUB_NODE) == 0;
3777
3778	status_t status = B_OK;
3779
3780	// create sub vnodes, if necessary
3781	if (volume->sub_volume != NULL || publishSpecialSubNode) {
3782		locker.Unlock();
3783
3784		fs_volume* subVolume = volume;
3785		if (volume->sub_volume != NULL) {
3786			while (status == B_OK && subVolume->sub_volume != NULL) {
3787				subVolume = subVolume->sub_volume;
3788				status = subVolume->ops->create_sub_vnode(subVolume, vnodeID,
3789					vnode);
3790			}
3791		}
3792
3793		if (status == B_OK && publishSpecialSubNode)
3794			status = create_special_sub_node(vnode, flags);
3795
3796		if (status != B_OK) {
3797			// error -- clean up the created sub vnodes
3798			while (subVolume->super_volume != volume) {
3799				subVolume = subVolume->super_volume;
3800				subVolume->ops->delete_sub_vnode(subVolume, vnode);
3801			}
3802		}
3803
3804		if (status == B_OK) {
3805			ReadLocker vnodesReadLocker(sVnodeLock);
3806			AutoLocker<Vnode> nodeLocker(vnode);
3807			vnode->SetBusy(false);
3808			vnode->SetUnpublished(false);
3809		} else {
3810			locker.Lock();
3811			sVnodeTable->Remove(vnode);
3812			remove_vnode_from_mount_list(vnode, vnode->mount);
3813			object_cache_free(sVnodeCache, vnode, 0);
3814		}
3815	} else {
3816		// we still hold the write lock -- mark the node unbusy and published
3817		vnode->SetBusy(false);
3818		vnode->SetUnpublished(false);
3819	}
3820
3821	TRACE(("returns: %s\n", strerror(status)));
3822
3823	return status;
3824}
3825
3826
3827extern "C" status_t
3828get_vnode(fs_volume* volume, ino_t vnodeID, void** _privateNode)
3829{
3830	struct vnode* vnode;
3831
3832	if (volume == NULL)
3833		return B_BAD_VALUE;
3834
3835	status_t status = get_vnode(volume->id, vnodeID, &vnode, true, true);
3836	if (status != B_OK)
3837		return status;
3838
3839	// If this is a layered FS, we need to get the node cookie for the requested
3840	// layer.
3841	if (HAS_FS_CALL(vnode, get_super_vnode)) {
3842		fs_vnode resolvedNode;
		status = FS_CALL(vnode, get_super_vnode, volume,
			&resolvedNode);
3845		if (status != B_OK) {
3846			panic("get_vnode(): Failed to get super node for vnode %p, "
3847				"volume: %p", vnode, volume);
3848			put_vnode(vnode);
3849			return status;
3850		}
3851
3852		if (_privateNode != NULL)
3853			*_privateNode = resolvedNode.private_node;
3854	} else if (_privateNode != NULL)
3855		*_privateNode = vnode->private_node;
3856
3857	return B_OK;
3858}
3859
3860
3861extern "C" status_t
3862acquire_vnode(fs_volume* volume, ino_t vnodeID)
3863{
3864	ReadLocker nodeLocker(sVnodeLock);
3865
3866	struct vnode* vnode = lookup_vnode(volume->id, vnodeID);
3867	if (vnode == NULL)
3868		return B_BAD_VALUE;
3869
3870	inc_vnode_ref_count(vnode);
3871	return B_OK;
3872}
3873
3874
3875extern "C" status_t
3876put_vnode(fs_volume* volume, ino_t vnodeID)
3877{
3878	struct vnode* vnode;
3879
3880	rw_lock_read_lock(&sVnodeLock);
3881	vnode = lookup_vnode(volume->id, vnodeID);
3882	rw_lock_read_unlock(&sVnodeLock);
3883
3884	if (vnode == NULL)
3885		return B_BAD_VALUE;
3886
3887	dec_vnode_ref_count(vnode, false, true);
3888	return B_OK;
3889}
3890
3891
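/*!	Marks the vnode as removed, i.e. it will be deleted when the last
	reference to it has been released. If the node is still unpublished,
	it is deleted right away. Fails with \c B_BUSY if the node currently
	covers or is covered by another vnode (e.g. it is in use as a mount
	point).
*/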
3892extern "C" status_t
3893remove_vnode(fs_volume* volume, ino_t vnodeID)
3894{
3895	ReadLocker locker(sVnodeLock);
3896
3897	struct vnode* vnode = lookup_vnode(volume->id, vnodeID);
3898	if (vnode == NULL)
3899		return B_ENTRY_NOT_FOUND;
3900
3901	if (vnode->covered_by != NULL || vnode->covers != NULL) {
3902		// this vnode is in use
3903		return B_BUSY;
3904	}
3905
3906	vnode->Lock();
3907
3908	vnode->SetRemoved(true);
3909	bool removeUnpublished = false;
3910
3911	if (vnode->IsUnpublished()) {
3912		// prepare the vnode for deletion
3913		removeUnpublished = true;
3914		vnode->SetBusy(true);
3915	}
3916
3917	vnode->Unlock();
3918	locker.Unlock();
3919
3920	if (removeUnpublished) {
3921		// If the vnode hasn't been published yet, we delete it here
3922		atomic_add(&vnode->ref_count, -1);
3923		free_vnode(vnode, true);
3924	}
3925
3926	return B_OK;
3927}
3928
3929
3930extern "C" status_t
3931unremove_vnode(fs_volume* volume, ino_t vnodeID)
3932{
3933	struct vnode* vnode;
3934
3935	rw_lock_read_lock(&sVnodeLock);
3936
3937	vnode = lookup_vnode(volume->id, vnodeID);
3938	if (vnode) {
3939		AutoLocker<Vnode> nodeLocker(vnode);
3940		vnode->SetRemoved(false);
3941	}
3942
3943	rw_lock_read_unlock(&sVnodeLock);
3944	return B_OK;
3945}
3946
3947
3948extern "C" status_t
3949get_vnode_removed(fs_volume* volume, ino_t vnodeID, bool* _removed)
3950{
3951	ReadLocker _(sVnodeLock);
3952
3953	if (struct vnode* vnode = lookup_vnode(volume->id, vnodeID)) {
3954		if (_removed != NULL)
3955			*_removed = vnode->IsRemoved();
3956		return B_OK;
3957	}
3958
3959	return B_BAD_VALUE;
3960}
3961
3962
3963extern "C" fs_volume*
3964volume_for_vnode(fs_vnode* _vnode)
3965{
3966	if (_vnode == NULL)
3967		return NULL;
3968
3969	struct vnode* vnode = static_cast<struct vnode*>(_vnode);
3970	return vnode->mount->volume;
3971}
3972
3973
3974extern "C" status_t
3975check_access_permissions(int accessMode, mode_t mode, gid_t nodeGroupID,
3976	uid_t nodeUserID)
3977{
3978	// get node permissions
3979	int userPermissions = (mode & S_IRWXU) >> 6;
3980	int groupPermissions = (mode & S_IRWXG) >> 3;
3981	int otherPermissions = mode & S_IRWXO;
3982
3983	// get the node permissions for this uid/gid
3984	int permissions = 0;
3985	uid_t uid = geteuid();
3986
3987	if (uid == 0) {
3988		// user is root
3989		// root has always read/write permission, but at least one of the
3990		// X bits must be set for execute permission
3991		permissions = userPermissions | groupPermissions | otherPermissions
3992			| S_IROTH | S_IWOTH;
3993		if (S_ISDIR(mode))
3994			permissions |= S_IXOTH;
3995	} else if (uid == nodeUserID) {
3996		// user is node owner
3997		permissions = userPermissions;
3998	} else if (is_user_in_group(nodeGroupID)) {
3999		// user is in owning group
4000		permissions = groupPermissions;
4001	} else {
4002		// user is one of the others
4003		permissions = otherPermissions;
4004	}
4005
4006	return (accessMode & ~permissions) == 0 ? B_OK : B_PERMISSION_DENIED;
4007}
4008
4009
4010#if 0
4011extern "C" status_t
4012read_pages(int fd, off_t pos, const iovec* vecs, size_t count,
4013	size_t* _numBytes)
4014{
4015	struct file_descriptor* descriptor;
4016	struct vnode* vnode;
4017
4018	descriptor = get_fd_and_vnode(fd, &vnode, true);
4019	if (descriptor == NULL)
4020		return B_FILE_ERROR;
4021
4022	status_t status = vfs_read_pages(vnode, descriptor->cookie, pos, vecs,
4023		count, 0, _numBytes);
4024
4025	put_fd(descriptor);
4026	return status;
4027}
4028
4029
4030extern "C" status_t
4031write_pages(int fd, off_t pos, const iovec* vecs, size_t count,
4032	size_t* _numBytes)
4033{
4034	struct file_descriptor* descriptor;
4035	struct vnode* vnode;
4036
4037	descriptor = get_fd_and_vnode(fd, &vnode, true);
4038	if (descriptor == NULL)
4039		return B_FILE_ERROR;
4040
4041	status_t status = vfs_write_pages(vnode, descriptor->cookie, pos, vecs,
4042		count, 0, _numBytes);
4043
4044	put_fd(descriptor);
4045	return status;
4046}
4047#endif
4048
4049
4050extern "C" status_t
4051read_file_io_vec_pages(int fd, const file_io_vec* fileVecs, size_t fileVecCount,
4052	const iovec* vecs, size_t vecCount, uint32* _vecIndex, size_t* _vecOffset,
4053	size_t* _bytes)
4054{
4055	struct vnode* vnode;
4056	FileDescriptorPutter descriptor(get_fd_and_vnode(fd, &vnode, true));
4057	if (!descriptor.IsSet())
4058		return B_FILE_ERROR;
4059
4060	status_t status = common_file_io_vec_pages(vnode, descriptor->cookie,
4061		fileVecs, fileVecCount, vecs, vecCount, _vecIndex, _vecOffset, _bytes,
4062		false);
4063
4064	return status;
4065}
4066
4067
4068extern "C" status_t
4069write_file_io_vec_pages(int fd, const file_io_vec* fileVecs, size_t fileVecCount,
4070	const iovec* vecs, size_t vecCount, uint32* _vecIndex, size_t* _vecOffset,
4071	size_t* _bytes)
4072{
4073	struct vnode* vnode;
4074	FileDescriptorPutter descriptor(get_fd_and_vnode(fd, &vnode, true));
4075	if (!descriptor.IsSet())
4076		return B_FILE_ERROR;
4077
4078	status_t status = common_file_io_vec_pages(vnode, descriptor->cookie,
4079		fileVecs, fileVecCount, vecs, vecCount, _vecIndex, _vecOffset, _bytes,
4080		true);
4081
4082	return status;
4083}
4084
4085
4086extern "C" status_t
4087entry_cache_add(dev_t mountID, ino_t dirID, const char* name, ino_t nodeID)
4088{
4089	// lookup mount -- the caller is required to make sure that the mount
4090	// won't go away
4091	ReadLocker locker(sMountLock);
4092	struct fs_mount* mount = find_mount(mountID);
4093	if (mount == NULL)
4094		return B_BAD_VALUE;
4095	locker.Unlock();
4096
4097	return mount->entry_cache.Add(dirID, name, nodeID, false);
4098}
4099
4100
4101extern "C" status_t
4102entry_cache_add_missing(dev_t mountID, ino_t dirID, const char* name)
4103{
4104	// lookup mount -- the caller is required to make sure that the mount
4105	// won't go away
4106	ReadLocker locker(sMountLock);
4107	struct fs_mount* mount = find_mount(mountID);
4108	if (mount == NULL)
4109		return B_BAD_VALUE;
4110	locker.Unlock();
4111
4112	return mount->entry_cache.Add(dirID, name, -1, true);
4113}
4114
4115
4116extern "C" status_t
4117entry_cache_remove(dev_t mountID, ino_t dirID, const char* name)
4118{
4119	// lookup mount -- the caller is required to make sure that the mount
4120	// won't go away
4121	ReadLocker locker(sMountLock);
4122	struct fs_mount* mount = find_mount(mountID);
4123	if (mount == NULL)
4124		return B_BAD_VALUE;
4125	locker.Unlock();
4126
4127	return mount->entry_cache.Remove(dirID, name);
4128}
4129
4130
4131//	#pragma mark - private VFS API
4132//	Functions the VFS exports for other parts of the kernel
4133
4134
/*! Acquires another reference to the vnode. The reference has to be
	released by calling vfs_put_vnode().
*/
4138void
4139vfs_acquire_vnode(struct vnode* vnode)
4140{
4141	inc_vnode_ref_count(vnode);
4142}
4143
4144
4145/*! This is currently called from file_cache_create() only.
4146	It's probably a temporary solution as long as devfs requires that
4147	fs_read_pages()/fs_write_pages() are called with the standard
4148	open cookie and not with a device cookie.
4149	If that's done differently, remove this call; it has no other
4150	purpose.
4151*/
4152extern "C" status_t
4153vfs_get_cookie_from_fd(int fd, void** _cookie)
4154{
4155	struct file_descriptor* descriptor;
4156
4157	descriptor = get_fd(get_current_io_context(true), fd);
4158	if (descriptor == NULL)
4159		return B_FILE_ERROR;
4160
4161	*_cookie = descriptor->cookie;
4162	return B_OK;
4163}
4164
4165
4166extern "C" status_t
4167vfs_get_vnode_from_fd(int fd, bool kernel, struct vnode** vnode)
4168{
4169	*vnode = get_vnode_from_fd(fd, kernel);
4170
4171	if (*vnode == NULL)
4172		return B_FILE_ERROR;
4173
	return B_OK;
4175}
4176
4177
4178extern "C" status_t
4179vfs_get_vnode_from_path(const char* path, bool kernel, struct vnode** _vnode)
4180{
4181	TRACE(("vfs_get_vnode_from_path: entry. path = '%s', kernel %d\n",
4182		path, kernel));
4183
4184	KPath pathBuffer;
4185	if (pathBuffer.InitCheck() != B_OK)
4186		return B_NO_MEMORY;
4187
4188	char* buffer = pathBuffer.LockBuffer();
4189	strlcpy(buffer, path, pathBuffer.BufferSize());
4190
4191	VnodePutter vnode;
4192	status_t status = path_to_vnode(buffer, true, vnode, NULL, kernel);
4193	if (status != B_OK)
4194		return status;
4195
4196	*_vnode = vnode.Detach();
4197	return B_OK;
4198}
4199
4200
4201extern "C" status_t
4202vfs_get_vnode(dev_t mountID, ino_t vnodeID, bool canWait, struct vnode** _vnode)
4203{
4204	struct vnode* vnode = NULL;
4205
4206	status_t status = get_vnode(mountID, vnodeID, &vnode, canWait, false);
4207	if (status != B_OK)
4208		return status;
4209
4210	*_vnode = vnode;
4211	return B_OK;
4212}
4213
4214
4215extern "C" status_t
4216vfs_entry_ref_to_vnode(dev_t mountID, ino_t directoryID,
4217	const char* name, struct vnode** _vnode)
4218{
4219	VnodePutter vnode;
	status_t status = entry_ref_to_vnode(mountID, directoryID, name, false,
		true, vnode);
4221	*_vnode = vnode.Detach();
4222	return status;
4223}
4224
4225
4226extern "C" void
4227vfs_vnode_to_node_ref(struct vnode* vnode, dev_t* _mountID, ino_t* _vnodeID)
4228{
4229	*_mountID = vnode->device;
4230	*_vnodeID = vnode->id;
4231}
4232
4233
4234/*!
4235	Helper function abstracting the process of "converting" a given
4236	vnode-pointer to a fs_vnode-pointer.
4237	Currently only used in bindfs.
4238*/
4239extern "C" fs_vnode*
4240vfs_fsnode_for_vnode(struct vnode* vnode)
4241{
4242	return vnode;
4243}
4244
4245
4246/*!
4247	Calls fs_open() on the given vnode and returns a new
4248	file descriptor for it
4249*/
4250int
4251vfs_open_vnode(struct vnode* vnode, int openMode, bool kernel)
4252{
4253	return open_vnode(vnode, openMode, kernel);
4254}
4255
4256
4257/*!	Looks up a vnode with the given mount and vnode ID.
4258	Must only be used with "in-use" vnodes as it doesn't grab a reference
4259	to the node.
	It's currently only used by file_cache_create().
4261*/
4262extern "C" status_t
4263vfs_lookup_vnode(dev_t mountID, ino_t vnodeID, struct vnode** _vnode)
4264{
4265	rw_lock_read_lock(&sVnodeLock);
4266	struct vnode* vnode = lookup_vnode(mountID, vnodeID);
4267	rw_lock_read_unlock(&sVnodeLock);
4268
4269	if (vnode == NULL)
4270		return B_ERROR;
4271
4272	*_vnode = vnode;
4273	return B_OK;
4274}
4275
4276
4277extern "C" status_t
4278vfs_get_fs_node_from_path(fs_volume* volume, const char* path,
4279	bool traverseLeafLink, bool kernel, void** _node)
4280{
4281	TRACE(("vfs_get_fs_node_from_path(volume = %p, path = \"%s\", kernel %d)\n",
4282		volume, path, kernel));
4283
4284	KPath pathBuffer;
4285	if (pathBuffer.InitCheck() != B_OK)
4286		return B_NO_MEMORY;
4287
4288	fs_mount* mount;
4289	status_t status = get_mount(volume->id, &mount);
4290	if (status != B_OK)
4291		return status;
4292
4293	char* buffer = pathBuffer.LockBuffer();
4294	strlcpy(buffer, path, pathBuffer.BufferSize());
4295
4296	VnodePutter vnode;
4297
4298	if (buffer[0] == '/')
4299		status = path_to_vnode(buffer, traverseLeafLink, vnode, NULL, kernel);
4300	else {
4301		inc_vnode_ref_count(mount->root_vnode);
4302			// vnode_path_to_vnode() releases a reference to the starting vnode
4303		status = vnode_path_to_vnode(mount->root_vnode, buffer, traverseLeafLink,
4304			kernel, vnode, NULL);
4305	}
4306
4307	put_mount(mount);
4308
4309	if (status != B_OK)
4310		return status;
4311
4312	if (vnode->device != volume->id) {
		// wrong mount ID - must not gain access to foreign file system nodes
4314		return B_BAD_VALUE;
4315	}
4316
4317	// Use get_vnode() to resolve the cookie for the right layer.
4318	status = get_vnode(volume, vnode->id, _node);
4319
4320	return status;
4321}
4322
4323
4324status_t
4325vfs_read_stat(int fd, const char* path, bool traverseLeafLink,
4326	struct stat* stat, bool kernel)
4327{
4328	status_t status;
4329
4330	if (path != NULL) {
4331		// path given: get the stat of the node referred to by (fd, path)
4332		KPath pathBuffer(path);
4333		if (pathBuffer.InitCheck() != B_OK)
4334			return B_NO_MEMORY;
4335
4336		status = common_path_read_stat(fd, pathBuffer.LockBuffer(),
4337			traverseLeafLink, stat, kernel);
4338	} else {
4339		// no path given: get the FD and use the FD operation
4340		FileDescriptorPutter descriptor
4341			(get_fd(get_current_io_context(kernel), fd));
4342		if (!descriptor.IsSet())
4343			return B_FILE_ERROR;
4344
4345		if (descriptor->ops->fd_read_stat)
4346			status = descriptor->ops->fd_read_stat(descriptor.Get(), stat);
4347		else
4348			status = B_UNSUPPORTED;
4349	}
4350
4351	return status;
4352}
4353
4354
4355/*!	Finds the full path to the file that contains the module \a moduleName,
4356	puts it into \a pathBuffer, and returns B_OK for success.
	If \a pathBuffer was too small, it returns \c B_BUFFER_OVERFLOW, and
	\c B_ENTRY_NOT_FOUND if no file could be found.
	\a pathBuffer is clobbered in any case and must not be relied on if this
	function returns unsuccessfully.
4361	\a basePath and \a pathBuffer must not point to the same space.
4362*/
4363status_t
4364vfs_get_module_path(const char* basePath, const char* moduleName,
4365	char* pathBuffer, size_t bufferSize)
4366{
4367	status_t status;
4368	size_t length;
4369	char* path;
4370
4371	if (bufferSize == 0
4372		|| strlcpy(pathBuffer, basePath, bufferSize) >= bufferSize)
4373		return B_BUFFER_OVERFLOW;
4374
4375	VnodePutter dir;
4376	status = path_to_vnode(pathBuffer, true, dir, NULL, true);
4377	if (status != B_OK)
4378		return status;
4379
4380	// the path buffer had been clobbered by the above call
4381	length = strlcpy(pathBuffer, basePath, bufferSize);
4382	if (pathBuffer[length - 1] != '/')
4383		pathBuffer[length++] = '/';
4384
4385	path = pathBuffer + length;
4386	bufferSize -= length;
4387
4388	VnodePutter file;
4389	while (moduleName) {
4390		char* nextPath = strchr(moduleName, '/');
4391		if (nextPath == NULL)
4392			length = strlen(moduleName);
4393		else {
4394			length = nextPath - moduleName;
4395			nextPath++;
4396		}
4397
4398		if (length + 1 >= bufferSize)
4399			return B_BUFFER_OVERFLOW;
4400
4401		memcpy(path, moduleName, length);
4402		path[length] = '\0';
4403		moduleName = nextPath;
4404
4405		// vnode_path_to_vnode() assumes ownership of the passed dir
4406		status = vnode_path_to_vnode(dir.Detach(), path, true, true, file, NULL);
4407		if (status != B_OK)
4408			return status;
4409
4410		if (S_ISDIR(file->Type())) {
			// go to the next directory
4412			path[length] = '/';
4413			path[length + 1] = '\0';
4414			path += length + 1;
4415			bufferSize -= length + 1;
4416
4417			dir.SetTo(file.Detach());
4418		} else if (S_ISREG(file->Type())) {
			// it's a file, so it should be what we've been looking for
4420			return B_OK;
4421		} else {
4422			TRACE(("vfs_get_module_path(): something is strange here: "
4423				"0x%08" B_PRIx32 "...\n", file->Type()));
4424			return B_ERROR;
4425		}
4426	}
4427
4428	// if we got here, the moduleName just pointed to a directory, not to
4429	// a real module - what should we do in this case?
4430	return B_ENTRY_NOT_FOUND;
4431}
4432
4433
4434/*!	\brief Normalizes a given path.
4435
	The path must refer to an existing or non-existing entry in an existing
	directory; that is, after chopping off the leaf component, the remaining
	path must refer to an existing directory.

	The returned path will be canonical in that it will be absolute, will not
	contain any "." or ".." components or duplicate occurrences of '/'s,
	and none of the directory components will be symbolic links.

	Any two paths referring to the same entry will result in the same
	normalized path (well, that is pretty much the definition of `normalized',
	isn't it :-).
4447
4448	\param path The path to be normalized.
4449	\param buffer The buffer into which the normalized path will be written.
4450		   May be the same one as \a path.
4451	\param bufferSize The size of \a buffer.
4452	\param traverseLink If \c true, the function also resolves leaf symlinks.
4453	\param kernel \c true, if the IO context of the kernel shall be used,
4454		   otherwise that of the team this thread belongs to. Only relevant,
4455		   if the path is relative (to get the CWD).
4456	\return \c B_OK if everything went fine, another error code otherwise.
4457*/
4458status_t
4459vfs_normalize_path(const char* path, char* buffer, size_t bufferSize,
4460	bool traverseLink, bool kernel)
4461{
4462	if (!path || !buffer || bufferSize < 1)
4463		return B_BAD_VALUE;
4464
4465	if (path != buffer) {
4466		if (strlcpy(buffer, path, bufferSize) >= bufferSize)
4467			return B_BUFFER_OVERFLOW;
4468	}
4469
4470	return normalize_path(buffer, bufferSize, traverseLink, kernel);
4471}
4472
4473
4474/*!	\brief Gets the parent of the passed in node.
4475
4476	Gets the parent of the passed in node, and correctly resolves covered
4477	nodes.
4478*/
4479extern "C" status_t
4480vfs_resolve_parent(struct vnode* parent, dev_t* device, ino_t* node)
4481{
4482	return resolve_covered_parent(parent, device, node,
4483		get_current_io_context(true));
4484}
4485
4486
4487/*!	\brief Creates a special node in the file system.
4488
4489	The caller gets a reference to the newly created node (which is passed
4490	back through \a _createdVnode) and is responsible for releasing it.
4491
4492	\param path The path where to create the entry for the node. Can be \c NULL,
4493		in which case the node is created without an entry in the root FS -- it
4494		will automatically be deleted when the last reference has been released.
4495	\param subVnode The definition of the subnode. Can be \c NULL, in which case
4496		the target file system will just create the node with its standard
4497		operations. Depending on the type of the node a subnode might be created
4498		automatically, though.
4499	\param mode The type and permissions for the node to be created.
4500	\param flags Flags to be passed to the creating FS.
4501	\param kernel \c true, if called in the kernel context (relevant only if
4502		\a path is not \c NULL and not absolute).
4503	\param _superVnode Pointer to a pre-allocated structure to be filled by the
4504		file system creating the node, with the private data pointer and
4505		operations for the super node. Can be \c NULL.
	\param _createdVnode Pointer to pre-allocated storage in which to store
		the pointer to the newly created node.
4508	\return \c B_OK, if everything went fine, another error code otherwise.
4509*/
4510status_t
4511vfs_create_special_node(const char* path, fs_vnode* subVnode, mode_t mode,
4512	uint32 flags, bool kernel, fs_vnode* _superVnode,
4513	struct vnode** _createdVnode)
4514{
4515	VnodePutter dirNode;
4516	char _leaf[B_FILE_NAME_LENGTH];
4517	char* leaf = NULL;
4518
4519	if (path) {
4520		// We've got a path. Get the dir vnode and the leaf name.
4521		KPath tmpPathBuffer;
4522		if (tmpPathBuffer.InitCheck() != B_OK)
4523			return B_NO_MEMORY;
4524
4525		char* tmpPath = tmpPathBuffer.LockBuffer();
4526		if (strlcpy(tmpPath, path, B_PATH_NAME_LENGTH) >= B_PATH_NAME_LENGTH)
4527			return B_NAME_TOO_LONG;
4528
4529		// get the dir vnode and the leaf name
4530		leaf = _leaf;
4531		status_t error = path_to_dir_vnode(tmpPath, dirNode, leaf, kernel);
4532		if (error != B_OK)
4533			return error;
4534	} else {
4535		// No path. Create the node in the root FS.
4536		dirNode.SetTo(sRoot);
4537		inc_vnode_ref_count(dirNode.Get());
4538	}
4539
4540	// check support for creating special nodes
4541	if (!HAS_FS_CALL(dirNode, create_special_node))
4542		return B_UNSUPPORTED;
4543
4544	// create the node
4545	fs_vnode superVnode;
4546	ino_t nodeID;
4547	status_t status = FS_CALL(dirNode.Get(), create_special_node, leaf, subVnode,
4548		mode, flags, _superVnode != NULL ? _superVnode : &superVnode, &nodeID);
4549	if (status != B_OK)
4550		return status;
4551
4552	// lookup the node
4553	rw_lock_read_lock(&sVnodeLock);
4554	*_createdVnode = lookup_vnode(dirNode->mount->id, nodeID);
4555	rw_lock_read_unlock(&sVnodeLock);
4556
4557	if (*_createdVnode == NULL) {
4558		panic("vfs_create_special_node(): lookup of node failed");
4559		return B_ERROR;
4560	}
4561
4562	return B_OK;
4563}
4564
4565
4566extern "C" void
4567vfs_put_vnode(struct vnode* vnode)
4568{
4569	put_vnode(vnode);
4570}
4571
4572
4573extern "C" status_t
4574vfs_get_cwd(dev_t* _mountID, ino_t* _vnodeID)
4575{
4576	// Get current working directory from io context
4577	struct io_context* context = get_current_io_context(false);
4578	status_t status = B_OK;
4579
4580	mutex_lock(&context->io_mutex);
4581
4582	if (context->cwd != NULL) {
4583		*_mountID = context->cwd->device;
4584		*_vnodeID = context->cwd->id;
4585	} else
4586		status = B_ERROR;
4587
4588	mutex_unlock(&context->io_mutex);
4589	return status;
4590}
4591
4592
4593status_t
4594vfs_unmount(dev_t mountID, uint32 flags)
4595{
4596	return fs_unmount(NULL, mountID, flags, true);
4597}
4598
4599
4600extern "C" status_t
4601vfs_disconnect_vnode(dev_t mountID, ino_t vnodeID)
4602{
4603	struct vnode* vnode;
4604
4605	status_t status = get_vnode(mountID, vnodeID, &vnode, true, true);
4606	if (status != B_OK)
4607		return status;
4608
4609	disconnect_mount_or_vnode_fds(vnode->mount, vnode);
4610	put_vnode(vnode);
4611	return B_OK;
4612}
4613
4614
4615extern "C" void
4616vfs_free_unused_vnodes(int32 level)
4617{
4618	vnode_low_resource_handler(NULL,
4619		B_KERNEL_RESOURCE_PAGES | B_KERNEL_RESOURCE_MEMORY
4620			| B_KERNEL_RESOURCE_ADDRESS_SPACE,
4621		level);
4622}
4623
4624
4625extern "C" bool
4626vfs_can_page(struct vnode* vnode, void* cookie)
4627{
	FUNCTION(("vfs_can_page: vnode %p\n", vnode));
4629
4630	if (HAS_FS_CALL(vnode, can_page))
4631		return FS_CALL(vnode, can_page, cookie);
4632	return false;
4633}
4634
4635
4636extern "C" status_t
4637vfs_read_pages(struct vnode* vnode, void* cookie, off_t pos,
4638	const generic_io_vec* vecs, size_t count, uint32 flags,
4639	generic_size_t* _numBytes)
4640{
4641	FUNCTION(("vfs_read_pages: vnode %p, vecs %p, pos %" B_PRIdOFF "\n", vnode,
4642		vecs, pos));
4643
4644#if VFS_PAGES_IO_TRACING
4645	generic_size_t bytesRequested = *_numBytes;
4646#endif
4647
4648	IORequest request;
4649	status_t status = request.Init(pos, vecs, count, *_numBytes, false, flags);
4650	if (status == B_OK) {
4651		status = vfs_vnode_io(vnode, cookie, &request);
4652		if (status == B_OK)
4653			status = request.Wait();
4654		*_numBytes = request.TransferredBytes();
4655	}
4656
4657	TPIO(ReadPages(vnode, cookie, pos, vecs, count, flags, bytesRequested,
4658		status, *_numBytes));
4659
4660	return status;
4661}
4662
4663
4664extern "C" status_t
4665vfs_write_pages(struct vnode* vnode, void* cookie, off_t pos,
4666	const generic_io_vec* vecs, size_t count, uint32 flags,
4667	generic_size_t* _numBytes)
4668{
4669	FUNCTION(("vfs_write_pages: vnode %p, vecs %p, pos %" B_PRIdOFF "\n", vnode,
4670		vecs, pos));
4671
4672#if VFS_PAGES_IO_TRACING
4673	generic_size_t bytesRequested = *_numBytes;
4674#endif
4675
4676	IORequest request;
4677	status_t status = request.Init(pos, vecs, count, *_numBytes, true, flags);
4678	if (status == B_OK) {
4679		status = vfs_vnode_io(vnode, cookie, &request);
4680		if (status == B_OK)
4681			status = request.Wait();
4682		*_numBytes = request.TransferredBytes();
4683	}
4684
4685	TPIO(WritePages(vnode, cookie, pos, vecs, count, flags, bytesRequested,
4686		status, *_numBytes));
4687
4688	return status;
4689}
4690
4691
4692/*!	Gets the vnode's VMCache object. If it didn't have one, it will be
4693	created if \a allocate is \c true.
4694	In case it's successful, it will also grab a reference to the cache
4695	it returns.
4696*/
4697extern "C" status_t
4698vfs_get_vnode_cache(struct vnode* vnode, VMCache** _cache, bool allocate)
4699{
4700	if (vnode->cache != NULL) {
4701		vnode->cache->AcquireRef();
4702		*_cache = vnode->cache;
4703		return B_OK;
4704	}
4705
4706	rw_lock_read_lock(&sVnodeLock);
4707	vnode->Lock();
4708
4709	status_t status = B_OK;
4710
4711	// The cache could have been created in the meantime
4712	if (vnode->cache == NULL) {
4713		if (allocate) {
4714			// TODO: actually the vnode needs to be busy already here, or
4715			//	else this won't work...
4716			bool wasBusy = vnode->IsBusy();
4717			vnode->SetBusy(true);
4718
4719			vnode->Unlock();
4720			rw_lock_read_unlock(&sVnodeLock);
4721
4722			status = vm_create_vnode_cache(vnode, &vnode->cache);
4723
4724			rw_lock_read_lock(&sVnodeLock);
4725			vnode->Lock();
4726			vnode->SetBusy(wasBusy);
4727		} else
4728			status = B_BAD_VALUE;
4729	}
4730
4731	vnode->Unlock();
4732	rw_lock_read_unlock(&sVnodeLock);
4733
4734	if (status == B_OK) {
4735		vnode->cache->AcquireRef();
4736		*_cache = vnode->cache;
4737	}
4738
4739	return status;
4740}
4741
4742
4743/*!	Sets the vnode's VMCache object, for subsystems that want to manage
4744	their own.
4745	In case it's successful, it will also grab a reference to the cache
4746	it returns.
4747*/
4748extern "C" status_t
4749vfs_set_vnode_cache(struct vnode* vnode, VMCache* _cache)
4750{
4751	rw_lock_read_lock(&sVnodeLock);
4752	vnode->Lock();
4753
4754	status_t status = B_OK;
4755	if (vnode->cache != NULL) {
4756		status = B_NOT_ALLOWED;
4757	} else {
4758		vnode->cache = _cache;
4759		_cache->AcquireRef();
4760	}
4761
4762	vnode->Unlock();
4763	rw_lock_read_unlock(&sVnodeLock);
4764	return status;
4765}
4766
4767
4768status_t
4769vfs_get_file_map(struct vnode* vnode, off_t offset, size_t size,
4770	file_io_vec* vecs, size_t* _count)
4771{
4772	FUNCTION(("vfs_get_file_map: vnode %p, vecs %p, offset %" B_PRIdOFF
4773		", size = %" B_PRIuSIZE "\n", vnode, vecs, offset, size));
4774
4775	return FS_CALL(vnode, get_file_map, offset, size, vecs, _count);
4776}
4777
4778
4779status_t
4780vfs_stat_vnode(struct vnode* vnode, struct stat* stat)
4781{
4782	status_t status = FS_CALL(vnode, read_stat, stat);
4783
4784	// fill in the st_dev and st_ino fields
4785	if (status == B_OK) {
4786		stat->st_dev = vnode->device;
4787		stat->st_ino = vnode->id;
4788		// the rdev field must stay unset for non-special files
4789		if (!S_ISBLK(stat->st_mode) && !S_ISCHR(stat->st_mode))
4790			stat->st_rdev = -1;
4791	}
4792
4793	return status;
4794}
4795
4796
4797status_t
4798vfs_stat_node_ref(dev_t device, ino_t inode, struct stat* stat)
4799{
4800	struct vnode* vnode;
4801	status_t status = get_vnode(device, inode, &vnode, true, false);
4802	if (status != B_OK)
4803		return status;
4804
4805	status = vfs_stat_vnode(vnode, stat);
4806
4807	put_vnode(vnode);
4808	return status;
4809}
4810
4811
4812status_t
4813vfs_get_vnode_name(struct vnode* vnode, char* name, size_t nameSize)
4814{
4815	return get_vnode_name(vnode, NULL, name, nameSize, true);
4816}
4817
4818
4819status_t
4820vfs_entry_ref_to_path(dev_t device, ino_t inode, const char* leaf,
4821	bool kernel, char* path, size_t pathLength)
4822{
4823	VnodePutter vnode;
4824	status_t status;
4825
4826	// filter invalid leaf names
4827	if (leaf != NULL && (leaf[0] == '\0' || strchr(leaf, '/')))
4828		return B_BAD_VALUE;
4829
4830	// get the vnode matching the dir's node_ref
4831	if (leaf && (strcmp(leaf, ".") == 0 || strcmp(leaf, "..") == 0)) {
4832		// special cases "." and "..": we can directly get the vnode of the
4833		// referenced directory
4834		status = entry_ref_to_vnode(device, inode, leaf, false, kernel, vnode);
4835		leaf = NULL;
4836	} else {
4837		struct vnode* temp = NULL;
4838		status = get_vnode(device, inode, &temp, true, false);
4839		vnode.SetTo(temp);
4840	}
4841	if (status != B_OK)
4842		return status;
4843
4844	// get the directory path
4845	status = dir_vnode_to_path(vnode.Get(), path, pathLength, kernel);
4846	vnode.Unset();
4847		// we don't need the vnode anymore
4848	if (status != B_OK)
4849		return status;
4850
4851	// append the leaf name
4852	if (leaf) {
4853		// insert a directory separator if this is not the file system root
4854		if ((strcmp(path, "/") && strlcat(path, "/", pathLength)
4855				>= pathLength)
4856			|| strlcat(path, leaf, pathLength) >= pathLength) {
4857			return B_NAME_TOO_LONG;
4858		}
4859	}
4860
4861	return B_OK;
4862}
4863
4864
4865/*!	If the given descriptor locked its vnode, that lock will be released. */
4866void
4867vfs_unlock_vnode_if_locked(struct file_descriptor* descriptor)
4868{
4869	struct vnode* vnode = fd_vnode(descriptor);
4870
4871	if (vnode != NULL && vnode->mandatory_locked_by == descriptor)
4872		vnode->mandatory_locked_by = NULL;
4873}
4874
4875
4876/*!	Releases any POSIX locks on the file descriptor. */
4877status_t
4878vfs_release_posix_lock(io_context* context, struct file_descriptor* descriptor)
4879{
4880	struct vnode* vnode = descriptor->u.vnode;
4881	if (vnode == NULL)
4882		return B_OK;
4883
4884	if (HAS_FS_CALL(vnode, release_lock))
4885		return FS_CALL(vnode, release_lock, descriptor->cookie, NULL);
4886
4887	return release_advisory_lock(vnode, context, NULL, NULL);
4888}
4889
4890
4891/*!	Closes all file descriptors of the specified I/O context that
4892	have the O_CLOEXEC flag set.
4893*/
4894void
4895vfs_exec_io_context(io_context* context)
4896{
	for (uint32 i = 0; i < context->table_size; i++) {
4900		mutex_lock(&context->io_mutex);
4901
4902		struct file_descriptor* descriptor = context->fds[i];
4903		bool remove = false;
4904
4905		if (descriptor != NULL && fd_close_on_exec(context, i)) {
4906			context->fds[i] = NULL;
4907			context->num_used_fds--;
4908
4909			remove = true;
4910		}
4911
4912		mutex_unlock(&context->io_mutex);
4913
4914		if (remove) {
4915			close_fd(context, descriptor);
4916			put_fd(descriptor);
4917		}
4918	}
4919}
4920
4921
/*! Sets up a new io_context structure, and inherits the properties
	of the parent io_context if it is given.
*/
4925io_context*
4926vfs_new_io_context(io_context* parentContext, bool purgeCloseOnExec)
4927{
4928	io_context* context = (io_context*)malloc(sizeof(io_context));
4929	if (context == NULL)
4930		return NULL;
4931
4932	TIOC(NewIOContext(context, parentContext));
4933
4934	memset(context, 0, sizeof(io_context));
4935	context->ref_count = 1;
4936
4937	MutexLocker parentLocker;
4938
4939	size_t tableSize;
4940	if (parentContext != NULL) {
4941		parentLocker.SetTo(parentContext->io_mutex, false);
4942		tableSize = parentContext->table_size;
4943	} else
4944		tableSize = DEFAULT_FD_TABLE_SIZE;
4945
4946	// allocate space for FDs and their close-on-exec flag
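	// The three tables live in a single allocation:
	//	[ file_descriptor* x tableSize | select_info* x tableSize
	//	| close-on-exec bitmap, (tableSize + 7) / 8 bytes ]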
4947	context->fds = (file_descriptor**)malloc(
4948		sizeof(struct file_descriptor*) * tableSize
4949		+ sizeof(struct select_info**) * tableSize
4950		+ (tableSize + 7) / 8);
4951	if (context->fds == NULL) {
4952		free(context);
4953		return NULL;
4954	}
4955
4956	context->select_infos = (select_info**)(context->fds + tableSize);
4957	context->fds_close_on_exec = (uint8*)(context->select_infos + tableSize);
4958
4959	memset(context->fds, 0, sizeof(struct file_descriptor*) * tableSize
4960		+ sizeof(struct select_info**) * tableSize
4961		+ (tableSize + 7) / 8);
4962
4963	mutex_init(&context->io_mutex, "I/O context");
4964
4965	// Copy all parent file descriptors
4966
4967	if (parentContext != NULL) {
4968		size_t i;
4969
4970		mutex_lock(&sIOContextRootLock);
4971		context->root = parentContext->root;
4972		if (context->root)
4973			inc_vnode_ref_count(context->root);
4974		mutex_unlock(&sIOContextRootLock);
4975
4976		context->cwd = parentContext->cwd;
4977		if (context->cwd)
4978			inc_vnode_ref_count(context->cwd);
4979
4980		if (parentContext->inherit_fds) {
4981			for (i = 0; i < tableSize; i++) {
4982				struct file_descriptor* descriptor = parentContext->fds[i];
4983
4984				if (descriptor != NULL
4985					&& (descriptor->open_mode & O_DISCONNECTED) == 0) {
4986					bool closeOnExec = fd_close_on_exec(parentContext, i);
4987					if (closeOnExec && purgeCloseOnExec)
4988						continue;
4989
4990					TFD(InheritFD(context, i, descriptor, parentContext));
4991
4992					context->fds[i] = descriptor;
4993					context->num_used_fds++;
4994					atomic_add(&descriptor->ref_count, 1);
4995					atomic_add(&descriptor->open_count, 1);
4996
4997					if (closeOnExec)
4998						fd_set_close_on_exec(context, i, true);
4999				}
5000			}
5001		}
5002
5003		parentLocker.Unlock();
5004	} else {
5005		context->root = sRoot;
5006		context->cwd = sRoot;
5007
5008		if (context->root)
5009			inc_vnode_ref_count(context->root);
5010
5011		if (context->cwd)
5012			inc_vnode_ref_count(context->cwd);
5013	}
5014
5015	context->table_size = tableSize;
5016	context->inherit_fds = parentContext != NULL;
5017
5018	list_init(&context->node_monitors);
5019	context->max_monitors = DEFAULT_NODE_MONITORS;
5020
5021	return context;
5022}
5023
5024
5025void
5026vfs_get_io_context(io_context* context)
5027{
5028	atomic_add(&context->ref_count, 1);
5029}
5030
5031
5032void
5033vfs_put_io_context(io_context* context)
5034{
5035	if (atomic_add(&context->ref_count, -1) == 1)
5036		free_io_context(context);
5037}
5038
5039
5040status_t
5041vfs_resize_fd_table(struct io_context* context, uint32 newSize)
5042{
5043	if (newSize == 0 || newSize > MAX_FD_TABLE_SIZE)
5044		return B_BAD_VALUE;
5045
5046	TIOC(ResizeIOContext(context, newSize));
5047
5048	MutexLocker _(context->io_mutex);
5049
5050	uint32 oldSize = context->table_size;
	int oldCloseOnExecBitmapSize = (oldSize + 7) / 8;
	int newCloseOnExecBitmapSize = (newSize + 7) / 8;
5053
5054	// If the tables shrink, make sure none of the fds being dropped are in use.
5055	if (newSize < oldSize) {
5056		for (uint32 i = oldSize; i-- > newSize;) {
5057			if (context->fds[i])
5058				return B_BUSY;
5059		}
5060	}
5061
5062	// store pointers to the old tables
5063	file_descriptor** oldFDs = context->fds;
5064	select_info** oldSelectInfos = context->select_infos;
5065	uint8* oldCloseOnExecTable = context->fds_close_on_exec;
5066
5067	// allocate new tables
5068	file_descriptor** newFDs = (file_descriptor**)malloc(
5069		sizeof(struct file_descriptor*) * newSize
		+ sizeof(struct select_info**) * newSize
		+ newCloseOnExecBitmapSize);
5072	if (newFDs == NULL)
5073		return B_NO_MEMORY;
5074
5075	context->fds = newFDs;
5076	context->select_infos = (select_info**)(context->fds + newSize);
5077	context->fds_close_on_exec = (uint8*)(context->select_infos + newSize);
5078	context->table_size = newSize;
5079
5080	// copy entries from old tables
5081	uint32 toCopy = min_c(oldSize, newSize);
5082
5083	memcpy(context->fds, oldFDs, sizeof(void*) * toCopy);
5084	memcpy(context->select_infos, oldSelectInfos, sizeof(void*) * toCopy);
5085	memcpy(context->fds_close_on_exec, oldCloseOnExecTable,
		min_c(oldCloseOnExecBitmapSize, newCloseOnExecBitmapSize));
5087
5088	// clear additional entries, if the tables grow
5089	if (newSize > oldSize) {
5090		memset(context->fds + oldSize, 0, sizeof(void*) * (newSize - oldSize));
5091		memset(context->select_infos + oldSize, 0,
5092			sizeof(void*) * (newSize - oldSize));
		memset(context->fds_close_on_exec + oldCloseOnExecBitmapSize, 0,
			newCloseOnExecBitmapSize - oldCloseOnExecBitmapSize);
5095	}
5096
5097	free(oldFDs);
5098
5099	return B_OK;
5100}
5101
5102
5103/*!	\brief Resolves a vnode to the vnode it is covered by, if any.
5104
5105	Given an arbitrary vnode (identified by mount and node ID), the function
5106	checks, whether the vnode is covered by another vnode. If it is, the
5107	function returns the mount and node ID of the covering vnode. Otherwise
5108	it simply returns the supplied mount and node ID.
5109
5110	In case of error (e.g. the supplied node could not be found) the variables
5111	for storing the resolved mount and node ID remain untouched and an error
5112	code is returned.
5113
5114	\param mountID The mount ID of the vnode in question.
5115	\param nodeID The node ID of the vnode in question.
5116	\param resolvedMountID Pointer to storage for the resolved mount ID.
5117	\param resolvedNodeID Pointer to storage for the resolved node ID.
5118	\return
5119	- \c B_OK, if everything went fine,
5120	- another error code, if something went wrong.
5121*/
5122status_t
5123vfs_resolve_vnode_to_covering_vnode(dev_t mountID, ino_t nodeID,
5124	dev_t* resolvedMountID, ino_t* resolvedNodeID)
5125{
5126	// get the node
5127	struct vnode* node;
5128	status_t error = get_vnode(mountID, nodeID, &node, true, false);
5129	if (error != B_OK)
5130		return error;
5131
5132	// resolve the node
5133	if (Vnode* coveringNode = get_covering_vnode(node)) {
5134		put_vnode(node);
5135		node = coveringNode;
5136	}
5137
5138	// set the return values
5139	*resolvedMountID = node->device;
5140	*resolvedNodeID = node->id;
5141
5142	put_vnode(node);
5143
5144	return B_OK;
5145}
5146
5147
5148status_t
5149vfs_get_mount_point(dev_t mountID, dev_t* _mountPointMountID,
5150	ino_t* _mountPointNodeID)
5151{
5152	ReadLocker nodeLocker(sVnodeLock);
5153	ReadLocker mountLocker(sMountLock);
5154
5155	struct fs_mount* mount = find_mount(mountID);
5156	if (mount == NULL)
5157		return B_BAD_VALUE;
5158
5159	Vnode* mountPoint = mount->covers_vnode;
5160
5161	*_mountPointMountID = mountPoint->device;
5162	*_mountPointNodeID = mountPoint->id;
5163
5164	return B_OK;
5165}
5166
5167
5168status_t
5169vfs_bind_mount_directory(dev_t mountID, ino_t nodeID, dev_t coveredMountID,
5170	ino_t coveredNodeID)
5171{
5172	// get the vnodes
5173	Vnode* vnode;
5174	status_t error = get_vnode(mountID, nodeID, &vnode, true, false);
5175	if (error != B_OK)
5176		return B_BAD_VALUE;
5177	VnodePutter vnodePutter(vnode);
5178
5179	Vnode* coveredVnode;
5180	error = get_vnode(coveredMountID, coveredNodeID, &coveredVnode, true,
5181		false);
5182	if (error != B_OK)
5183		return B_BAD_VALUE;
5184	VnodePutter coveredVnodePutter(coveredVnode);
5185
5186	// establish the covered/covering links
5187	WriteLocker locker(sVnodeLock);
5188
5189	if (vnode->covers != NULL || coveredVnode->covered_by != NULL
5190		|| vnode->mount->unmounting || coveredVnode->mount->unmounting) {
5191		return B_BUSY;
5192	}
5193
5194	vnode->covers = coveredVnode;
5195	vnode->SetCovering(true);
5196
5197	coveredVnode->covered_by = vnode;
5198	coveredVnode->SetCovered(true);
5199
	// the vnodes now reference each other
5201	inc_vnode_ref_count(vnode);
5202	inc_vnode_ref_count(coveredVnode);
5203
5204	return B_OK;
5205}
5206
5207
5208int
5209vfs_getrlimit(int resource, struct rlimit* rlp)
5210{
5211	if (!rlp)
5212		return B_BAD_ADDRESS;
5213
5214	switch (resource) {
5215		case RLIMIT_NOFILE:
5216		{
5217			struct io_context* context = get_current_io_context(false);
5218			MutexLocker _(context->io_mutex);
5219
5220			rlp->rlim_cur = context->table_size;
5221			rlp->rlim_max = MAX_FD_TABLE_SIZE;
5222			return 0;
5223		}
5224
5225		case RLIMIT_NOVMON:
5226		{
5227			struct io_context* context = get_current_io_context(false);
5228			MutexLocker _(context->io_mutex);
5229
5230			rlp->rlim_cur = context->max_monitors;
5231			rlp->rlim_max = MAX_NODE_MONITORS;
5232			return 0;
5233		}
5234
5235		default:
5236			return B_BAD_VALUE;
5237	}
5238}
5239
5240
5241int
5242vfs_setrlimit(int resource, const struct rlimit* rlp)
5243{
5244	if (!rlp)
5245		return B_BAD_ADDRESS;
5246
5247	switch (resource) {
5248		case RLIMIT_NOFILE:
5249			/* TODO: check getuid() */
5250			if (rlp->rlim_max != RLIM_SAVED_MAX
5251				&& rlp->rlim_max != MAX_FD_TABLE_SIZE)
5252				return B_NOT_ALLOWED;
5253
5254			return vfs_resize_fd_table(get_current_io_context(false),
5255				rlp->rlim_cur);
5256
5257		case RLIMIT_NOVMON:
5258			/* TODO: check getuid() */
5259			if (rlp->rlim_max != RLIM_SAVED_MAX
5260				&& rlp->rlim_max != MAX_NODE_MONITORS)
5261				return B_NOT_ALLOWED;
5262
5263			return resize_monitor_table(get_current_io_context(false),
5264				rlp->rlim_cur);
5265
5266		default:
5267			return B_BAD_VALUE;
5268	}
5269}
5270
5271
5272status_t
5273vfs_init(kernel_args* args)
5274{
5275	vnode::StaticInit();
5276
5277	sVnodeTable = new(std::nothrow) VnodeTable();
5278	if (sVnodeTable == NULL || sVnodeTable->Init(VNODE_HASH_TABLE_SIZE) != B_OK)
5279		panic("vfs_init: error creating vnode hash table\n");
5280
5281	struct vnode dummy_vnode;
5282	list_init_etc(&sUnusedVnodeList, offset_of_member(dummy_vnode, unused_link));
5283
5284	struct fs_mount dummyMount;
5285	sMountsTable = new(std::nothrow) MountTable();
5286	if (sMountsTable == NULL
5287			|| sMountsTable->Init(MOUNTS_HASH_TABLE_SIZE) != B_OK)
5288		panic("vfs_init: error creating mounts hash table\n");
5289
5290	sPathNameCache = create_object_cache("vfs path names",
5291		B_PATH_NAME_LENGTH + 1, 8, NULL, NULL, NULL);
5292	if (sPathNameCache == NULL)
5293		panic("vfs_init: error creating path name object_cache\n");
5294
5295	sVnodeCache = create_object_cache("vfs vnodes",
5296		sizeof(struct vnode), 8, NULL, NULL, NULL);
5297	if (sVnodeCache == NULL)
5298		panic("vfs_init: error creating vnode object_cache\n");
5299
5300	sFileDescriptorCache = create_object_cache("vfs fds",
5301		sizeof(file_descriptor), 8, NULL, NULL, NULL);
5302	if (sFileDescriptorCache == NULL)
5303		panic("vfs_init: error creating file descriptor object_cache\n");
5304
5305	node_monitor_init();
5306
5307	sRoot = NULL;
5308
5309	recursive_lock_init(&sMountOpLock, "vfs_mount_op_lock");
5310
5311	if (block_cache_init() != B_OK)
5312		return B_ERROR;
5313
5314#ifdef ADD_DEBUGGER_COMMANDS
5315	// add some debugger commands
	add_debugger_command_etc("vnode", &dump_vnode,
		"Print info about the specified vnode",
		"[ \"-p\" ] ( <vnode> | <devID> <nodeID> )\n"
		"Prints information about the vnode specified by address <vnode> or\n"
		"<devID>, <nodeID> pair. If \"-p\" is given, a path of the vnode is\n"
		"constructed and printed. It might not be possible to construct a\n"
		"complete path, though.\n",
		0);
5324	add_debugger_command("vnodes", &dump_vnodes,
5325		"list all vnodes (from the specified device)");
5326	add_debugger_command("vnode_caches", &dump_vnode_caches,
5327		"list all vnode caches");
5328	add_debugger_command("mount", &dump_mount,
5329		"info about the specified fs_mount");
5330	add_debugger_command("mounts", &dump_mounts, "list all fs_mounts");
5331	add_debugger_command("io_context", &dump_io_context,
5332		"info about the I/O context");
5333	add_debugger_command("vnode_usage", &dump_vnode_usage,
5334		"info about vnode usage");
5335#endif
5336
5337	register_low_resource_handler(&vnode_low_resource_handler, NULL,
5338		B_KERNEL_RESOURCE_PAGES | B_KERNEL_RESOURCE_MEMORY
5339			| B_KERNEL_RESOURCE_ADDRESS_SPACE,
5340		0);
5341
5342	fifo_init();
5343	file_map_init();
5344
5345	return file_cache_init();
5346}
5347
5348
5349//	#pragma mark - fd_ops implementations
5350
5351
5352/*!
5353	Calls fs_open() on the given vnode and returns a new
5354	file descriptor for it
5355*/
5356static int
5357open_vnode(struct vnode* vnode, int openMode, bool kernel)
5358{
5359	void* cookie;
5360	status_t status = FS_CALL(vnode, open, openMode, &cookie);
5361	if (status != B_OK)
5362		return status;
5363
5364	int fd = get_new_fd(FDTYPE_FILE, NULL, vnode, cookie, openMode, kernel);
5365	if (fd < 0) {
5366		FS_CALL(vnode, close, cookie);
5367		FS_CALL(vnode, free_cookie, cookie);
5368	}
5369	return fd;
5370}
5371
5372
5373/*!
5374	Calls fs_open() on the given vnode and returns a new
5375	file descriptor for it
5376*/
5377static int
5378create_vnode(struct vnode* directory, const char* name, int openMode,
5379	int perms, bool kernel)
5380{
5381	bool traverse = ((openMode & (O_NOTRAVERSE | O_NOFOLLOW)) == 0);
5382	status_t status = B_ERROR;
5383	VnodePutter vnode, dirPutter;
5384	void* cookie;
5385	ino_t newID;
5386	char clonedName[B_FILE_NAME_LENGTH + 1];
5387
5388	// This is somewhat tricky: If the entry already exists, the FS responsible
5389	// for the directory might not necessarily also be the one responsible for
5390	// the node the entry refers to (e.g. in case of mount points or FIFOs). So
5391	// we can actually never call the create() hook without O_EXCL. Instead we
5392	// try to look the entry up first. If it already exists, we just open the
5393	// node (unless O_EXCL), otherwise we call create() with O_EXCL. This
5394	// introduces a race condition, since someone else might have created the
	// entry in the meantime. We hope the respective FS returns the correct
	// error code, in which case we retry (up to three times).
5397
5398	for (int i = 0; i < 3 && status != B_OK; i++) {
5399		bool create = false;
5400
5401		// look the node up
5402		{
5403			struct vnode* entry = NULL;
5404			status = lookup_dir_entry(directory, name, &entry);
5405			vnode.SetTo(entry);
5406		}
5407		if (status == B_OK) {
5408			if ((openMode & O_EXCL) != 0)
5409				return B_FILE_EXISTS;
5410
5411			// If the node is a symlink, we have to follow it, unless
5412			// O_NOTRAVERSE is set.
5413			if (S_ISLNK(vnode->Type()) && traverse) {
5414				vnode.Unset();
5415				if (strlcpy(clonedName, name, B_FILE_NAME_LENGTH)
5416						>= B_FILE_NAME_LENGTH) {
5417					return B_NAME_TOO_LONG;
5418				}
5419
5420				inc_vnode_ref_count(directory);
5421				dirPutter.Unset();
5422				status = vnode_path_to_vnode(directory, clonedName, true,
5423					kernel, vnode, NULL, clonedName);
5424				if (status != B_OK) {
					// The vnode was not found, but maybe it has a parent and
					// we can create it there. In that case,
					// vnode_path_to_vnode() has set vnode to the last
					// directory found in the path.
5428					if (status == B_ENTRY_NOT_FOUND) {
5429						directory = vnode.Detach();
5430						dirPutter.SetTo(directory);
5431						name = clonedName;
5432						create = true;
5433					} else
5434						return status;
5435				}
5436			}
5437
5438			if (!create) {
5439				if ((openMode & O_NOFOLLOW) != 0 && S_ISLNK(vnode->Type()))
5440					return B_LINK_LIMIT;
5441
5442				int fd = open_vnode(vnode.Get(), openMode & ~O_CREAT, kernel);
5443				// on success keep the vnode reference for the FD
5444				if (fd >= 0)
5445					vnode.Detach();
5446
5447				return fd;
5448			}
5449		}
5450
5451		// it doesn't exist yet -- try to create it
5452
5453		if (!HAS_FS_CALL(directory, create))
5454			return B_READ_ONLY_DEVICE;
5455
5456		status = FS_CALL(directory, create, name, openMode | O_EXCL, perms,
5457			&cookie, &newID);
5458		if (status != B_OK
5459			&& ((openMode & O_EXCL) != 0 || status != B_FILE_EXISTS)) {
5460			return status;
5461		}
5462	}
5463
5464	if (status != B_OK)
5465		return status;
5466
5467	// the node has been created successfully
5468
5469	rw_lock_read_lock(&sVnodeLock);
5470	vnode.SetTo(lookup_vnode(directory->device, newID));
5471	rw_lock_read_unlock(&sVnodeLock);
5472
5473	if (!vnode.IsSet()) {
5474		panic("vfs: fs_create() returned success but there is no vnode, "
5475			"mount ID %" B_PRIdDEV "!\n", directory->device);
5476		return B_BAD_VALUE;
5477	}
5478
	int fd = get_new_fd(FDTYPE_FILE, NULL, vnode.Get(), cookie, openMode,
		kernel);
5480	if (fd >= 0) {
5481		vnode.Detach();
5482		return fd;
5483	}
5484
5485	status = fd;
5486
5487	// something went wrong, clean up
5488
5489	FS_CALL(vnode.Get(), close, cookie);
5490	FS_CALL(vnode.Get(), free_cookie, cookie);
5491
5492	FS_CALL(directory, unlink, name);
5493
5494	return status;
5495}
5496
5497
5498/*! Calls fs open_dir() on the given vnode and returns a new
5499	file descriptor for it
5500*/
5501static int
5502open_dir_vnode(struct vnode* vnode, bool kernel)
5503{
5504	if (!HAS_FS_CALL(vnode, open_dir))
5505		return B_UNSUPPORTED;
5506
5507	void* cookie;
5508	status_t status = FS_CALL(vnode, open_dir, &cookie);
5509	if (status != B_OK)
5510		return status;
5511
5512	// directory is opened, create a fd
5513	status = get_new_fd(FDTYPE_DIR, NULL, vnode, cookie, O_CLOEXEC, kernel);
5514	if (status >= 0)
5515		return status;
5516
5517	FS_CALL(vnode, close_dir, cookie);
5518	FS_CALL(vnode, free_dir_cookie, cookie);
5519
5520	return status;
5521}
5522
5523
5524/*! Calls fs open_attr_dir() on the given vnode and returns a new
5525	file descriptor for it.
	Used by attr_dir_open() and attr_dir_open_fd().
5527*/
5528static int
5529open_attr_dir_vnode(struct vnode* vnode, bool kernel)
5530{
5531	if (!HAS_FS_CALL(vnode, open_attr_dir))
5532		return B_UNSUPPORTED;
5533
5534	void* cookie;
5535	status_t status = FS_CALL(vnode, open_attr_dir, &cookie);
5536	if (status != B_OK)
5537		return status;
5538
5539	// directory is opened, create a fd
5540	status = get_new_fd(FDTYPE_ATTR_DIR, NULL, vnode, cookie, O_CLOEXEC,
5541		kernel);
5542	if (status >= 0)
5543		return status;
5544
5545	FS_CALL(vnode, close_attr_dir, cookie);
5546	FS_CALL(vnode, free_attr_dir_cookie, cookie);
5547
5548	return status;
5549}
5550
5551
5552static int
5553file_create_entry_ref(dev_t mountID, ino_t directoryID, const char* name,
5554	int openMode, int perms, bool kernel)
5555{
5556	FUNCTION(("file_create_entry_ref: name = '%s', omode %x, perms %d, "
5557		"kernel %d\n", name, openMode, perms, kernel));
5558
5559	// get directory to put the new file in
5560	struct vnode* directory;
5561	status_t status = get_vnode(mountID, directoryID, &directory, true, false);
5562	if (status != B_OK)
5563		return status;
5564
5565	status = create_vnode(directory, name, openMode, perms, kernel);
5566	put_vnode(directory);
5567
5568	return status;
5569}
5570
5571
5572static int
5573file_create(int fd, char* path, int openMode, int perms, bool kernel)
5574{
5575	FUNCTION(("file_create: path '%s', omode %x, perms %d, kernel %d\n", path,
5576		openMode, perms, kernel));
5577
5578	// get directory to put the new file in
5579	char name[B_FILE_NAME_LENGTH];
5580	VnodePutter directory;
5581	status_t status = fd_and_path_to_dir_vnode(fd, path, directory, name,
5582		kernel);
5583	if (status < 0)
5584		return status;
5585
5586	return create_vnode(directory.Get(), name, openMode, perms, kernel);
5587}
5588
5589
5590static int
5591file_open_entry_ref(dev_t mountID, ino_t directoryID, const char* name,
5592	int openMode, bool kernel)
5593{
5594	if (name == NULL || *name == '\0')
5595		return B_BAD_VALUE;
5596
5597	FUNCTION(("file_open_entry_ref(ref = (%" B_PRId32 ", %" B_PRId64 ", %s), "
5598		"openMode = %d)\n", mountID, directoryID, name, openMode));
5599
5600	bool traverse = (openMode & (O_NOTRAVERSE | O_NOFOLLOW)) == 0;
5601
5602	// get the vnode matching the entry_ref
5603	VnodePutter vnode;
5604	status_t status = entry_ref_to_vnode(mountID, directoryID, name, traverse,
5605		kernel, vnode);
5606	if (status != B_OK)
5607		return status;
5608
5609	if ((openMode & O_NOFOLLOW) != 0 && S_ISLNK(vnode->Type()))
5610		return B_LINK_LIMIT;
5611
5612	int newFD = open_vnode(vnode.Get(), openMode, kernel);
5613	if (newFD >= 0) {
5614		cache_node_opened(vnode.Get(), FDTYPE_FILE, vnode->cache, mountID,
5615			directoryID, vnode->id, name);
5616
5617		// The vnode reference has been transferred to the FD
5618		vnode.Detach();
5619	}
5620
5621	return newFD;
5622}
5623
5624
5625static int
5626file_open(int fd, char* path, int openMode, bool kernel)
5627{
5628	bool traverse = (openMode & (O_NOTRAVERSE | O_NOFOLLOW)) == 0;
5629
5630	FUNCTION(("file_open: fd: %d, entry path = '%s', omode %d, kernel %d\n",
5631		fd, path, openMode, kernel));
5632
5633	// get the vnode matching the vnode + path combination
5634	VnodePutter vnode;
5635	ino_t parentID;
5636	status_t status = fd_and_path_to_vnode(fd, path, traverse, vnode,
5637		&parentID, kernel);
5638	if (status != B_OK)
5639		return status;
5640
5641	if ((openMode & O_NOFOLLOW) != 0 && S_ISLNK(vnode->Type()))
5642		return B_LINK_LIMIT;
5643
5644	// open the vnode
5645	int newFD = open_vnode(vnode.Get(), openMode, kernel);
5646	if (newFD >= 0) {
5647		cache_node_opened(vnode.Get(), FDTYPE_FILE, vnode->cache,
5648			vnode->device, parentID, vnode->id, NULL);
5649
5650		// The vnode reference has been transferred to the FD
5651		vnode.Detach();
5652	}
5653
5654	return newFD;
5655}
5656
5657
5658static status_t
5659file_close(struct file_descriptor* descriptor)
5660{
5661	struct vnode* vnode = descriptor->u.vnode;
5662	status_t status = B_OK;
5663
5664	FUNCTION(("file_close(descriptor = %p)\n", descriptor));
5665
5666	cache_node_closed(vnode, FDTYPE_FILE, vnode->cache, vnode->device,
5667		vnode->id);
5668	if (HAS_FS_CALL(vnode, close)) {
5669		status = FS_CALL(vnode, close, descriptor->cookie);
5670	}
5671
5672	if (status == B_OK) {
5673		// remove all outstanding locks for this team
5674		if (HAS_FS_CALL(vnode, release_lock))
5675			status = FS_CALL(vnode, release_lock, descriptor->cookie, NULL);
5676		else
5677			status = release_advisory_lock(vnode, NULL, descriptor, NULL);
5678	}
5679	return status;
5680}
5681
5682
5683static void
5684file_free_fd(struct file_descriptor* descriptor)
5685{
5686	struct vnode* vnode = descriptor->u.vnode;
5687
5688	if (vnode != NULL) {
5689		FS_CALL(vnode, free_cookie, descriptor->cookie);
5690		put_vnode(vnode);
5691	}
5692}
5693
5694
5695static status_t
5696file_read(struct file_descriptor* descriptor, off_t pos, void* buffer,
5697	size_t* length)
5698{
5699	struct vnode* vnode = descriptor->u.vnode;
	FUNCTION(("file_read: buf %p, pos %" B_PRIdOFF ", len %p = %" B_PRIuSIZE
		"\n", buffer, pos, length, *length));
5702
5703	if (S_ISDIR(vnode->Type()))
5704		return B_IS_A_DIRECTORY;
5705	if (pos != -1 && descriptor->pos == -1)
5706		return ESPIPE;
5707
5708	return FS_CALL(vnode, read, descriptor->cookie, pos, buffer, length);
5709}
5710
5711
5712static status_t
5713file_write(struct file_descriptor* descriptor, off_t pos, const void* buffer,
5714	size_t* length)
5715{
5716	struct vnode* vnode = descriptor->u.vnode;
5717	FUNCTION(("file_write: buf %p, pos %" B_PRIdOFF ", len %p\n", buffer, pos,
5718		length));
5719
5720	if (S_ISDIR(vnode->Type()))
5721		return B_IS_A_DIRECTORY;
5722	if (pos != -1 && descriptor->pos == -1)
5723		return ESPIPE;
5724
5725	if (!HAS_FS_CALL(vnode, write))
5726		return B_READ_ONLY_DEVICE;
5727
5728	return FS_CALL(vnode, write, descriptor->cookie, pos, buffer, length);
5729}
5730
5731
static off_t
file_seek(struct file_descriptor* descriptor, off_t pos, int seekType)
{
	struct vnode* vnode = descriptor->u.vnode;
	off_t offset;
	bool isDevice = false;

	FUNCTION(("file_seek(pos = %" B_PRIdOFF ", seekType = %d)\n", pos,
		seekType));

	if (descriptor->pos == -1)
		return ESPIPE;

	switch (vnode->Type() & S_IFMT) {
		// drivers publish block devices as character devices, so check both
		case S_IFBLK:
		case S_IFCHR:
			isDevice = true;
			break;
	}

	switch (seekType) {
		case SEEK_SET:
			offset = 0;
			break;
		case SEEK_CUR:
			offset = descriptor->pos;
			break;
		case SEEK_END:
		{
			// stat() the node
			if (!HAS_FS_CALL(vnode, read_stat))
				return B_UNSUPPORTED;

			struct stat stat;
			status_t status = FS_CALL(vnode, read_stat, &stat);
			if (status != B_OK)
				return status;

			offset = stat.st_size;

			if (offset == 0 && isDevice) {
				// stat() on drivers usually doesn't report a size,
				// so compute it from the medium's geometry
				device_geometry geometry;

				if (HAS_FS_CALL(vnode, ioctl)) {
					status = FS_CALL(vnode, ioctl, descriptor->cookie,
						B_GET_GEOMETRY, &geometry, sizeof(geometry));
					if (status == B_OK)
						offset = (off_t)geometry.bytes_per_sector
							* geometry.sectors_per_track
							* geometry.cylinder_count
							* geometry.head_count;
				}
			}

			break;
		}
		case SEEK_DATA:
		case SEEK_HOLE:
		{
			status_t status = B_BAD_VALUE;
			if (HAS_FS_CALL(vnode, ioctl)) {
				offset = pos;
				status = FS_CALL(vnode, ioctl, descriptor->cookie,
					seekType == SEEK_DATA ? FIOSEEKDATA : FIOSEEKHOLE,
					&offset, sizeof(offset));
				if (status == B_OK) {
					// the ioctl returns an absolute position; make it
					// relative, since "pos += offset" follows below
					if (offset > pos)
						offset -= pos;
					else
						offset = 0;
					break;
				}
			}
			if (status != B_BAD_VALUE && status != B_DEV_INVALID_IOCTL)
				return status;

			// fall back to a basic implementation using stat()
			if (!HAS_FS_CALL(vnode, read_stat) || isDevice)
				return B_BAD_VALUE;

			struct stat stat;
			status = FS_CALL(vnode, read_stat, &stat);
			if (status != B_OK)
				return status;

			off_t end = stat.st_size;
			if (pos >= end)
				return ENXIO;
			offset = seekType == SEEK_HOLE ? end - pos : 0;
			break;
		}
		default:
			return B_BAD_VALUE;
	}

	// assumes off_t is 64 bits wide
	if (offset > 0 && LONGLONG_MAX - offset < pos)
		return B_BUFFER_OVERFLOW;

	pos += offset;
	if (pos < 0)
		return B_BAD_VALUE;

	return descriptor->pos = pos;
}


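/*!	Forwards a select() request to the file system. If there is no
	select() hook, non-output events are reported as ready right away,
	and output-only events are accepted silently, so callers do not
	block on a file system that cannot deliver notifications.
*/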
static status_t
file_select(struct file_descriptor* descriptor, uint8 event,
	struct selectsync* sync)
{
	FUNCTION(("file_select(%p, %u, %p)\n", descriptor, event, sync));

	struct vnode* vnode = descriptor->u.vnode;

	// If the FS has no select() hook, notify select() now.
	if (!HAS_FS_CALL(vnode, select)) {
		if (!SELECT_TYPE_IS_OUTPUT_ONLY(event))
			return notify_select_event(sync, event);
		else
			return B_OK;
	}

	return FS_CALL(vnode, select, descriptor->cookie, event, sync);
}


static status_t
file_deselect(struct file_descriptor* descriptor, uint8 event,
	struct selectsync* sync)
{
	struct vnode* vnode = descriptor->u.vnode;

	if (!HAS_FS_CALL(vnode, deselect))
		return B_OK;

	return FS_CALL(vnode, deselect, descriptor->cookie, event, sync);
}


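/*!	Creates a directory named \a name in the directory referred to by
	the (\a mountID, \a parentID) entry ref, using the file system's
	create_dir() hook. File systems without that hook are treated as
	read-only. The path-based variant, dir_create(), follows below.
*/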
static status_t
dir_create_entry_ref(dev_t mountID, ino_t parentID, const char* name, int perms,
	bool kernel)
{
	struct vnode* vnode;
	status_t status;

	if (name == NULL || *name == '\0')
		return B_BAD_VALUE;

	FUNCTION(("dir_create_entry_ref(dev = %" B_PRId32 ", ino = %" B_PRId64 ", "
		"name = '%s', perms = %d)\n", mountID, parentID, name, perms));

	status = get_vnode(mountID, parentID, &vnode, true, false);
	if (status != B_OK)
		return status;

	if (HAS_FS_CALL(vnode, create_dir))
		status = FS_CALL(vnode, create_dir, name, perms);
	else
		status = B_READ_ONLY_DEVICE;

	put_vnode(vnode);
	return status;
}


static status_t
dir_create(int fd, char* path, int perms, bool kernel)
{
	char filename[B_FILE_NAME_LENGTH];
	status_t status;

	FUNCTION(("dir_create: path '%s', perms %d, kernel %d\n", path, perms,
		kernel));

	VnodePutter vnode;
	status = fd_and_path_to_dir_vnode(fd, path, vnode, filename, kernel);
	if (status != B_OK)
		return status;

	if (HAS_FS_CALL(vnode, create_dir))
		status = FS_CALL(vnode.Get(), create_dir, filename, perms);
	else
		status = B_READ_ONLY_DEVICE;

	return status;
}


static int
dir_open_entry_ref(dev_t mountID, ino_t parentID, const char* name, bool kernel)
{
	FUNCTION(("dir_open_entry_ref()\n"));

	if (name && name[0] == '\0')
		return B_BAD_VALUE;

	// get the vnode matching the entry_ref/node_ref
	VnodePutter vnode;
	status_t status;
	if (name) {
		status = entry_ref_to_vnode(mountID, parentID, name, true, kernel,
			vnode);
	} else {
		struct vnode* temp = NULL;
		status = get_vnode(mountID, parentID, &temp, true, false);
		vnode.SetTo(temp);
	}
	if (status != B_OK)
		return status;

	int newFD = open_dir_vnode(vnode.Get(), kernel);
	if (newFD >= 0) {
		cache_node_opened(vnode.Get(), FDTYPE_DIR, vnode->cache, mountID,
			parentID, vnode->id, name);

		// The vnode reference has been transferred to the FD
		vnode.Detach();
	}

	return newFD;
}


static int
dir_open(int fd, char* path, bool kernel)
{
	FUNCTION(("dir_open: fd: %d, entry path = '%s', kernel %d\n", fd, path,
		kernel));

	// get the vnode matching the vnode + path combination
	VnodePutter vnode;
	ino_t parentID;
	status_t status = fd_and_path_to_vnode(fd, path, true, vnode, &parentID,
		kernel);
	if (status != B_OK)
		return status;

	// open the dir
	int newFD = open_dir_vnode(vnode.Get(), kernel);
	if (newFD >= 0) {
		cache_node_opened(vnode.Get(), FDTYPE_DIR, vnode->cache, vnode->device,
			parentID, vnode->id, NULL);

		// The vnode reference has been transferred to the FD
		vnode.Detach();
	}

	return newFD;
}


static status_t
dir_close(struct file_descriptor* descriptor)
{
	struct vnode* vnode = descriptor->u.vnode;

	FUNCTION(("dir_close(descriptor = %p)\n", descriptor));

	cache_node_closed(vnode, FDTYPE_DIR, vnode->cache, vnode->device,
		vnode->id);
	if (HAS_FS_CALL(vnode, close_dir))
		return FS_CALL(vnode, close_dir, descriptor->cookie);

	return B_OK;
}


static void
dir_free_fd(struct file_descriptor* descriptor)
{
	struct vnode* vnode = descriptor->u.vnode;

	if (vnode != NULL) {
		FS_CALL(vnode, free_dir_cookie, descriptor->cookie);
		put_vnode(vnode);
	}
}


static status_t
dir_read(struct io_context* ioContext, struct file_descriptor* descriptor,
	struct dirent* buffer, size_t bufferSize, uint32* _count)
{
	return dir_read(ioContext, descriptor->u.vnode, descriptor->cookie, buffer,
		bufferSize, _count);
}


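/*!	Post-processes a dirent returned by a file system so that mount
	points appear transparent to userland: it fills in the parent's
	device and inode (d_pdev/d_pino), resolves the ".." entry of a
	covering directory to the real parent beyond the mount point, and
	rewrites d_dev/d_ino of entries covered by another vnode to the
	topmost covering vnode.
*/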
static status_t
fix_dirent(struct vnode* parent, struct dirent* entry,
	struct io_context* ioContext)
{
	// set d_pdev and d_pino
	entry->d_pdev = parent->device;
	entry->d_pino = parent->id;

	// If this is the ".." entry and the directory is covering another vnode,
	// we need to replace d_dev and d_ino with the actual values.
	if (strcmp(entry->d_name, "..") == 0 && parent->IsCovering()) {
		return resolve_covered_parent(parent, &entry->d_dev, &entry->d_ino,
			ioContext);
	}

	// resolve covered vnodes
	ReadLocker _(&sVnodeLock);

	struct vnode* vnode = lookup_vnode(entry->d_dev, entry->d_ino);
	if (vnode != NULL && vnode->covered_by != NULL) {
		do {
			vnode = vnode->covered_by;
		} while (vnode->covered_by != NULL);

		entry->d_dev = vnode->device;
		entry->d_ino = vnode->id;
	}

	return B_OK;
}


static status_t
dir_read(struct io_context* ioContext, struct vnode* vnode, void* cookie,
	struct dirent* buffer, size_t bufferSize, uint32* _count)
{
	if (!HAS_FS_CALL(vnode, read_dir))
		return B_UNSUPPORTED;

	status_t error = FS_CALL(vnode, read_dir, cookie, buffer, bufferSize,
		_count);
	if (error != B_OK)
		return error;

	// adjust the dirents we have just read
	uint32 count = *_count;
	for (uint32 i = 0; i < count; i++) {
		error = fix_dirent(vnode, buffer, ioContext);
		if (error != B_OK)
			return error;

		buffer = (struct dirent*)((uint8*)buffer + buffer->d_reclen);
	}

	return error;
}


static status_t
dir_rewind(struct file_descriptor* descriptor)
{
	struct vnode* vnode = descriptor->u.vnode;

	if (HAS_FS_CALL(vnode, rewind_dir)) {
		return FS_CALL(vnode, rewind_dir, descriptor->cookie);
	}

	return B_UNSUPPORTED;
}


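/*!	Removes the directory at the given FD/path pair via the file
	system's remove_dir() hook. Trailing slashes and "." components are
	stripped first ("name/" and "name/." both reduce to "name"), while
	paths whose leaf is ".." or that boil down to "." or ".." are
	rejected with B_NOT_ALLOWED.
*/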
static status_t
dir_remove(int fd, char* path, bool kernel)
{
	char name[B_FILE_NAME_LENGTH];
	status_t status;

	if (path != NULL) {
		// we need to make sure our path name doesn't end in "/", ".",
		// or ".."
		char* lastSlash;
		while ((lastSlash = strrchr(path, '/')) != NULL) {
			char* leaf = lastSlash + 1;
			if (!strcmp(leaf, ".."))
				return B_NOT_ALLOWED;

			// omit multiple slashes
			while (lastSlash > path && lastSlash[-1] == '/')
				lastSlash--;

			if (leaf[0] != '\0' && strcmp(leaf, ".") != 0)
				break;

			// "name/" -> "name", or "name/." -> "name"
			lastSlash[0] = '\0';
		}

		if (!strcmp(path, ".") || !strcmp(path, ".."))
			return B_NOT_ALLOWED;
	}

	VnodePutter directory;
	status = fd_and_path_to_dir_vnode(fd, path, directory, name, kernel);
	if (status != B_OK)
		return status;

	if (HAS_FS_CALL(directory, remove_dir))
		status = FS_CALL(directory.Get(), remove_dir, name);
	else
		status = B_READ_ONLY_DEVICE;

	return status;
}


static status_t
common_ioctl(struct file_descriptor* descriptor, ulong op, void* buffer,
	size_t length)
{
	struct vnode* vnode = descriptor->u.vnode;

	if (HAS_FS_CALL(vnode, ioctl))
		return FS_CALL(vnode, ioctl, descriptor->cookie, op, buffer, length);

	return B_DEV_INVALID_IOCTL;
}


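/*!	Implements fcntl(): descriptor flags (F_SETFD/F_GETFD), the open
	mode (F_SETFL/F_GETFL, where only O_APPEND and O_NONBLOCK may be
	changed), descriptor duplication (F_DUPFD/F_DUPFD_CLOEXEC), and
	advisory record locks (F_GETLK/F_SETLK/F_SETLKW). For the lock ops
	the flock struct is copied in up front, since it may reside in
	userland.

	An illustrative userland sketch of the locking path:

		struct flock lock = {};
		lock.l_type = F_WRLCK;		// requires a writable descriptor
		lock.l_whence = SEEK_SET;
		lock.l_start = 0;
		lock.l_len = 0;				// 0 extends the lock to EOF
		fcntl(fd, F_SETLKW, &lock);	// F_SETLKW waits for the lock
*/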
static status_t
common_fcntl(int fd, int op, size_t argument, bool kernel)
{
	struct flock flock;

	FUNCTION(("common_fcntl(fd = %d, op = %d, argument = %lx, %s)\n",
		fd, op, argument, kernel ? "kernel" : "user"));

	struct io_context* context = get_current_io_context(kernel);

	FileDescriptorPutter descriptor(get_fd(context, fd));
	if (!descriptor.IsSet())
		return B_FILE_ERROR;

	struct vnode* vnode = fd_vnode(descriptor.Get());

	status_t status = B_OK;

	if (op == F_SETLK || op == F_SETLKW || op == F_GETLK) {
		if (descriptor->type != FDTYPE_FILE)
			status = B_BAD_VALUE;
		else if (kernel)
			memcpy(&flock, (struct flock*)argument, sizeof(struct flock));
		else if (user_memcpy(&flock, (struct flock*)argument,
				sizeof(struct flock)) != B_OK)
			status = B_BAD_ADDRESS;
		if (status != B_OK)
			return status;
	}

	switch (op) {
		case F_SETFD:
		{
			// Set file descriptor flags

			// FD_CLOEXEC is the only flag available at this time
			mutex_lock(&context->io_mutex);
			fd_set_close_on_exec(context, fd, (argument & FD_CLOEXEC) != 0);
			mutex_unlock(&context->io_mutex);

			status = B_OK;
			break;
		}

		case F_GETFD:
		{
			// Get file descriptor flags
			mutex_lock(&context->io_mutex);
			status = fd_close_on_exec(context, fd) ? FD_CLOEXEC : 0;
			mutex_unlock(&context->io_mutex);
			break;
		}

		case F_SETFL:
			// Set file descriptor open mode

			// we only accept changes to O_APPEND and O_NONBLOCK
			argument &= O_APPEND | O_NONBLOCK;
			if (descriptor->ops->fd_set_flags != NULL) {
				status = descriptor->ops->fd_set_flags(descriptor.Get(),
					argument);
			} else if (vnode != NULL && HAS_FS_CALL(vnode, set_flags)) {
				status = FS_CALL(vnode, set_flags, descriptor->cookie,
					(int)argument);
			} else
				status = B_UNSUPPORTED;

			if (status == B_OK) {
				// update this descriptor's open_mode field
				descriptor->open_mode = (descriptor->open_mode
					& ~(O_APPEND | O_NONBLOCK)) | argument;
			}

			break;

		case F_GETFL:
			// Get file descriptor open mode
			status = descriptor->open_mode;
			break;

		case F_DUPFD:
		case F_DUPFD_CLOEXEC:
		{
			status = new_fd_etc(context, descriptor.Get(), (int)argument);
			if (status >= 0) {
				mutex_lock(&context->io_mutex);
				fd_set_close_on_exec(context, status, op == F_DUPFD_CLOEXEC);
				mutex_unlock(&context->io_mutex);

				atomic_add(&descriptor->ref_count, 1);
			}
			break;
		}

		case F_GETLK:
			if (vnode != NULL) {
				struct flock normalizedLock;

				memcpy(&normalizedLock, &flock, sizeof(struct flock));
				status = normalize_flock(descriptor.Get(), &normalizedLock);
				if (status != B_OK)
					break;

				if (HAS_FS_CALL(vnode, test_lock)) {
					status = FS_CALL(vnode, test_lock, descriptor->cookie,
						&normalizedLock);
				} else
					status = test_advisory_lock(vnode, &normalizedLock);
				if (status == B_OK) {
					if (normalizedLock.l_type == F_UNLCK) {
						// no conflicting lock found, copy back the same struct
						// we were given except change type to F_UNLCK
						flock.l_type = F_UNLCK;
						if (kernel) {
							memcpy((struct flock*)argument, &flock,
								sizeof(struct flock));
						} else {
							status = user_memcpy((struct flock*)argument,
								&flock, sizeof(struct flock));
						}
					} else {
						// a conflicting lock was found, copy back its range and
						// type
						if (normalizedLock.l_len == OFF_MAX)
							normalizedLock.l_len = 0;

						if (kernel) {
							memcpy((struct flock*)argument,
								&normalizedLock, sizeof(struct flock));
						} else {
							status = user_memcpy((struct flock*)argument,
								&normalizedLock, sizeof(struct flock));
						}
					}
				}
			} else
				status = B_BAD_VALUE;
			break;

		case F_SETLK:
		case F_SETLKW:
			status = normalize_flock(descriptor.Get(), &flock);
			if (status != B_OK)
				break;

			if (vnode == NULL) {
				status = B_BAD_VALUE;
			} else if (flock.l_type == F_UNLCK) {
				if (HAS_FS_CALL(vnode, release_lock)) {
					status = FS_CALL(vnode, release_lock, descriptor->cookie,
						&flock);
				} else {
					status = release_advisory_lock(vnode, context, NULL,
						&flock);
				}
			} else {
				// the open mode must match the lock type
				if (((descriptor->open_mode & O_RWMASK) == O_RDONLY
						&& flock.l_type == F_WRLCK)
					|| ((descriptor->open_mode & O_RWMASK) == O_WRONLY
						&& flock.l_type == F_RDLCK))
					status = B_FILE_ERROR;
				else {
					if (HAS_FS_CALL(vnode, acquire_lock)) {
						status = FS_CALL(vnode, acquire_lock,
							descriptor->cookie, &flock, op == F_SETLKW);
					} else {
						status = acquire_advisory_lock(vnode, context, NULL,
							&flock, op == F_SETLKW);
					}
				}
			}
			break;

		// ToDo: add support for more ops?

		default:
			status = B_BAD_VALUE;
	}

	return status;
}


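/*!	Flushes a file's data to disk through the file system's fsync()
	hook; file systems without the hook report B_UNSUPPORTED.
*/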
static status_t
common_sync(int fd, bool kernel)
{
	FUNCTION(("common_sync: entry. fd %d kernel %d\n", fd, kernel));

	struct vnode* vnode;
	FileDescriptorPutter descriptor(get_fd_and_vnode(fd, &vnode, kernel));
	if (!descriptor.IsSet())
		return B_FILE_ERROR;

	status_t status;
	if (HAS_FS_CALL(vnode, fsync))
		status = FS_CALL_NO_PARAMS(vnode, fsync);
	else
		status = B_UNSUPPORTED;

	return status;
}


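/*!	Marks the vnode as mandatorily locked by the given descriptor. The
	owner is set with an atomic compare-and-swap on
	vnode->mandatory_locked_by, so two racing lockers cannot both
	succeed - the loser gets B_BUSY. common_unlock_node() below clears
	the field with the inverse swap.
*/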
static status_t
common_lock_node(int fd, bool kernel)
{
	struct vnode* vnode;
	FileDescriptorPutter descriptor(get_fd_and_vnode(fd, &vnode, kernel));
	if (!descriptor.IsSet())
		return B_FILE_ERROR;

	status_t status = B_OK;

	// We need to set the locking atomically - someone
	// else might set one at the same time
	if (atomic_pointer_test_and_set(&vnode->mandatory_locked_by,
			descriptor.Get(), (file_descriptor*)NULL) != NULL)
		status = B_BUSY;

	return status;
}


static status_t
common_unlock_node(int fd, bool kernel)
{
	struct vnode* vnode;
	FileDescriptorPutter descriptor(get_fd_and_vnode(fd, &vnode, kernel));
	if (!descriptor.IsSet())
		return B_FILE_ERROR;

	status_t status = B_OK;

	// We need to clear the lock atomically - someone
	// else might change it at the same time
	if (atomic_pointer_test_and_set(&vnode->mandatory_locked_by,
			(file_descriptor*)NULL, descriptor.Get()) != descriptor.Get())
		status = B_BAD_VALUE;

	return status;
}


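/*!	Preallocates \a length bytes starting at \a offset, in the style of
	posix_fallocate(). Only regular files are supported: FIFOs and
	sockets yield ESPIPE, other special files B_DEVICE_NOT_FOUND. File
	systems without a preallocate() hook report B_UNSUPPORTED, or
	B_READ_ONLY_DEVICE if they cannot write at all.

	Illustrative userland use:

		// reserve 16 MiB up front to avoid short writes later
		posix_fallocate(fd, 0, 16 * 1024 * 1024);
*/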
static status_t
common_preallocate(int fd, off_t offset, off_t length, bool kernel)
{
	if (offset < 0 || length <= 0)
		return B_BAD_VALUE;
	if (offset > OFF_MAX - length)
		return B_FILE_TOO_LARGE;

	struct vnode* vnode;
	FileDescriptorPutter descriptor(get_fd_and_vnode(fd, &vnode, kernel));
	if (!descriptor.IsSet() || (descriptor->open_mode & O_RWMASK) == O_RDONLY)
		return B_FILE_ERROR;

	switch (vnode->Type() & S_IFMT) {
		case S_IFIFO:
		case S_IFSOCK:
			return ESPIPE;

		case S_IFBLK:
		case S_IFCHR:
		case S_IFDIR:
		case S_IFLNK:
			return B_DEVICE_NOT_FOUND;

		case S_IFREG:
			break;
	}

	status_t status = B_OK;
	if (HAS_FS_CALL(vnode, preallocate)) {
		status = FS_CALL(vnode, preallocate, offset, length);
	} else {
		status = HAS_FS_CALL(vnode, write)
			? B_UNSUPPORTED : B_READ_ONLY_DEVICE;
	}

	return status;
}


static status_t
common_read_link(int fd, char* path, char* buffer, size_t* _bufferSize,
	bool kernel)
{
	VnodePutter vnode;
	status_t status;

	status = fd_and_path_to_vnode(fd, path, false, vnode, NULL, kernel);
	if (status != B_OK)
		return status;

	if (HAS_FS_CALL(vnode, read_symlink))
		status = FS_CALL(vnode.Get(), read_symlink, buffer, _bufferSize);
	else
		status = B_BAD_VALUE;

	return status;
}


static status_t
common_create_symlink(int fd, char* path, const char* toPath, int mode,
	bool kernel)
{
	// path validity checks have to be in the calling function!
	char name[B_FILE_NAME_LENGTH];
	status_t status;

	FUNCTION(("common_create_symlink(fd = %d, path = %s, toPath = %s, "
		"mode = %d, kernel = %d)\n", fd, path, toPath, mode, kernel));

	VnodePutter vnode;
	status = fd_and_path_to_dir_vnode(fd, path, vnode, name, kernel);
	if (status != B_OK)
		return status;

	if (HAS_FS_CALL(vnode,