1/*
2 * Copyright 2005-2013, Ingo Weinhold, ingo_weinhold@gmx.de.
3 * Copyright 2002-2018, Axel D��rfler, axeld@pinc-software.de.
4 * Distributed under the terms of the MIT License.
5 *
6 * Copyright 2001-2002, Travis Geiselbrecht. All rights reserved.
7 * Distributed under the terms of the NewOS License.
8 */
9
10
11/*! Virtual File System and File System Interface Layer */
12
13
14#include <ctype.h>
15#include <fcntl.h>
16#include <limits.h>
17#include <stddef.h>
18#include <stdio.h>
19#include <string.h>
20#include <sys/file.h>
21#include <sys/ioctl.h>
22#include <sys/resource.h>
23#include <sys/stat.h>
24#include <unistd.h>
25
26#include <fs_attr.h>
27#include <fs_info.h>
28#include <fs_interface.h>
29#include <fs_volume.h>
30#include <NodeMonitor.h>
31#include <OS.h>
32#include <StorageDefs.h>
33
34#include <AutoDeleter.h>
35#include <AutoDeleterDrivers.h>
36#include <block_cache.h>
37#include <boot/kernel_args.h>
38#include <debug_heap.h>
39#include <disk_device_manager/KDiskDevice.h>
40#include <disk_device_manager/KDiskDeviceManager.h>
41#include <disk_device_manager/KDiskDeviceUtils.h>
42#include <disk_device_manager/KDiskSystem.h>
43#include <fd.h>
44#include <file_cache.h>
45#include <fs/node_monitor.h>
46#include <KPath.h>
47#include <lock.h>
48#include <low_resource_manager.h>
49#include <slab/Slab.h>
50#include <StackOrHeapArray.h>
51#include <syscalls.h>
52#include <syscall_restart.h>
53#include <tracing.h>
54#include <util/atomic.h>
55#include <util/AutoLock.h>
56#include <util/ThreadAutoLock.h>
57#include <util/DoublyLinkedList.h>
58#include <vfs.h>
59#include <vm/vm.h>
60#include <vm/VMCache.h>
61#include <wait_for_objects.h>
62
63#include "EntryCache.h"
64#include "fifo.h"
65#include "IORequest.h"
66#include "unused_vnodes.h"
67#include "vfs_tracing.h"
68#include "Vnode.h"
69#include "../cache/vnode_store.h"
70
71
72//#define TRACE_VFS
73#ifdef TRACE_VFS
74#	define TRACE(x) dprintf x
75#	define FUNCTION(x) dprintf x
76#else
77#	define TRACE(x) ;
78#	define FUNCTION(x) ;
79#endif
80
81#define ADD_DEBUGGER_COMMANDS
82
83
84#define HAS_FS_CALL(vnode, op)			(vnode->ops->op != NULL)
85#define HAS_FS_MOUNT_CALL(mount, op)	(mount->volume->ops->op != NULL)
86
87#if KDEBUG
88#	define FS_CALL(vnode, op, params...) \
89		( HAS_FS_CALL(vnode, op) ? \
90			vnode->ops->op(vnode->mount->volume, vnode, params) \
91			: (panic("FS_CALL: vnode %p op " #op " is NULL", vnode), 0))
92#	define FS_CALL_NO_PARAMS(vnode, op) \
93		( HAS_FS_CALL(vnode, op) ? \
94			vnode->ops->op(vnode->mount->volume, vnode) \
95			: (panic("FS_CALL_NO_PARAMS: vnode %p op " #op " is NULL", vnode), 0))
96#	define FS_MOUNT_CALL(mount, op, params...) \
97		( HAS_FS_MOUNT_CALL(mount, op) ? \
98			mount->volume->ops->op(mount->volume, params) \
99			: (panic("FS_MOUNT_CALL: mount %p op " #op " is NULL", mount), 0))
100#	define FS_MOUNT_CALL_NO_PARAMS(mount, op) \
101		( HAS_FS_MOUNT_CALL(mount, op) ? \
102			mount->volume->ops->op(mount->volume) \
103			: (panic("FS_MOUNT_CALL_NO_PARAMS: mount %p op " #op " is NULL", mount), 0))
104#else
105#	define FS_CALL(vnode, op, params...) \
106			vnode->ops->op(vnode->mount->volume, vnode, params)
107#	define FS_CALL_NO_PARAMS(vnode, op) \
108			vnode->ops->op(vnode->mount->volume, vnode)
109#	define FS_MOUNT_CALL(mount, op, params...) \
110			mount->volume->ops->op(mount->volume, params)
111#	define FS_MOUNT_CALL_NO_PARAMS(mount, op) \
112			mount->volume->ops->op(mount->volume)
113#endif
114
115
116const static size_t kMaxPathLength = 65536;
117	// The absolute maximum path length (for getcwd() - this is not depending
118	// on PATH_MAX
119
120
121typedef DoublyLinkedList<vnode> VnodeList;
122
123/*!	\brief Structure to manage a mounted file system
124
125	Note: The root_vnode and root_vnode->covers fields (what others?) are
126	initialized in fs_mount() and not changed afterwards. That is as soon
127	as the mount is mounted and it is made sure it won't be unmounted
128	(e.g. by holding a reference to a vnode of that mount) (read) access
129	to those fields is always safe, even without additional locking. Morever
130	while mounted the mount holds a reference to the root_vnode->covers vnode,
131	and thus making the access path vnode->mount->root_vnode->covers->mount->...
132	safe if a reference to vnode is held (note that for the root mount
133	root_vnode->covers is NULL, though).
134*/
135struct fs_mount {
136	fs_mount()
137		:
138		volume(NULL),
139		device_name(NULL)
140	{
141		mutex_init(&lock, "mount lock");
142	}
143
144	~fs_mount()
145	{
146		mutex_destroy(&lock);
147		free(device_name);
148
149		while (volume) {
150			fs_volume* superVolume = volume->super_volume;
151
152			if (volume->file_system != NULL)
153				put_module(volume->file_system->info.name);
154
155			free(volume->file_system_name);
156			free(volume);
157			volume = superVolume;
158		}
159	}
160
161	struct fs_mount* next;
162	dev_t			id;
163	fs_volume*		volume;
164	char*			device_name;
165	mutex			lock;	// guards the vnodes list
166	struct vnode*	root_vnode;
167	struct vnode*	covers_vnode;	// immutable
168	KPartition*		partition;
169	VnodeList		vnodes;
170	EntryCache		entry_cache;
171	bool			unmounting;
172	bool			owns_file_device;
173};
174
175
176namespace {
177
178struct advisory_lock : public DoublyLinkedListLinkImpl<advisory_lock> {
179	list_link		link;
180	void*			bound_to;
181	team_id			team;
182	pid_t			session;
183	off_t			start;
184	off_t			end;
185	bool			shared;
186};
187
188typedef DoublyLinkedList<advisory_lock> LockList;
189
190} // namespace
191
192
193struct advisory_locking {
194	sem_id			lock;
195	sem_id			wait_sem;
196	LockList		locks;
197
198	advisory_locking()
199		:
200		lock(-1),
201		wait_sem(-1)
202	{
203	}
204
205	~advisory_locking()
206	{
207		if (lock >= 0)
208			delete_sem(lock);
209		if (wait_sem >= 0)
210			delete_sem(wait_sem);
211	}
212};
213
214/*!	\brief Guards sMountsTable.
215
216	The holder is allowed to read/write access the sMountsTable.
217	Manipulation of the fs_mount structures themselves
218	(and their destruction) requires different locks though.
219*/
220static rw_lock sMountLock = RW_LOCK_INITIALIZER("vfs_mount_lock");
221
222/*!	\brief Guards mount/unmount operations.
223
224	The fs_mount() and fs_unmount() hold the lock during their whole operation.
225	That is locking the lock ensures that no FS is mounted/unmounted. In
226	particular this means that
227	- sMountsTable will not be modified,
228	- the fields immutable after initialization of the fs_mount structures in
229	  sMountsTable will not be modified,
230
231	The thread trying to lock the lock must not hold sVnodeLock or
232	sMountLock.
233*/
234static recursive_lock sMountOpLock;
235
236/*!	\brief Guards sVnodeTable.
237
238	The holder is allowed read/write access to sVnodeTable and to
239	any unbusy vnode in that table, save to the immutable fields (device, id,
240	private_node, mount) to which only read-only access is allowed.
241	The mutable fields advisory_locking, mandatory_locked_by, and ref_count, as
242	well as the busy, removed, unused flags, and the vnode's type can also be
243	write accessed when holding a read lock to sVnodeLock *and* having the vnode
244	locked. Write access to covered_by and covers requires to write lock
245	sVnodeLock.
246
247	The thread trying to acquire the lock must not hold sMountLock.
248	You must not hold this lock when calling create_sem(), as this might call
249	vfs_free_unused_vnodes() and thus cause a deadlock.
250*/
251static rw_lock sVnodeLock = RW_LOCK_INITIALIZER("vfs_vnode_lock");
252
253/*!	\brief Guards io_context::root.
254
255	Must be held when setting or getting the io_context::root field.
256	The only operation allowed while holding this lock besides getting or
257	setting the field is inc_vnode_ref_count() on io_context::root.
258*/
259static mutex sIOContextRootLock = MUTEX_INITIALIZER("io_context::root lock");
260
261
262namespace {
263
264struct vnode_hash_key {
265	dev_t	device;
266	ino_t	vnode;
267};
268
269struct VnodeHash {
270	typedef vnode_hash_key	KeyType;
271	typedef	struct vnode	ValueType;
272
273#define VHASH(mountid, vnodeid) \
274	(((uint32)((vnodeid) >> 32) + (uint32)(vnodeid)) ^ (uint32)(mountid))
275
276	size_t HashKey(KeyType key) const
277	{
278		return VHASH(key.device, key.vnode);
279	}
280
281	size_t Hash(ValueType* vnode) const
282	{
283		return VHASH(vnode->device, vnode->id);
284	}
285
286#undef VHASH
287
288	bool Compare(KeyType key, ValueType* vnode) const
289	{
290		return vnode->device == key.device && vnode->id == key.vnode;
291	}
292
293	ValueType*& GetLink(ValueType* value) const
294	{
295		return value->next;
296	}
297};
298
299typedef BOpenHashTable<VnodeHash> VnodeTable;
300
301
302struct MountHash {
303	typedef dev_t			KeyType;
304	typedef	struct fs_mount	ValueType;
305
306	size_t HashKey(KeyType key) const
307	{
308		return key;
309	}
310
311	size_t Hash(ValueType* mount) const
312	{
313		return mount->id;
314	}
315
316	bool Compare(KeyType key, ValueType* mount) const
317	{
318		return mount->id == key;
319	}
320
321	ValueType*& GetLink(ValueType* value) const
322	{
323		return value->next;
324	}
325};
326
327typedef BOpenHashTable<MountHash> MountTable;
328
329} // namespace
330
331
332object_cache* sPathNameCache;
333object_cache* sVnodeCache;
334object_cache* sFileDescriptorCache;
335
336#define VNODE_HASH_TABLE_SIZE 1024
337static VnodeTable* sVnodeTable;
338static struct vnode* sRoot;
339
340#define MOUNTS_HASH_TABLE_SIZE 16
341static MountTable* sMountsTable;
342static dev_t sNextMountID = 1;
343
344#define MAX_TEMP_IO_VECS 8
345
346// How long to wait for busy vnodes (10s)
347#define BUSY_VNODE_RETRIES 2000
348#define BUSY_VNODE_DELAY 5000
349
350mode_t __gUmask = 022;
351
352/* function declarations */
353
354static void free_unused_vnodes();
355
356// file descriptor operation prototypes
357static status_t file_read(struct file_descriptor* descriptor, off_t pos,
358	void* buffer, size_t* _bytes);
359static status_t file_write(struct file_descriptor* descriptor, off_t pos,
360	const void* buffer, size_t* _bytes);
361static off_t file_seek(struct file_descriptor* descriptor, off_t pos,
362	int seekType);
363static void file_free_fd(struct file_descriptor* descriptor);
364static status_t file_close(struct file_descriptor* descriptor);
365static status_t file_select(struct file_descriptor* descriptor, uint8 event,
366	struct selectsync* sync);
367static status_t file_deselect(struct file_descriptor* descriptor, uint8 event,
368	struct selectsync* sync);
369static status_t dir_read(struct io_context* context,
370	struct file_descriptor* descriptor, struct dirent* buffer,
371	size_t bufferSize, uint32* _count);
372static status_t dir_read(struct io_context* ioContext, struct vnode* vnode,
373	void* cookie, struct dirent* buffer, size_t bufferSize, uint32* _count);
374static status_t dir_rewind(struct file_descriptor* descriptor);
375static void dir_free_fd(struct file_descriptor* descriptor);
376static status_t dir_close(struct file_descriptor* descriptor);
377static status_t attr_dir_read(struct io_context* context,
378	struct file_descriptor* descriptor, struct dirent* buffer,
379	size_t bufferSize, uint32* _count);
380static status_t attr_dir_rewind(struct file_descriptor* descriptor);
381static void attr_dir_free_fd(struct file_descriptor* descriptor);
382static status_t attr_dir_close(struct file_descriptor* descriptor);
383static status_t attr_read(struct file_descriptor* descriptor, off_t pos,
384	void* buffer, size_t* _bytes);
385static status_t attr_write(struct file_descriptor* descriptor, off_t pos,
386	const void* buffer, size_t* _bytes);
387static off_t attr_seek(struct file_descriptor* descriptor, off_t pos,
388	int seekType);
389static void attr_free_fd(struct file_descriptor* descriptor);
390static status_t attr_close(struct file_descriptor* descriptor);
391static status_t attr_read_stat(struct file_descriptor* descriptor,
392	struct stat* statData);
393static status_t attr_write_stat(struct file_descriptor* descriptor,
394	const struct stat* stat, int statMask);
395static status_t index_dir_read(struct io_context* context,
396	struct file_descriptor* descriptor, struct dirent* buffer,
397	size_t bufferSize, uint32* _count);
398static status_t index_dir_rewind(struct file_descriptor* descriptor);
399static void index_dir_free_fd(struct file_descriptor* descriptor);
400static status_t index_dir_close(struct file_descriptor* descriptor);
401static status_t query_read(struct io_context* context,
402	struct file_descriptor* descriptor, struct dirent* buffer,
403	size_t bufferSize, uint32* _count);
404static status_t query_rewind(struct file_descriptor* descriptor);
405static void query_free_fd(struct file_descriptor* descriptor);
406static status_t query_close(struct file_descriptor* descriptor);
407
408static status_t common_ioctl(struct file_descriptor* descriptor, ulong op,
409	void* buffer, size_t length);
410static status_t common_read_stat(struct file_descriptor* descriptor,
411	struct stat* statData);
412static status_t common_write_stat(struct file_descriptor* descriptor,
413	const struct stat* statData, int statMask);
414static status_t common_path_read_stat(int fd, char* path, bool traverseLeafLink,
415	struct stat* stat, bool kernel);
416
417static status_t vnode_path_to_vnode(struct vnode* vnode, char* path,
418	bool traverseLeafLink, bool kernel,
419	VnodePutter& _vnode, ino_t* _parentID, char* leafName = NULL);
420static status_t dir_vnode_to_path(struct vnode* vnode, char* buffer,
421	size_t bufferSize, bool kernel);
422static status_t fd_and_path_to_vnode(int fd, char* path, bool traverseLeafLink,
423	VnodePutter& _vnode, ino_t* _parentID, bool kernel);
424static void inc_vnode_ref_count(struct vnode* vnode);
425static status_t dec_vnode_ref_count(struct vnode* vnode, bool alwaysFree,
426	bool reenter);
427static inline void put_vnode(struct vnode* vnode);
428static status_t fs_unmount(char* path, dev_t mountID, uint32 flags,
429	bool kernel);
430static int open_vnode(struct vnode* vnode, int openMode, bool kernel);
431
432
433static struct fd_ops sFileOps = {
434	file_read,
435	file_write,
436	file_seek,
437	common_ioctl,
438	NULL,		// set_flags
439	file_select,
440	file_deselect,
441	NULL,		// read_dir()
442	NULL,		// rewind_dir()
443	common_read_stat,
444	common_write_stat,
445	file_close,
446	file_free_fd
447};
448
449static struct fd_ops sDirectoryOps = {
450	NULL,		// read()
451	NULL,		// write()
452	NULL,		// seek()
453	common_ioctl,
454	NULL,		// set_flags
455	NULL,		// select()
456	NULL,		// deselect()
457	dir_read,
458	dir_rewind,
459	common_read_stat,
460	common_write_stat,
461	dir_close,
462	dir_free_fd
463};
464
465static struct fd_ops sAttributeDirectoryOps = {
466	NULL,		// read()
467	NULL,		// write()
468	NULL,		// seek()
469	common_ioctl,
470	NULL,		// set_flags
471	NULL,		// select()
472	NULL,		// deselect()
473	attr_dir_read,
474	attr_dir_rewind,
475	common_read_stat,
476	common_write_stat,
477	attr_dir_close,
478	attr_dir_free_fd
479};
480
481static struct fd_ops sAttributeOps = {
482	attr_read,
483	attr_write,
484	attr_seek,
485	common_ioctl,
486	NULL,		// set_flags
487	NULL,		// select()
488	NULL,		// deselect()
489	NULL,		// read_dir()
490	NULL,		// rewind_dir()
491	attr_read_stat,
492	attr_write_stat,
493	attr_close,
494	attr_free_fd
495};
496
497static struct fd_ops sIndexDirectoryOps = {
498	NULL,		// read()
499	NULL,		// write()
500	NULL,		// seek()
501	NULL,		// ioctl()
502	NULL,		// set_flags
503	NULL,		// select()
504	NULL,		// deselect()
505	index_dir_read,
506	index_dir_rewind,
507	NULL,		// read_stat()
508	NULL,		// write_stat()
509	index_dir_close,
510	index_dir_free_fd
511};
512
513#if 0
514static struct fd_ops sIndexOps = {
515	NULL,		// read()
516	NULL,		// write()
517	NULL,		// seek()
518	NULL,		// ioctl()
519	NULL,		// set_flags
520	NULL,		// select()
521	NULL,		// deselect()
522	NULL,		// dir_read()
523	NULL,		// dir_rewind()
524	index_read_stat,	// read_stat()
525	NULL,		// write_stat()
526	NULL,		// dir_close()
527	NULL		// free_fd()
528};
529#endif
530
531static struct fd_ops sQueryOps = {
532	NULL,		// read()
533	NULL,		// write()
534	NULL,		// seek()
535	NULL,		// ioctl()
536	NULL,		// set_flags
537	NULL,		// select()
538	NULL,		// deselect()
539	query_read,
540	query_rewind,
541	NULL,		// read_stat()
542	NULL,		// write_stat()
543	query_close,
544	query_free_fd
545};
546
547
548namespace {
549
550class FDCloser {
551public:
552	FDCloser() : fFD(-1), fKernel(true) {}
553
554	FDCloser(int fd, bool kernel) : fFD(fd), fKernel(kernel) {}
555
556	~FDCloser()
557	{
558		Close();
559	}
560
561	void SetTo(int fd, bool kernel)
562	{
563		Close();
564		fFD = fd;
565		fKernel = kernel;
566	}
567
568	void Close()
569	{
570		if (fFD >= 0) {
571			if (fKernel)
572				_kern_close(fFD);
573			else
574				_user_close(fFD);
575			fFD = -1;
576		}
577	}
578
579	int Detach()
580	{
581		int fd = fFD;
582		fFD = -1;
583		return fd;
584	}
585
586private:
587	int		fFD;
588	bool	fKernel;
589};
590
591} // namespace
592
593
594#if VFS_PAGES_IO_TRACING
595
596namespace VFSPagesIOTracing {
597
598class PagesIOTraceEntry : public AbstractTraceEntry {
599protected:
600	PagesIOTraceEntry(struct vnode* vnode, void* cookie, off_t pos,
601		const generic_io_vec* vecs, uint32 count, uint32 flags,
602		generic_size_t bytesRequested, status_t status,
603		generic_size_t bytesTransferred)
604		:
605		fVnode(vnode),
606		fMountID(vnode->mount->id),
607		fNodeID(vnode->id),
608		fCookie(cookie),
609		fPos(pos),
610		fCount(count),
611		fFlags(flags),
612		fBytesRequested(bytesRequested),
613		fStatus(status),
614		fBytesTransferred(bytesTransferred)
615	{
616		fVecs = (generic_io_vec*)alloc_tracing_buffer_memcpy(vecs,
617			sizeof(generic_io_vec) * count, false);
618	}
619
620	void AddDump(TraceOutput& out, const char* mode)
621	{
622		out.Print("vfs pages io %5s: vnode: %p (%" B_PRId32 ", %" B_PRId64 "), "
623			"cookie: %p, pos: %" B_PRIdOFF ", size: %" B_PRIu64 ", vecs: {",
624			mode, fVnode, fMountID, fNodeID, fCookie, fPos,
625			(uint64)fBytesRequested);
626
627		if (fVecs != NULL) {
628			for (uint32 i = 0; i < fCount; i++) {
629				if (i > 0)
630					out.Print(", ");
631				out.Print("(%" B_PRIx64 ", %" B_PRIu64 ")", (uint64)fVecs[i].base,
632					(uint64)fVecs[i].length);
633			}
634		}
635
636		out.Print("}, flags: %#" B_PRIx32 " -> status: %#" B_PRIx32 ", "
637			"transferred: %" B_PRIu64, fFlags, fStatus,
638			(uint64)fBytesTransferred);
639	}
640
641protected:
642	struct vnode*	fVnode;
643	dev_t			fMountID;
644	ino_t			fNodeID;
645	void*			fCookie;
646	off_t			fPos;
647	generic_io_vec*	fVecs;
648	uint32			fCount;
649	uint32			fFlags;
650	generic_size_t	fBytesRequested;
651	status_t		fStatus;
652	generic_size_t	fBytesTransferred;
653};
654
655
656class ReadPages : public PagesIOTraceEntry {
657public:
658	ReadPages(struct vnode* vnode, void* cookie, off_t pos,
659		const generic_io_vec* vecs, uint32 count, uint32 flags,
660		generic_size_t bytesRequested, status_t status,
661		generic_size_t bytesTransferred)
662		:
663		PagesIOTraceEntry(vnode, cookie, pos, vecs, count, flags,
664			bytesRequested, status, bytesTransferred)
665	{
666		Initialized();
667	}
668
669	virtual void AddDump(TraceOutput& out)
670	{
671		PagesIOTraceEntry::AddDump(out, "read");
672	}
673};
674
675
676class WritePages : public PagesIOTraceEntry {
677public:
678	WritePages(struct vnode* vnode, void* cookie, off_t pos,
679		const generic_io_vec* vecs, uint32 count, uint32 flags,
680		generic_size_t bytesRequested, status_t status,
681		generic_size_t bytesTransferred)
682		:
683		PagesIOTraceEntry(vnode, cookie, pos, vecs, count, flags,
684			bytesRequested, status, bytesTransferred)
685	{
686		Initialized();
687	}
688
689	virtual void AddDump(TraceOutput& out)
690	{
691		PagesIOTraceEntry::AddDump(out, "write");
692	}
693};
694
695}	// namespace VFSPagesIOTracing
696
697#	define TPIO(x) new(std::nothrow) VFSPagesIOTracing::x;
698#else
699#	define TPIO(x) ;
700#endif	// VFS_PAGES_IO_TRACING
701
702
703/*! Finds the mounted device (the fs_mount structure) with the given ID.
704	Note, you must hold the sMountLock lock when you call this function.
705*/
706static struct fs_mount*
707find_mount(dev_t id)
708{
709	ASSERT_READ_LOCKED_RW_LOCK(&sMountLock);
710
711	return sMountsTable->Lookup(id);
712}
713
714
715static status_t
716get_mount(dev_t id, struct fs_mount** _mount)
717{
718	struct fs_mount* mount;
719
720	ReadLocker nodeLocker(sVnodeLock);
721	ReadLocker mountLocker(sMountLock);
722
723	mount = find_mount(id);
724	if (mount == NULL)
725		return B_BAD_VALUE;
726
727	struct vnode* rootNode = mount->root_vnode;
728	if (mount->unmounting || rootNode == NULL || rootNode->IsBusy()
729		|| rootNode->ref_count == 0) {
730		// might have been called during a mount/unmount operation
731		return B_BUSY;
732	}
733
734	inc_vnode_ref_count(rootNode);
735	*_mount = mount;
736	return B_OK;
737}
738
739
740static void
741put_mount(struct fs_mount* mount)
742{
743	if (mount)
744		put_vnode(mount->root_vnode);
745}
746
747
748/*!	Tries to open the specified file system module.
749	Accepts a file system name of the form "bfs" or "file_systems/bfs/v1".
750	Returns a pointer to file system module interface, or NULL if it
751	could not open the module.
752*/
753static file_system_module_info*
754get_file_system(const char* fsName)
755{
756	char name[B_FILE_NAME_LENGTH];
757	if (strncmp(fsName, "file_systems/", strlen("file_systems/"))) {
758		// construct module name if we didn't get one
759		// (we currently support only one API)
760		snprintf(name, sizeof(name), "file_systems/%s/v1", fsName);
761		fsName = NULL;
762	}
763
764	file_system_module_info* info;
765	if (get_module(fsName ? fsName : name, (module_info**)&info) != B_OK)
766		return NULL;
767
768	return info;
769}
770
771
772/*!	Accepts a file system name of the form "bfs" or "file_systems/bfs/v1"
773	and returns a compatible fs_info.fsh_name name ("bfs" in both cases).
774	The name is allocated for you, and you have to free() it when you're
775	done with it.
776	Returns NULL if the required memory is not available.
777*/
778static char*
779get_file_system_name(const char* fsName)
780{
781	const size_t length = strlen("file_systems/");
782
783	if (strncmp(fsName, "file_systems/", length)) {
784		// the name already seems to be the module's file name
785		return strdup(fsName);
786	}
787
788	fsName += length;
789	const char* end = strchr(fsName, '/');
790	if (end == NULL) {
791		// this doesn't seem to be a valid name, but well...
792		return strdup(fsName);
793	}
794
795	// cut off the trailing /v1
796
797	char* name = (char*)malloc(end + 1 - fsName);
798	if (name == NULL)
799		return NULL;
800
801	strlcpy(name, fsName, end + 1 - fsName);
802	return name;
803}
804
805
806/*!	Accepts a list of file system names separated by a colon, one for each
807	layer and returns the file system name for the specified layer.
808	The name is allocated for you, and you have to free() it when you're
809	done with it.
810	Returns NULL if the required memory is not available or if there is no
811	name for the specified layer.
812*/
813static char*
814get_file_system_name_for_layer(const char* fsNames, int32 layer)
815{
816	while (layer >= 0) {
817		const char* end = strchr(fsNames, ':');
818		if (end == NULL) {
819			if (layer == 0)
820				return strdup(fsNames);
821			return NULL;
822		}
823
824		if (layer == 0) {
825			size_t length = end - fsNames + 1;
826			char* result = (char*)malloc(length);
827			strlcpy(result, fsNames, length);
828			return result;
829		}
830
831		fsNames = end + 1;
832		layer--;
833	}
834
835	return NULL;
836}
837
838
839static void
840add_vnode_to_mount_list(struct vnode* vnode, struct fs_mount* mount)
841{
842	MutexLocker _(mount->lock);
843	mount->vnodes.Add(vnode);
844}
845
846
847static void
848remove_vnode_from_mount_list(struct vnode* vnode, struct fs_mount* mount)
849{
850	MutexLocker _(mount->lock);
851	mount->vnodes.Remove(vnode);
852}
853
854
855/*!	\brief Looks up a vnode by mount and node ID in the sVnodeTable.
856
857	The caller must hold the sVnodeLock (read lock at least).
858
859	\param mountID the mount ID.
860	\param vnodeID the node ID.
861
862	\return The vnode structure, if it was found in the hash table, \c NULL
863			otherwise.
864*/
865static struct vnode*
866lookup_vnode(dev_t mountID, ino_t vnodeID)
867{
868	ASSERT_READ_LOCKED_RW_LOCK(&sVnodeLock);
869
870	struct vnode_hash_key key;
871
872	key.device = mountID;
873	key.vnode = vnodeID;
874
875	return sVnodeTable->Lookup(key);
876}
877
878
879/*!	\brief Checks whether or not a busy vnode should be waited for (again).
880
881	This will also wait for BUSY_VNODE_DELAY before returning if one should
882	still wait for the vnode becoming unbusy.
883
884	\return \c true if one should retry, \c false if not.
885*/
886static bool
887retry_busy_vnode(int32& tries, dev_t mountID, ino_t vnodeID)
888{
889	if (--tries < 0) {
890		// vnode doesn't seem to become unbusy
891		dprintf("vnode %" B_PRIdDEV ":%" B_PRIdINO
892			" is not becoming unbusy!\n", mountID, vnodeID);
893		return false;
894	}
895	snooze(BUSY_VNODE_DELAY);
896	return true;
897}
898
899
900/*!	Creates a new vnode with the given mount and node ID.
901	If the node already exists, it is returned instead and no new node is
902	created. In either case -- but not, if an error occurs -- the function write
903	locks \c sVnodeLock and keeps it locked for the caller when returning. On
904	error the lock is not held on return.
905
906	\param mountID The mount ID.
907	\param vnodeID The vnode ID.
908	\param _vnode Will be set to the new vnode on success.
909	\param _nodeCreated Will be set to \c true when the returned vnode has
910		been newly created, \c false when it already existed. Will not be
911		changed on error.
912	\return \c B_OK, when the vnode was successfully created and inserted or
913		a node with the given ID was found, \c B_NO_MEMORY or
914		\c B_ENTRY_NOT_FOUND on error.
915*/
916static status_t
917create_new_vnode_and_lock(dev_t mountID, ino_t vnodeID, struct vnode*& _vnode,
918	bool& _nodeCreated)
919{
920	FUNCTION(("create_new_vnode_and_lock()\n"));
921
922	struct vnode* vnode = (struct vnode*)object_cache_alloc(sVnodeCache, 0);
923	if (vnode == NULL)
924		return B_NO_MEMORY;
925
926	// initialize basic values
927	memset(vnode, 0, sizeof(struct vnode));
928	vnode->device = mountID;
929	vnode->id = vnodeID;
930	vnode->ref_count = 1;
931	vnode->SetBusy(true);
932
933	// look up the node -- it might have been added by someone else in the
934	// meantime
935	rw_lock_write_lock(&sVnodeLock);
936	struct vnode* existingVnode = lookup_vnode(mountID, vnodeID);
937	if (existingVnode != NULL) {
938		object_cache_free(sVnodeCache, vnode, 0);
939		_vnode = existingVnode;
940		_nodeCreated = false;
941		return B_OK;
942	}
943
944	// get the mount structure
945	rw_lock_read_lock(&sMountLock);
946	vnode->mount = find_mount(mountID);
947	if (!vnode->mount || vnode->mount->unmounting) {
948		rw_lock_read_unlock(&sMountLock);
949		rw_lock_write_unlock(&sVnodeLock);
950		object_cache_free(sVnodeCache, vnode, 0);
951		return B_ENTRY_NOT_FOUND;
952	}
953
954	// add the vnode to the mount's node list and the hash table
955	sVnodeTable->Insert(vnode);
956	add_vnode_to_mount_list(vnode, vnode->mount);
957
958	rw_lock_read_unlock(&sMountLock);
959
960	_vnode = vnode;
961	_nodeCreated = true;
962
963	// keep the vnode lock locked
964	return B_OK;
965}
966
967
968/*!	Frees the vnode and all resources it has acquired, and removes
969	it from the vnode hash as well as from its mount structure.
970	Will also make sure that any cache modifications are written back.
971*/
972static void
973free_vnode(struct vnode* vnode, bool reenter)
974{
975	ASSERT_PRINT(vnode->ref_count == 0 && vnode->IsBusy(), "vnode: %p\n",
976		vnode);
977	ASSERT_PRINT(vnode->advisory_locking == NULL, "vnode: %p\n", vnode);
978
979	// write back any changes in this vnode's cache -- but only
980	// if the vnode won't be deleted, in which case the changes
981	// will be discarded
982
983	if (!vnode->IsRemoved() && HAS_FS_CALL(vnode, fsync))
984		FS_CALL_NO_PARAMS(vnode, fsync);
985
986	// Note: If this vnode has a cache attached, there will still be two
987	// references to that cache at this point. The last one belongs to the vnode
988	// itself (cf. vfs_get_vnode_cache()) and one belongs to the node's file
989	// cache. Each but the last reference to a cache also includes a reference
990	// to the vnode. The file cache, however, released its reference (cf.
991	// file_cache_create()), so that this vnode's ref count has the chance to
992	// ever drop to 0. Deleting the file cache now, will cause the next to last
993	// cache reference to be released, which will also release a (no longer
994	// existing) vnode reference. To avoid problems, we set the vnode's ref
995	// count, so that it will neither become negative nor 0.
996	vnode->ref_count = 2;
997
998	if (!vnode->IsUnpublished()) {
999		if (vnode->IsRemoved())
1000			FS_CALL(vnode, remove_vnode, reenter);
1001		else
1002			FS_CALL(vnode, put_vnode, reenter);
1003	}
1004
1005	// If the vnode has a VMCache attached, make sure that it won't try to get
1006	// another reference via VMVnodeCache::AcquireUnreferencedStoreRef(). As
1007	// long as the vnode is busy and in the hash, that won't happen, but as
1008	// soon as we've removed it from the hash, it could reload the vnode -- with
1009	// a new cache attached!
1010	if (vnode->cache != NULL && vnode->cache->type == CACHE_TYPE_VNODE)
1011		((VMVnodeCache*)vnode->cache)->VnodeDeleted();
1012
1013	// The file system has removed the resources of the vnode now, so we can
1014	// make it available again (by removing the busy vnode from the hash).
1015	rw_lock_write_lock(&sVnodeLock);
1016	sVnodeTable->Remove(vnode);
1017	rw_lock_write_unlock(&sVnodeLock);
1018
1019	// if we have a VMCache attached, remove it
1020	if (vnode->cache)
1021		vnode->cache->ReleaseRef();
1022
1023	vnode->cache = NULL;
1024
1025	remove_vnode_from_mount_list(vnode, vnode->mount);
1026
1027	object_cache_free(sVnodeCache, vnode, 0);
1028}
1029
1030
1031/*!	\brief Decrements the reference counter of the given vnode and deletes it,
1032	if the counter dropped to 0.
1033
1034	The caller must, of course, own a reference to the vnode to call this
1035	function.
1036	The caller must not hold the sVnodeLock or the sMountLock.
1037
1038	\param vnode the vnode.
1039	\param alwaysFree don't move this vnode into the unused list, but really
1040		   delete it if possible.
1041	\param reenter \c true, if this function is called (indirectly) from within
1042		   a file system. This will be passed to file system hooks only.
1043	\return \c B_OK, if everything went fine, an error code otherwise.
1044*/
1045static status_t
1046dec_vnode_ref_count(struct vnode* vnode, bool alwaysFree, bool reenter)
1047{
1048	ReadLocker locker(sVnodeLock);
1049	AutoLocker<Vnode> nodeLocker(vnode);
1050
1051	int32 oldRefCount = atomic_add(&vnode->ref_count, -1);
1052
1053	ASSERT_PRINT(oldRefCount > 0, "vnode %p\n", vnode);
1054
1055	TRACE(("dec_vnode_ref_count: vnode %p, ref now %" B_PRId32 "\n", vnode,
1056		vnode->ref_count));
1057
1058	if (oldRefCount != 1)
1059		return B_OK;
1060
1061	if (vnode->IsBusy())
1062		panic("dec_vnode_ref_count: called on busy vnode %p\n", vnode);
1063
1064	bool freeNode = false;
1065	bool freeUnusedNodes = false;
1066
1067	// Just insert the vnode into an unused list if we don't need
1068	// to delete it
1069	if (vnode->IsRemoved() || alwaysFree) {
1070		vnode_to_be_freed(vnode);
1071		vnode->SetBusy(true);
1072		freeNode = true;
1073	} else
1074		freeUnusedNodes = vnode_unused(vnode);
1075
1076	nodeLocker.Unlock();
1077	locker.Unlock();
1078
1079	if (freeNode)
1080		free_vnode(vnode, reenter);
1081	else if (freeUnusedNodes)
1082		free_unused_vnodes();
1083
1084	return B_OK;
1085}
1086
1087
1088/*!	\brief Increments the reference counter of the given vnode.
1089
1090	The caller must make sure that the node isn't deleted while this function
1091	is called. This can be done either:
1092	- by ensuring that a reference to the node exists and remains in existence,
1093	  or
1094	- by holding the vnode's lock (which also requires read locking sVnodeLock)
1095	  or by holding sVnodeLock write locked.
1096
1097	In the second case the caller is responsible for dealing with the ref count
1098	0 -> 1 transition. That is 1. this function must not be invoked when the
1099	node is busy in the first place and 2. vnode_used() must be called for the
1100	node.
1101
1102	\param vnode the vnode.
1103*/
1104static void
1105inc_vnode_ref_count(struct vnode* vnode)
1106{
1107	atomic_add(&vnode->ref_count, 1);
1108	TRACE(("inc_vnode_ref_count: vnode %p, ref now %" B_PRId32 "\n", vnode,
1109		vnode->ref_count));
1110}
1111
1112
1113static bool
1114is_special_node_type(int type)
1115{
1116	// at the moment only FIFOs are supported
1117	return S_ISFIFO(type);
1118}
1119
1120
1121static status_t
1122create_special_sub_node(struct vnode* vnode, uint32 flags)
1123{
1124	if (S_ISFIFO(vnode->Type()))
1125		return create_fifo_vnode(vnode->mount->volume, vnode);
1126
1127	return B_BAD_VALUE;
1128}
1129
1130
1131/*!	\brief Retrieves a vnode for a given mount ID, node ID pair.
1132
1133	If the node is not yet in memory, it will be loaded.
1134
1135	The caller must not hold the sVnodeLock or the sMountLock.
1136
1137	\param mountID the mount ID.
1138	\param vnodeID the node ID.
1139	\param _vnode Pointer to a vnode* variable into which the pointer to the
1140		   retrieved vnode structure shall be written.
1141	\param reenter \c true, if this function is called (indirectly) from within
1142		   a file system.
1143	\return \c B_OK, if everything when fine, an error code otherwise.
1144*/
1145static status_t
1146get_vnode(dev_t mountID, ino_t vnodeID, struct vnode** _vnode, bool canWait,
1147	int reenter)
1148{
1149	FUNCTION(("get_vnode: mountid %" B_PRId32 " vnid 0x%" B_PRIx64 " %p\n",
1150		mountID, vnodeID, _vnode));
1151
1152	rw_lock_read_lock(&sVnodeLock);
1153
1154	int32 tries = BUSY_VNODE_RETRIES;
1155restart:
1156	struct vnode* vnode = lookup_vnode(mountID, vnodeID);
1157	AutoLocker<Vnode> nodeLocker(vnode);
1158
1159	if (vnode && vnode->IsBusy()) {
1160		// vnodes in the Removed state (except ones still Unpublished)
1161		// which are also Busy will disappear soon, so we do not wait for them.
1162		const bool doNotWait = vnode->IsRemoved() && !vnode->IsUnpublished();
1163
1164		nodeLocker.Unlock();
1165		rw_lock_read_unlock(&sVnodeLock);
1166		if (!canWait) {
1167			dprintf("vnode %" B_PRIdDEV ":%" B_PRIdINO " is busy!\n",
1168				mountID, vnodeID);
1169			return B_BUSY;
1170		}
1171		if (doNotWait || !retry_busy_vnode(tries, mountID, vnodeID))
1172			return B_BUSY;
1173
1174		rw_lock_read_lock(&sVnodeLock);
1175		goto restart;
1176	}
1177
1178	TRACE(("get_vnode: tried to lookup vnode, got %p\n", vnode));
1179
1180	status_t status;
1181
1182	if (vnode) {
1183		if (vnode->ref_count == 0) {
1184			// this vnode has been unused before
1185			vnode_used(vnode);
1186		}
1187		inc_vnode_ref_count(vnode);
1188
1189		nodeLocker.Unlock();
1190		rw_lock_read_unlock(&sVnodeLock);
1191	} else {
1192		// we need to create a new vnode and read it in
1193		rw_lock_read_unlock(&sVnodeLock);
1194			// unlock -- create_new_vnode_and_lock() write-locks on success
1195		bool nodeCreated;
1196		status = create_new_vnode_and_lock(mountID, vnodeID, vnode,
1197			nodeCreated);
1198		if (status != B_OK)
1199			return status;
1200
1201		if (!nodeCreated) {
1202			rw_lock_read_lock(&sVnodeLock);
1203			rw_lock_write_unlock(&sVnodeLock);
1204			goto restart;
1205		}
1206
1207		rw_lock_write_unlock(&sVnodeLock);
1208
1209		int type;
1210		uint32 flags;
1211		status = FS_MOUNT_CALL(vnode->mount, get_vnode, vnodeID, vnode, &type,
1212			&flags, reenter);
1213		if (status == B_OK && vnode->private_node == NULL)
1214			status = B_BAD_VALUE;
1215
1216		bool gotNode = status == B_OK;
1217		bool publishSpecialSubNode = false;
1218		if (gotNode) {
1219			vnode->SetType(type);
1220			publishSpecialSubNode = is_special_node_type(type)
1221				&& (flags & B_VNODE_DONT_CREATE_SPECIAL_SUB_NODE) == 0;
1222		}
1223
1224		if (gotNode && publishSpecialSubNode)
1225			status = create_special_sub_node(vnode, flags);
1226
1227		if (status != B_OK) {
1228			if (gotNode)
1229				FS_CALL(vnode, put_vnode, reenter);
1230
1231			rw_lock_write_lock(&sVnodeLock);
1232			sVnodeTable->Remove(vnode);
1233			remove_vnode_from_mount_list(vnode, vnode->mount);
1234			rw_lock_write_unlock(&sVnodeLock);
1235
1236			object_cache_free(sVnodeCache, vnode, 0);
1237			return status;
1238		}
1239
1240		rw_lock_read_lock(&sVnodeLock);
1241		vnode->Lock();
1242
1243		vnode->SetRemoved((flags & B_VNODE_PUBLISH_REMOVED) != 0);
1244		vnode->SetBusy(false);
1245
1246		vnode->Unlock();
1247		rw_lock_read_unlock(&sVnodeLock);
1248	}
1249
1250	TRACE(("get_vnode: returning %p\n", vnode));
1251
1252	*_vnode = vnode;
1253	return B_OK;
1254}
1255
1256
1257/*!	\brief Decrements the reference counter of the given vnode and deletes it,
1258	if the counter dropped to 0.
1259
1260	The caller must, of course, own a reference to the vnode to call this
1261	function.
1262	The caller must not hold the sVnodeLock or the sMountLock.
1263
1264	\param vnode the vnode.
1265*/
1266static inline void
1267put_vnode(struct vnode* vnode)
1268{
1269	dec_vnode_ref_count(vnode, false, false);
1270}
1271
1272
1273static void
1274free_unused_vnodes(int32 level)
1275{
1276	unused_vnodes_check_started();
1277
1278	if (level == B_NO_LOW_RESOURCE) {
1279		unused_vnodes_check_done();
1280		return;
1281	}
1282
1283	flush_hot_vnodes();
1284
1285	// determine how many nodes to free
1286	uint32 count = 1;
1287	{
1288		MutexLocker unusedVnodesLocker(sUnusedVnodesLock);
1289
1290		switch (level) {
1291			case B_LOW_RESOURCE_NOTE:
1292				count = sUnusedVnodes / 100;
1293				break;
1294			case B_LOW_RESOURCE_WARNING:
1295				count = sUnusedVnodes / 10;
1296				break;
1297			case B_LOW_RESOURCE_CRITICAL:
1298				count = sUnusedVnodes;
1299				break;
1300		}
1301
1302		if (count > sUnusedVnodes)
1303			count = sUnusedVnodes;
1304	}
1305
1306	// Write back the modified pages of some unused vnodes and free them.
1307
1308	for (uint32 i = 0; i < count; i++) {
1309		ReadLocker vnodesReadLocker(sVnodeLock);
1310
1311		// get the first node
1312		MutexLocker unusedVnodesLocker(sUnusedVnodesLock);
1313		struct vnode* vnode = (struct vnode*)list_get_first_item(
1314			&sUnusedVnodeList);
1315		unusedVnodesLocker.Unlock();
1316
1317		if (vnode == NULL)
1318			break;
1319
1320		// lock the node
1321		AutoLocker<Vnode> nodeLocker(vnode);
1322
1323		// Check whether the node is still unused -- since we only append to the
1324		// tail of the unused queue, the vnode should still be at its head.
1325		// Alternatively we could check its ref count for 0 and its busy flag,
1326		// but if the node is no longer at the head of the queue, it means it
1327		// has been touched in the meantime, i.e. it is no longer the least
1328		// recently used unused vnode and we rather don't free it.
1329		unusedVnodesLocker.Lock();
1330		if (vnode != list_get_first_item(&sUnusedVnodeList))
1331			continue;
1332		unusedVnodesLocker.Unlock();
1333
1334		ASSERT(!vnode->IsBusy());
1335
1336		// grab a reference
1337		inc_vnode_ref_count(vnode);
1338		vnode_used(vnode);
1339
1340		// write back changes and free the node
1341		nodeLocker.Unlock();
1342		vnodesReadLocker.Unlock();
1343
1344		if (vnode->cache != NULL)
1345			vnode->cache->WriteModified();
1346
1347		dec_vnode_ref_count(vnode, true, false);
1348			// this should free the vnode when it's still unused
1349	}
1350
1351	unused_vnodes_check_done();
1352}
1353
1354
1355/*!	Gets the vnode the given vnode is covering.
1356
1357	The caller must have \c sVnodeLock read-locked at least.
1358
1359	The function returns a reference to the retrieved vnode (if any), the caller
1360	is responsible to free.
1361
1362	\param vnode The vnode whose covered node shall be returned.
1363	\return The covered vnode, or \c NULL if the given vnode doesn't cover any
1364		vnode.
1365*/
1366static inline Vnode*
1367get_covered_vnode_locked(Vnode* vnode)
1368{
1369	if (Vnode* coveredNode = vnode->covers) {
1370		while (coveredNode->covers != NULL)
1371			coveredNode = coveredNode->covers;
1372
1373		inc_vnode_ref_count(coveredNode);
1374		return coveredNode;
1375	}
1376
1377	return NULL;
1378}
1379
1380
1381/*!	Gets the vnode the given vnode is covering.
1382
1383	The caller must not hold \c sVnodeLock. Note that this implies a race
1384	condition, since the situation can change at any time.
1385
1386	The function returns a reference to the retrieved vnode (if any), the caller
1387	is responsible to free.
1388
1389	\param vnode The vnode whose covered node shall be returned.
1390	\return The covered vnode, or \c NULL if the given vnode doesn't cover any
1391		vnode.
1392*/
1393static inline Vnode*
1394get_covered_vnode(Vnode* vnode)
1395{
1396	if (!vnode->IsCovering())
1397		return NULL;
1398
1399	ReadLocker vnodeReadLocker(sVnodeLock);
1400	return get_covered_vnode_locked(vnode);
1401}
1402
1403
1404/*!	Gets the vnode the given vnode is covered by.
1405
1406	The caller must have \c sVnodeLock read-locked at least.
1407
1408	The function returns a reference to the retrieved vnode (if any), the caller
1409	is responsible to free.
1410
1411	\param vnode The vnode whose covering node shall be returned.
1412	\return The covering vnode, or \c NULL if the given vnode isn't covered by
1413		any vnode.
1414*/
1415static Vnode*
1416get_covering_vnode_locked(Vnode* vnode)
1417{
1418	if (Vnode* coveringNode = vnode->covered_by) {
1419		while (coveringNode->covered_by != NULL)
1420			coveringNode = coveringNode->covered_by;
1421
1422		inc_vnode_ref_count(coveringNode);
1423		return coveringNode;
1424	}
1425
1426	return NULL;
1427}
1428
1429
1430/*!	Gets the vnode the given vnode is covered by.
1431
1432	The caller must not hold \c sVnodeLock. Note that this implies a race
1433	condition, since the situation can change at any time.
1434
1435	The function returns a reference to the retrieved vnode (if any), the caller
1436	is responsible to free.
1437
1438	\param vnode The vnode whose covering node shall be returned.
1439	\return The covering vnode, or \c NULL if the given vnode isn't covered by
1440		any vnode.
1441*/
1442static inline Vnode*
1443get_covering_vnode(Vnode* vnode)
1444{
1445	if (!vnode->IsCovered())
1446		return NULL;
1447
1448	ReadLocker vnodeReadLocker(sVnodeLock);
1449	return get_covering_vnode_locked(vnode);
1450}
1451
1452
1453static void
1454free_unused_vnodes()
1455{
1456	free_unused_vnodes(
1457		low_resource_state(B_KERNEL_RESOURCE_PAGES | B_KERNEL_RESOURCE_MEMORY
1458			| B_KERNEL_RESOURCE_ADDRESS_SPACE));
1459}
1460
1461
1462static void
1463vnode_low_resource_handler(void* /*data*/, uint32 resources, int32 level)
1464{
1465	TRACE(("vnode_low_resource_handler(level = %" B_PRId32 ")\n", level));
1466
1467	free_unused_vnodes(level);
1468}
1469
1470
1471static inline void
1472put_advisory_locking(struct advisory_locking* locking)
1473{
1474	release_sem(locking->lock);
1475}
1476
1477
1478/*!	Returns the advisory_locking object of the \a vnode in case it
1479	has one, and locks it.
1480	You have to call put_advisory_locking() when you're done with
1481	it.
1482	Note, you must not have the vnode mutex locked when calling
1483	this function.
1484*/
1485static struct advisory_locking*
1486get_advisory_locking(struct vnode* vnode)
1487{
1488	rw_lock_read_lock(&sVnodeLock);
1489	vnode->Lock();
1490
1491	struct advisory_locking* locking = vnode->advisory_locking;
1492	sem_id lock = locking != NULL ? locking->lock : B_ERROR;
1493
1494	vnode->Unlock();
1495	rw_lock_read_unlock(&sVnodeLock);
1496
1497	if (lock >= 0)
1498		lock = acquire_sem(lock);
1499	if (lock < 0) {
1500		// This means the locking has been deleted in the mean time
1501		// or had never existed in the first place - otherwise, we
1502		// would get the lock at some point.
1503		return NULL;
1504	}
1505
1506	return locking;
1507}
1508
1509
1510/*!	Creates a locked advisory_locking object, and attaches it to the
1511	given \a vnode.
1512	Returns B_OK in case of success - also if the vnode got such an
1513	object from someone else in the mean time, you'll still get this
1514	one locked then.
1515*/
1516static status_t
1517create_advisory_locking(struct vnode* vnode)
1518{
1519	if (vnode == NULL)
1520		return B_FILE_ERROR;
1521
1522	ObjectDeleter<advisory_locking> lockingDeleter;
1523	struct advisory_locking* locking = NULL;
1524
1525	while (get_advisory_locking(vnode) == NULL) {
1526		// no locking object set on the vnode yet, create one
1527		if (locking == NULL) {
1528			locking = new(std::nothrow) advisory_locking;
1529			if (locking == NULL)
1530				return B_NO_MEMORY;
1531			lockingDeleter.SetTo(locking);
1532
1533			locking->wait_sem = create_sem(0, "advisory lock");
1534			if (locking->wait_sem < 0)
1535				return locking->wait_sem;
1536
1537			locking->lock = create_sem(0, "advisory locking");
1538			if (locking->lock < 0)
1539				return locking->lock;
1540		}
1541
1542		// set our newly created locking object
1543		ReadLocker _(sVnodeLock);
1544		AutoLocker<Vnode> nodeLocker(vnode);
1545		if (vnode->advisory_locking == NULL) {
1546			vnode->advisory_locking = locking;
1547			lockingDeleter.Detach();
1548			return B_OK;
1549		}
1550	}
1551
1552	// The vnode already had a locking object. That's just as well.
1553
1554	return B_OK;
1555}
1556
1557
1558/*! Returns \c true when either \a flock is \c NULL or the \a flock intersects
1559	with the advisory_lock \a lock.
1560*/
1561static bool
1562advisory_lock_intersects(struct advisory_lock* lock, struct flock* flock)
1563{
1564	if (flock == NULL)
1565		return true;
1566
1567	return lock->start <= flock->l_start - 1 + flock->l_len
1568		&& lock->end >= flock->l_start;
1569}
1570
1571
1572/*!	Tests whether acquiring a lock would block.
1573*/
1574static status_t
1575test_advisory_lock(struct vnode* vnode, struct flock* flock)
1576{
1577	flock->l_type = F_UNLCK;
1578
1579	struct advisory_locking* locking = get_advisory_locking(vnode);
1580	if (locking == NULL)
1581		return B_OK;
1582
1583	team_id team = team_get_current_team_id();
1584
1585	LockList::Iterator iterator = locking->locks.GetIterator();
1586	while (iterator.HasNext()) {
1587		struct advisory_lock* lock = iterator.Next();
1588
1589		 if (lock->team != team && advisory_lock_intersects(lock, flock)) {
1590			// locks do overlap
1591			if (flock->l_type != F_RDLCK || !lock->shared) {
1592				// collision
1593				flock->l_type = lock->shared ? F_RDLCK : F_WRLCK;
1594				flock->l_whence = SEEK_SET;
1595				flock->l_start = lock->start;
1596				flock->l_len = lock->end - lock->start + 1;
1597				flock->l_pid = lock->team;
1598				break;
1599			}
1600		}
1601	}
1602
1603	put_advisory_locking(locking);
1604	return B_OK;
1605}
1606
1607
1608/*!	Removes the specified lock, or all locks of the calling team
1609	if \a flock is NULL.
1610*/
1611static status_t
1612release_advisory_lock(struct vnode* vnode, struct io_context* context,
1613	struct file_descriptor* descriptor, struct flock* flock)
1614{
1615	FUNCTION(("release_advisory_lock(vnode = %p, flock = %p)\n", vnode, flock));
1616
1617	struct advisory_locking* locking = get_advisory_locking(vnode);
1618	if (locking == NULL)
1619		return B_OK;
1620
1621	// find matching lock entries
1622
1623	LockList::Iterator iterator = locking->locks.GetIterator();
1624	while (iterator.HasNext()) {
1625		struct advisory_lock* lock = iterator.Next();
1626		bool removeLock = false;
1627
1628		if (descriptor != NULL && lock->bound_to == descriptor) {
1629			// Remove flock() locks
1630			removeLock = true;
1631		} else if (lock->bound_to == context
1632				&& advisory_lock_intersects(lock, flock)) {
1633			// Remove POSIX locks
1634			bool endsBeyond = false;
1635			bool startsBefore = false;
1636			if (flock != NULL) {
1637				startsBefore = lock->start < flock->l_start;
1638				endsBeyond = lock->end > flock->l_start - 1 + flock->l_len;
1639			}
1640
1641			if (!startsBefore && !endsBeyond) {
1642				// lock is completely contained in flock
1643				removeLock = true;
1644			} else if (startsBefore && !endsBeyond) {
1645				// cut the end of the lock
1646				lock->end = flock->l_start - 1;
1647			} else if (!startsBefore && endsBeyond) {
1648				// cut the start of the lock
1649				lock->start = flock->l_start + flock->l_len;
1650			} else {
1651				// divide the lock into two locks
1652				struct advisory_lock* secondLock = new advisory_lock;
1653				if (secondLock == NULL) {
1654					// TODO: we should probably revert the locks we already
1655					// changed... (ie. allocate upfront)
1656					put_advisory_locking(locking);
1657					return B_NO_MEMORY;
1658				}
1659
1660				lock->end = flock->l_start - 1;
1661
1662				secondLock->bound_to = context;
1663				secondLock->team = lock->team;
1664				secondLock->session = lock->session;
1665				// values must already be normalized when getting here
1666				secondLock->start = flock->l_start + flock->l_len;
1667				secondLock->end = lock->end;
1668				secondLock->shared = lock->shared;
1669
1670				locking->locks.Add(secondLock);
1671			}
1672		}
1673
1674		if (removeLock) {
1675			// this lock is no longer used
1676			iterator.Remove();
1677			delete lock;
1678		}
1679	}
1680
1681	bool removeLocking = locking->locks.IsEmpty();
1682	release_sem_etc(locking->wait_sem, 1, B_RELEASE_ALL);
1683
1684	put_advisory_locking(locking);
1685
1686	if (removeLocking) {
1687		// We can remove the whole advisory locking structure; it's no
1688		// longer used
1689		locking = get_advisory_locking(vnode);
1690		if (locking != NULL) {
1691			ReadLocker locker(sVnodeLock);
1692			AutoLocker<Vnode> nodeLocker(vnode);
1693
1694			// the locking could have been changed in the mean time
1695			if (locking->locks.IsEmpty()) {
1696				vnode->advisory_locking = NULL;
1697				nodeLocker.Unlock();
1698				locker.Unlock();
1699
1700				// we've detached the locking from the vnode, so we can
1701				// safely delete it
1702				delete locking;
1703			} else {
1704				// the locking is in use again
1705				nodeLocker.Unlock();
1706				locker.Unlock();
1707				release_sem_etc(locking->lock, 1, B_DO_NOT_RESCHEDULE);
1708			}
1709		}
1710	}
1711
1712	return B_OK;
1713}
1714
1715
1716/*!	Acquires an advisory lock for the \a vnode. If \a wait is \c true, it
1717	will wait for the lock to become available, if there are any collisions
1718	(it will return B_PERMISSION_DENIED in this case if \a wait is \c false).
1719
1720	If \a descriptor is NULL, POSIX semantics are used for this lock. Otherwise,
1721	BSD flock() semantics are used, that is, all children can unlock the file
1722	in question (we even allow parents to remove the lock, though, but that
1723	seems to be in line to what the BSD's are doing).
1724*/
1725static status_t
1726acquire_advisory_lock(struct vnode* vnode, io_context* context,
1727	struct file_descriptor* descriptor, struct flock* flock, bool wait)
1728{
1729	FUNCTION(("acquire_advisory_lock(vnode = %p, flock = %p, wait = %s)\n",
1730		vnode, flock, wait ? "yes" : "no"));
1731
1732	bool shared = flock->l_type == F_RDLCK;
1733	void* boundTo = descriptor != NULL ? (void*)descriptor : (void*)context;
1734	status_t status = B_OK;
1735
1736	// TODO: do deadlock detection!
1737
1738	struct advisory_locking* locking;
1739
1740	while (true) {
1741		// if this vnode has an advisory_locking structure attached,
1742		// lock that one and search for any colliding file lock
1743		status = create_advisory_locking(vnode);
1744		if (status != B_OK)
1745			return status;
1746
1747		locking = vnode->advisory_locking;
1748		team_id team = team_get_current_team_id();
1749		sem_id waitForLock = -1;
1750
1751		// test for collisions
1752		LockList::Iterator iterator = locking->locks.GetIterator();
1753		while (iterator.HasNext()) {
1754			struct advisory_lock* lock = iterator.Next();
1755
1756			// TODO: locks from the same team might be joinable!
1757			if ((lock->team != team || lock->bound_to != boundTo)
1758					&& advisory_lock_intersects(lock, flock)) {
1759				// locks do overlap
1760				if (!shared || !lock->shared) {
1761					// we need to wait
1762					waitForLock = locking->wait_sem;
1763					break;
1764				}
1765			}
1766		}
1767
1768		if (waitForLock < 0)
1769			break;
1770
1771		// We need to wait. Do that or fail now, if we've been asked not to.
1772
1773		if (!wait) {
1774			put_advisory_locking(locking);
1775			return descriptor != NULL ? B_WOULD_BLOCK : B_PERMISSION_DENIED;
1776		}
1777
1778		status = switch_sem_etc(locking->lock, waitForLock, 1,
1779			B_CAN_INTERRUPT, 0);
1780		if (status != B_OK && status != B_BAD_SEM_ID)
1781			return status;
1782
1783		// We have been notified, but we need to re-lock the locking object. So
1784		// go another round...
1785	}
1786
1787	// install new lock
1788
1789	struct advisory_lock* lock = new(std::nothrow) advisory_lock;
1790	if (lock == NULL) {
1791		put_advisory_locking(locking);
1792		return B_NO_MEMORY;
1793	}
1794
1795	lock->bound_to = boundTo;
1796	lock->team = team_get_current_team_id();
1797	lock->session = thread_get_current_thread()->team->session_id;
1798	// values must already be normalized when getting here
1799	lock->start = flock->l_start;
1800	lock->end = flock->l_start - 1 + flock->l_len;
1801	lock->shared = shared;
1802
1803	locking->locks.Add(lock);
1804	put_advisory_locking(locking);
1805
1806	return status;
1807}
1808
1809
1810/*!	Normalizes the \a flock structure to make it easier to compare the
1811	structure with others. The l_start and l_len fields are set to absolute
1812	values according to the l_whence field.
1813*/
1814static status_t
1815normalize_flock(struct file_descriptor* descriptor, struct flock* flock)
1816{
1817	switch (flock->l_whence) {
1818		case SEEK_SET:
1819			break;
1820		case SEEK_CUR:
1821			flock->l_start += descriptor->pos;
1822			break;
1823		case SEEK_END:
1824		{
1825			struct vnode* vnode = descriptor->u.vnode;
1826			struct stat stat;
1827			status_t status;
1828
1829			if (!HAS_FS_CALL(vnode, read_stat))
1830				return B_UNSUPPORTED;
1831
1832			status = FS_CALL(vnode, read_stat, &stat);
1833			if (status != B_OK)
1834				return status;
1835
1836			flock->l_start += stat.st_size;
1837			break;
1838		}
1839		default:
1840			return B_BAD_VALUE;
1841	}
1842
1843	if (flock->l_start < 0)
1844		flock->l_start = 0;
1845	if (flock->l_len == 0)
1846		flock->l_len = OFF_MAX;
1847
1848	// don't let the offset and length overflow
1849	if (flock->l_start > 0 && OFF_MAX - flock->l_start < flock->l_len)
1850		flock->l_len = OFF_MAX - flock->l_start;
1851
1852	if (flock->l_len < 0) {
1853		// a negative length reverses the region
1854		flock->l_start += flock->l_len;
1855		flock->l_len = -flock->l_len;
1856	}
1857
1858	return B_OK;
1859}
1860
1861
1862static void
1863replace_vnode_if_disconnected(struct fs_mount* mount,
1864	struct vnode* vnodeToDisconnect, struct vnode*& vnode,
1865	struct vnode* fallBack, bool lockRootLock)
1866{
1867	struct vnode* givenVnode = vnode;
1868	bool vnodeReplaced = false;
1869
1870	ReadLocker vnodeReadLocker(sVnodeLock);
1871
1872	if (lockRootLock)
1873		mutex_lock(&sIOContextRootLock);
1874
1875	while (vnode != NULL && vnode->mount == mount
1876		&& (vnodeToDisconnect == NULL || vnodeToDisconnect == vnode)) {
1877		if (vnode->covers != NULL) {
1878			// redirect the vnode to the covered vnode
1879			vnode = vnode->covers;
1880		} else
1881			vnode = fallBack;
1882
1883		vnodeReplaced = true;
1884	}
1885
1886	// If we've replaced the node, grab a reference for the new one.
1887	if (vnodeReplaced && vnode != NULL)
1888		inc_vnode_ref_count(vnode);
1889
1890	if (lockRootLock)
1891		mutex_unlock(&sIOContextRootLock);
1892
1893	vnodeReadLocker.Unlock();
1894
1895	if (vnodeReplaced)
1896		put_vnode(givenVnode);
1897}
1898
1899
1900/*!	Disconnects all file descriptors that are associated with the
1901	\a vnodeToDisconnect, or if this is NULL, all vnodes of the specified
1902	\a mount object.
1903
1904	Note, after you've called this function, there might still be ongoing
1905	accesses - they won't be interrupted if they already happened before.
1906	However, any subsequent access will fail.
1907
1908	This is not a cheap function and should be used with care and rarely.
1909	TODO: there is currently no means to stop a blocking read/write!
1910*/
1911static void
1912disconnect_mount_or_vnode_fds(struct fs_mount* mount,
1913	struct vnode* vnodeToDisconnect)
1914{
1915	// iterate over all teams and peek into their file descriptors
1916	TeamListIterator teamIterator;
1917	while (Team* team = teamIterator.Next()) {
1918		BReference<Team> teamReference(team, true);
1919		TeamLocker teamLocker(team);
1920
1921		// lock the I/O context
1922		io_context* context = team->io_context;
1923		if (context == NULL)
1924			continue;
1925		MutexLocker contextLocker(context->io_mutex);
1926
1927		teamLocker.Unlock();
1928
1929		replace_vnode_if_disconnected(mount, vnodeToDisconnect, context->root,
1930			sRoot, true);
1931		replace_vnode_if_disconnected(mount, vnodeToDisconnect, context->cwd,
1932			sRoot, false);
1933
1934		for (uint32 i = 0; i < context->table_size; i++) {
1935			struct file_descriptor* descriptor = context->fds[i];
1936			if (descriptor == NULL || (descriptor->open_mode & O_DISCONNECTED) != 0)
1937				continue;
1938
1939			inc_fd_ref_count(descriptor);
1940
1941			// if this descriptor points at this mount, we
1942			// need to disconnect it to be able to unmount
1943			struct vnode* vnode = fd_vnode(descriptor);
1944			if (vnodeToDisconnect != NULL) {
1945				if (vnode == vnodeToDisconnect)
1946					disconnect_fd(descriptor);
1947			} else if ((vnode != NULL && vnode->mount == mount)
1948				|| (vnode == NULL && descriptor->u.mount == mount))
1949				disconnect_fd(descriptor);
1950
1951			put_fd(descriptor);
1952		}
1953	}
1954}
1955
1956
1957/*!	\brief Gets the root node of the current IO context.
1958	If \a kernel is \c true, the kernel IO context will be used.
1959	The caller obtains a reference to the returned node.
1960*/
1961struct vnode*
1962get_root_vnode(bool kernel)
1963{
1964	if (!kernel) {
1965		// Get current working directory from io context
1966		struct io_context* context = get_current_io_context(kernel);
1967
1968		mutex_lock(&sIOContextRootLock);
1969
1970		struct vnode* root = context->root;
1971		if (root != NULL)
1972			inc_vnode_ref_count(root);
1973
1974		mutex_unlock(&sIOContextRootLock);
1975
1976		if (root != NULL)
1977			return root;
1978
1979		// That should never happen.
1980		dprintf("get_root_vnode(): IO context for team %" B_PRId32 " doesn't "
1981			"have a root\n", team_get_current_team_id());
1982	}
1983
1984	inc_vnode_ref_count(sRoot);
1985	return sRoot;
1986}
1987
1988
1989/*!	\brief Gets the directory path and leaf name for a given path.
1990
1991	The supplied \a path is transformed to refer to the directory part of
1992	the entry identified by the original path, and into the buffer \a filename
1993	the leaf name of the original entry is written.
1994	Neither the returned path nor the leaf name can be expected to be
1995	canonical.
1996
1997	\param path The path to be analyzed. Must be able to store at least one
1998		   additional character.
1999	\param filename The buffer into which the leaf name will be written.
2000		   Must be of size B_FILE_NAME_LENGTH at least.
2001	\return \c B_OK, if everything went fine, \c B_NAME_TOO_LONG, if the leaf
2002		   name is longer than \c B_FILE_NAME_LENGTH, or \c B_ENTRY_NOT_FOUND,
2003		   if the given path name is empty.
2004*/
2005static status_t
2006get_dir_path_and_leaf(char* path, char* filename)
2007{
2008	if (*path == '\0')
2009		return B_ENTRY_NOT_FOUND;
2010
2011	char* last = strrchr(path, '/');
2012		// '/' are not allowed in file names!
2013
2014	FUNCTION(("get_dir_path_and_leaf(path = %s)\n", path));
2015
2016	if (last == NULL) {
2017		// this path is single segment with no '/' in it
2018		// ex. "foo"
2019		if (strlcpy(filename, path, B_FILE_NAME_LENGTH) >= B_FILE_NAME_LENGTH)
2020			return B_NAME_TOO_LONG;
2021
2022		strcpy(path, ".");
2023	} else {
2024		last++;
2025		if (last[0] == '\0') {
2026			// special case: the path ends in one or more '/' - remove them
2027			while (*--last == '/' && last != path);
2028			last[1] = '\0';
2029
2030			if (last == path && last[0] == '/') {
2031				// This path points to the root of the file system
2032				strcpy(filename, ".");
2033				return B_OK;
2034			}
2035			for (; last != path && *(last - 1) != '/'; last--);
2036				// rewind to the start of the leaf before the '/'
2037		}
2038
2039		// normal leaf: replace the leaf portion of the path with a '.'
2040		if (strlcpy(filename, last, B_FILE_NAME_LENGTH) >= B_FILE_NAME_LENGTH)
2041			return B_NAME_TOO_LONG;
2042
2043		last[0] = '.';
2044		last[1] = '\0';
2045	}
2046	return B_OK;
2047}
2048
2049
2050static status_t
2051entry_ref_to_vnode(dev_t mountID, ino_t directoryID, const char* name,
2052	bool traverse, bool kernel, VnodePutter& _vnode)
2053{
2054	char clonedName[B_FILE_NAME_LENGTH + 1];
2055	if (strlcpy(clonedName, name, B_FILE_NAME_LENGTH) >= B_FILE_NAME_LENGTH)
2056		return B_NAME_TOO_LONG;
2057
2058	// get the directory vnode and let vnode_path_to_vnode() do the rest
2059	struct vnode* directory;
2060
2061	status_t status = get_vnode(mountID, directoryID, &directory, true, false);
2062	if (status < 0)
2063		return status;
2064
2065	return vnode_path_to_vnode(directory, clonedName, traverse, kernel,
2066		_vnode, NULL);
2067}
2068
2069
2070/*!	Looks up the entry with name \a name in the directory represented by \a dir
2071	and returns the respective vnode.
2072	On success a reference to the vnode is acquired for the caller.
2073*/
2074static status_t
2075lookup_dir_entry(struct vnode* dir, const char* name, struct vnode** _vnode)
2076{
2077	ino_t id;
2078	bool missing;
2079
2080	if (dir->mount->entry_cache.Lookup(dir->id, name, id, missing)) {
2081		return missing ? B_ENTRY_NOT_FOUND
2082			: get_vnode(dir->device, id, _vnode, true, false);
2083	}
2084
2085	status_t status = FS_CALL(dir, lookup, name, &id);
2086	if (status != B_OK)
2087		return status;
2088
2089	// The lookup() hook calls get_vnode() or publish_vnode(), so we do already
2090	// have a reference and just need to look the node up.
2091	rw_lock_read_lock(&sVnodeLock);
2092	*_vnode = lookup_vnode(dir->device, id);
2093	rw_lock_read_unlock(&sVnodeLock);
2094
2095	if (*_vnode == NULL) {
2096		panic("lookup_dir_entry(): could not lookup vnode (mountid 0x%" B_PRIx32
2097			" vnid 0x%" B_PRIx64 ")\n", dir->device, id);
2098		return B_ENTRY_NOT_FOUND;
2099	}
2100
2101//	ktrace_printf("lookup_dir_entry(): dir: %p (%ld, %lld), name: \"%s\" -> "
2102//		"%p (%ld, %lld)", dir, dir->mount->id, dir->id, name, *_vnode,
2103//		(*_vnode)->mount->id, (*_vnode)->id);
2104
2105	return B_OK;
2106}
2107
2108
2109/*!	Returns the vnode for the relative \a path starting at the specified \a vnode.
2110
2111	\param[in,out] path The relative path being searched. Must not be NULL.
2112	If the function returns successfully, \a path contains the name of the last path
2113	component. This function clobbers the buffer pointed to by \a path only
2114	if it does contain more than one component.
2115
2116	If the function fails and leafName is not NULL, \a _vnode contains the last directory,
2117	the caller has the responsibility to call put_vnode() on it.
2118
2119	Note, this reduces the ref_count of the starting \a vnode, no matter if
2120	it is successful or not!
2121
2122	\param[out] _vnode If the function returns B_OK, points to the found node.
2123	\param[out] _vnode If the function returns something else and leafname is not NULL: set to the
2124		last existing directory in the path. The caller has responsibility to release it using
2125		put_vnode().
2126	\param[out] _vnode If the function returns something else and leafname is NULL: not used.
2127*/
2128static status_t
2129vnode_path_to_vnode(struct vnode* start, char* path, bool traverseLeafLink,
2130	int count, struct io_context* ioContext, VnodePutter& _vnode,
2131	ino_t* _parentID, char* leafName)
2132{
2133	FUNCTION(("vnode_path_to_vnode(vnode = %p, path = %s)\n", vnode, path));
2134	ASSERT(!_vnode.IsSet());
2135
2136	VnodePutter vnode(start);
2137
2138	if (path == NULL)
2139		return B_BAD_VALUE;
2140	if (*path == '\0')
2141		return B_ENTRY_NOT_FOUND;
2142
2143	status_t status = B_OK;
2144	ino_t lastParentID = vnode->id;
2145	while (true) {
2146		char* nextPath;
2147
2148		TRACE(("vnode_path_to_vnode: top of loop. p = %p, p = '%s'\n", path,
2149			path));
2150
2151		// done?
2152		if (path[0] == '\0')
2153			break;
2154
2155		// walk to find the next path component ("path" will point to a single
2156		// path component), and filter out multiple slashes
2157		for (nextPath = path + 1; *nextPath != '\0' && *nextPath != '/';
2158				nextPath++);
2159
2160		bool directoryFound = false;
2161		if (*nextPath == '/') {
2162			directoryFound = true;
2163			*nextPath = '\0';
2164			do
2165				nextPath++;
2166			while (*nextPath == '/');
2167		}
2168
2169		// See if the '..' is at a covering vnode move to the covered
2170		// vnode so we pass the '..' path to the underlying filesystem.
2171		// Also prevent breaking the root of the IO context.
2172		if (strcmp("..", path) == 0) {
2173			if (vnode.Get() == ioContext->root) {
2174				// Attempted prison break! Keep it contained.
2175				path = nextPath;
2176				continue;
2177			}
2178
2179			if (Vnode* coveredVnode = get_covered_vnode(vnode.Get()))
2180				vnode.SetTo(coveredVnode);
2181		}
2182
2183		// check if vnode is really a directory
2184		if (status == B_OK && !S_ISDIR(vnode->Type()))
2185			status = B_NOT_A_DIRECTORY;
2186
2187		// Check if we have the right to search the current directory vnode.
2188		// If a file system doesn't have the access() function, we assume that
2189		// searching a directory is always allowed
2190		if (status == B_OK && HAS_FS_CALL(vnode, access))
2191			status = FS_CALL(vnode.Get(), access, X_OK);
2192
2193		// Tell the filesystem to get the vnode of this path component (if we
2194		// got the permission from the call above)
2195		VnodePutter nextVnode;
2196		if (status == B_OK) {
2197			struct vnode* temp = NULL;
2198			status = lookup_dir_entry(vnode.Get(), path, &temp);
2199			nextVnode.SetTo(temp);
2200		}
2201
2202		if (status != B_OK) {
2203			if (leafName != NULL) {
2204				strlcpy(leafName, path, B_FILE_NAME_LENGTH);
2205				_vnode.SetTo(vnode.Detach());
2206			}
2207			return status;
2208		}
2209
2210		// If the new node is a symbolic link, resolve it (if we've been told
2211		// to do it)
2212		if (S_ISLNK(nextVnode->Type())
2213			&& (traverseLeafLink || directoryFound)) {
2214			size_t bufferSize;
2215			char* buffer;
2216
2217			TRACE(("traverse link\n"));
2218
2219			if (count + 1 > B_MAX_SYMLINKS)
2220				return B_LINK_LIMIT;
2221
2222			bufferSize = B_PATH_NAME_LENGTH;
2223			buffer = (char*)object_cache_alloc(sPathNameCache, 0);
2224			if (buffer == NULL)
2225				return B_NO_MEMORY;
2226
2227			if (HAS_FS_CALL(nextVnode, read_symlink)) {
2228				bufferSize--;
2229				status = FS_CALL(nextVnode.Get(), read_symlink, buffer, &bufferSize);
2230				// null-terminate
2231				if (status >= 0 && bufferSize < B_PATH_NAME_LENGTH)
2232					buffer[bufferSize] = '\0';
2233			} else
2234				status = B_BAD_VALUE;
2235
2236			if (status != B_OK) {
2237				free(buffer);
2238				return status;
2239			}
2240			nextVnode.Unset();
2241
2242			// Check if we start from the root directory or the current
2243			// directory ("vnode" still points to that one).
2244			// Cut off all leading slashes if it's the root directory
2245			path = buffer;
2246			bool absoluteSymlink = false;
2247			if (path[0] == '/') {
2248				// we don't need the old directory anymore
2249				vnode.Unset();
2250
2251				while (*++path == '/')
2252					;
2253
2254				mutex_lock(&sIOContextRootLock);
2255				vnode.SetTo(ioContext->root);
2256				inc_vnode_ref_count(vnode.Get());
2257				mutex_unlock(&sIOContextRootLock);
2258
2259				absoluteSymlink = true;
2260			}
2261
2262			inc_vnode_ref_count(vnode.Get());
2263				// balance the next recursion - we will decrement the
2264				// ref_count of the vnode, no matter if we succeeded or not
2265
2266			if (absoluteSymlink && *path == '\0') {
2267				// symlink was just "/"
2268				nextVnode.SetTo(vnode.Get());
2269			} else {
2270				status = vnode_path_to_vnode(vnode.Get(), path, true, count + 1,
2271					ioContext, nextVnode, &lastParentID, leafName);
2272			}
2273
2274			object_cache_free(sPathNameCache, buffer, 0);
2275
2276			if (status != B_OK) {
2277				if (leafName != NULL)
2278					_vnode.SetTo(nextVnode.Detach());
2279				return status;
2280			}
2281		} else
2282			lastParentID = vnode->id;
2283
2284		// decrease the ref count on the old dir we just looked up into
2285		vnode.Unset();
2286
2287		path = nextPath;
2288		vnode.SetTo(nextVnode.Detach());
2289
2290		// see if we hit a covered node
2291		if (Vnode* coveringNode = get_covering_vnode(vnode.Get()))
2292			vnode.SetTo(coveringNode);
2293	}
2294
2295	_vnode.SetTo(vnode.Detach());
2296	if (_parentID)
2297		*_parentID = lastParentID;
2298
2299	return B_OK;
2300}
2301
2302
2303static status_t
2304vnode_path_to_vnode(struct vnode* vnode, char* path, bool traverseLeafLink,
2305	bool kernel, VnodePutter& _vnode, ino_t* _parentID, char* leafName)
2306{
2307	return vnode_path_to_vnode(vnode, path, traverseLeafLink, 0,
2308		get_current_io_context(kernel), _vnode, _parentID, leafName);
2309}
2310
2311
2312static status_t
2313path_to_vnode(char* path, bool traverseLink, VnodePutter& _vnode,
2314	ino_t* _parentID, bool kernel)
2315{
2316	struct vnode* start = NULL;
2317
2318	FUNCTION(("path_to_vnode(path = \"%s\")\n", path));
2319
2320	if (!path)
2321		return B_BAD_VALUE;
2322
2323	if (*path == '\0')
2324		return B_ENTRY_NOT_FOUND;
2325
2326	// figure out if we need to start at root or at cwd
2327	if (*path == '/') {
2328		if (sRoot == NULL) {
2329			// we're a bit early, aren't we?
2330			return B_ERROR;
2331		}
2332
2333		while (*++path == '/')
2334			;
2335		start = get_root_vnode(kernel);
2336
2337		if (*path == '\0') {
2338			_vnode.SetTo(start);
2339			return B_OK;
2340		}
2341
2342	} else {
2343		struct io_context* context = get_current_io_context(kernel);
2344
2345		mutex_lock(&context->io_mutex);
2346		start = context->cwd;
2347		if (start != NULL)
2348			inc_vnode_ref_count(start);
2349		mutex_unlock(&context->io_mutex);
2350
2351		if (start == NULL)
2352			return B_ERROR;
2353	}
2354
2355	return vnode_path_to_vnode(start, path, traverseLink, kernel, _vnode,
2356		_parentID);
2357}
2358
2359
2360/*! Returns the vnode in the next to last segment of the path, and returns
2361	the last portion in filename.
2362	The path buffer must be able to store at least one additional character.
2363*/
2364static status_t
2365path_to_dir_vnode(char* path, VnodePutter& _vnode, char* filename,
2366	bool kernel)
2367{
2368	status_t status = get_dir_path_and_leaf(path, filename);
2369	if (status != B_OK)
2370		return status;
2371
2372	return path_to_vnode(path, true, _vnode, NULL, kernel);
2373}
2374
2375
2376/*!	\brief Retrieves the directory vnode and the leaf name of an entry referred
2377		   to by a FD + path pair.
2378
2379	\a path must be given in either case. \a fd might be omitted, in which
2380	case \a path is either an absolute path or one relative to the current
2381	directory. If both a supplied and \a path is relative it is reckoned off
2382	of the directory referred to by \a fd. If \a path is absolute \a fd is
2383	ignored.
2384
2385	The caller has the responsibility to call put_vnode() on the returned
2386	directory vnode.
2387
2388	\param fd The FD. May be < 0.
2389	\param path The absolute or relative path. Must not be \c NULL. The buffer
2390	       is modified by this function. It must have at least room for a
2391	       string one character longer than the path it contains.
2392	\param _vnode A pointer to a variable the directory vnode shall be written
2393		   into.
2394	\param filename A buffer of size B_FILE_NAME_LENGTH or larger into which
2395		   the leaf name of the specified entry will be written.
2396	\param kernel \c true, if invoked from inside the kernel, \c false if
2397		   invoked from userland.
2398	\return \c B_OK, if everything went fine, another error code otherwise.
2399*/
2400static status_t
2401fd_and_path_to_dir_vnode(int fd, char* path, VnodePutter& _vnode,
2402	char* filename, bool kernel)
2403{
2404	if (!path)
2405		return B_BAD_VALUE;
2406	if (*path == '\0')
2407		return B_ENTRY_NOT_FOUND;
2408	if (fd < 0)
2409		return path_to_dir_vnode(path, _vnode, filename, kernel);
2410
2411	status_t status = get_dir_path_and_leaf(path, filename);
2412	if (status != B_OK)
2413		return status;
2414
2415	return fd_and_path_to_vnode(fd, path, true, _vnode, NULL, kernel);
2416}
2417
2418
2419/*!	\brief Retrieves the directory vnode and the leaf name of an entry referred
2420		   to by a vnode + path pair.
2421
2422	\a path must be given in either case. \a vnode might be omitted, in which
2423	case \a path is either an absolute path or one relative to the current
2424	directory. If both a supplied and \a path is relative it is reckoned off
2425	of the directory referred to by \a vnode. If \a path is absolute \a vnode is
2426	ignored.
2427
2428	The caller has the responsibility to call put_vnode() on the returned
2429	directory vnode.
2430
2431	Note, this reduces the ref_count of the starting \a vnode, no matter if
2432	it is successful or not.
2433
2434	\param vnode The vnode. May be \c NULL.
2435	\param path The absolute or relative path. Must not be \c NULL. The buffer
2436	       is modified by this function. It must have at least room for a
2437	       string one character longer than the path it contains.
2438	\param _vnode A pointer to a variable the directory vnode shall be written
2439		   into.
2440	\param filename A buffer of size B_FILE_NAME_LENGTH or larger into which
2441		   the leaf name of the specified entry will be written.
2442	\param kernel \c true, if invoked from inside the kernel, \c false if
2443		   invoked from userland.
2444	\return \c B_OK, if everything went fine, another error code otherwise.
2445*/
2446static status_t
2447vnode_and_path_to_dir_vnode(struct vnode* vnode, char* path,
2448	VnodePutter& _vnode, char* filename, bool kernel)
2449{
2450	VnodePutter vnodePutter(vnode);
2451
2452	if (!path)
2453		return B_BAD_VALUE;
2454	if (*path == '\0')
2455		return B_ENTRY_NOT_FOUND;
2456	if (vnode == NULL || path[0] == '/')
2457		return path_to_dir_vnode(path, _vnode, filename, kernel);
2458
2459	status_t status = get_dir_path_and_leaf(path, filename);
2460	if (status != B_OK)
2461		return status;
2462
2463	vnodePutter.Detach();
2464	return vnode_path_to_vnode(vnode, path, true, kernel, _vnode, NULL);
2465}
2466
2467
2468/*! Returns a vnode's name in the d_name field of a supplied dirent buffer.
2469*/
2470static status_t
2471get_vnode_name(struct vnode* vnode, struct vnode* parent, struct dirent* buffer,
2472	size_t bufferSize, struct io_context* ioContext)
2473{
2474	if (bufferSize < sizeof(struct dirent))
2475		return B_BAD_VALUE;
2476
2477	// See if the vnode is covering another vnode and move to the covered
2478	// vnode so we get the underlying file system
2479	VnodePutter vnodePutter;
2480	if (Vnode* coveredVnode = get_covered_vnode(vnode)) {
2481		vnode = coveredVnode;
2482		vnodePutter.SetTo(vnode);
2483	}
2484
2485	if (HAS_FS_CALL(vnode, get_vnode_name)) {
2486		// The FS supports getting the name of a vnode.
2487		if (FS_CALL(vnode, get_vnode_name, buffer->d_name,
2488			(char*)buffer + bufferSize - buffer->d_name) == B_OK)
2489			return B_OK;
2490	}
2491
2492	// The FS doesn't support getting the name of a vnode. So we search the
2493	// parent directory for the vnode, if the caller let us.
2494
2495	if (parent == NULL || !HAS_FS_CALL(parent, read_dir))
2496		return B_UNSUPPORTED;
2497
2498	void* cookie;
2499
2500	status_t status = FS_CALL(parent, open_dir, &cookie);
2501	if (status >= B_OK) {
2502		while (true) {
2503			uint32 num = 1;
2504			// We use the FS hook directly instead of dir_read(), since we don't
2505			// want the entries to be fixed. We have already resolved vnode to
2506			// the covered node.
2507			status = FS_CALL(parent, read_dir, cookie, buffer, bufferSize,
2508				&num);
2509			if (status != B_OK)
2510				break;
2511			if (num == 0) {
2512				status = B_ENTRY_NOT_FOUND;
2513				break;
2514			}
2515
2516			if (vnode->id == buffer->d_ino) {
2517				// found correct entry!
2518				break;
2519			}
2520		}
2521
2522		FS_CALL(parent, close_dir, cookie);
2523		FS_CALL(parent, free_dir_cookie, cookie);
2524	}
2525	return status;
2526}
2527
2528
2529static status_t
2530get_vnode_name(struct vnode* vnode, struct vnode* parent, char* name,
2531	size_t nameSize, bool kernel)
2532{
2533	char buffer[offsetof(struct dirent, d_name) + B_FILE_NAME_LENGTH + 1];
2534	struct dirent* dirent = (struct dirent*)buffer;
2535
2536	status_t status = get_vnode_name(vnode, parent, dirent, sizeof(buffer),
2537		get_current_io_context(kernel));
2538	if (status != B_OK)
2539		return status;
2540
2541	if (strlcpy(name, dirent->d_name, nameSize) >= nameSize)
2542		return B_BUFFER_OVERFLOW;
2543
2544	return B_OK;
2545}
2546
2547
2548/*!	Gets the full path to a given directory vnode.
2549	It uses the fs_get_vnode_name() call to get the name of a vnode; if a
2550	file system doesn't support this call, it will fall back to iterating
2551	through the parent directory to get the name of the child.
2552
2553	To protect against circular loops, it supports a maximum tree depth
2554	of 256 levels.
2555
2556	Note that the path may not be correct the time this function returns!
2557	It doesn't use any locking to prevent returning the correct path, as
2558	paths aren't safe anyway: the path to a file can change at any time.
2559
2560	It might be a good idea, though, to check if the returned path exists
2561	in the calling function (it's not done here because of efficiency)
2562*/
2563static status_t
2564dir_vnode_to_path(struct vnode* vnode, char* buffer, size_t bufferSize,
2565	bool kernel)
2566{
2567	FUNCTION(("dir_vnode_to_path(%p, %p, %lu)\n", vnode, buffer, bufferSize));
2568
2569	if (vnode == NULL || buffer == NULL || bufferSize == 0)
2570		return B_BAD_VALUE;
2571
2572	if (!S_ISDIR(vnode->Type()))
2573		return B_NOT_A_DIRECTORY;
2574
2575	char* path = buffer;
2576	int32 insert = bufferSize;
2577	int32 maxLevel = 256;
2578	int32 length;
2579	status_t status = B_OK;
2580	struct io_context* ioContext = get_current_io_context(kernel);
2581
2582	// we don't use get_vnode() here because this call is more
2583	// efficient and does all we need from get_vnode()
2584	inc_vnode_ref_count(vnode);
2585
2586	path[--insert] = '\0';
2587		// the path is filled right to left
2588
2589	while (true) {
2590		// If the node is the context's root, bail out. Otherwise resolve mount
2591		// points.
2592		if (vnode == ioContext->root)
2593			break;
2594
2595		if (Vnode* coveredVnode = get_covered_vnode(vnode)) {
2596			put_vnode(vnode);
2597			vnode = coveredVnode;
2598		}
2599
2600		// lookup the parent vnode
2601		struct vnode* parentVnode;
2602		status = lookup_dir_entry(vnode, "..", &parentVnode);
2603		if (status != B_OK)
2604			goto out;
2605
2606		if (parentVnode == vnode) {
2607			// The caller apparently got their hands on a node outside of their
2608			// context's root. Now we've hit the global root.
2609			put_vnode(parentVnode);
2610			break;
2611		}
2612
2613		// get the node's name
2614		char nameBuffer[offsetof(struct dirent, d_name) + B_FILE_NAME_LENGTH + 1];
2615			// also used for fs_read_dir()
2616		char* name = &((struct dirent*)nameBuffer)->d_name[0];
2617		status = get_vnode_name(vnode, parentVnode, (struct dirent*)nameBuffer,
2618			sizeof(nameBuffer), ioContext);
2619
2620		// release the current vnode, we only need its parent from now on
2621		put_vnode(vnode);
2622		vnode = parentVnode;
2623
2624		if (status != B_OK)
2625			goto out;
2626
2627		// TODO: add an explicit check for loops in about 10 levels to do
2628		// real loop detection
2629
2630		// don't go deeper as 'maxLevel' to prevent circular loops
2631		if (maxLevel-- < 0) {
2632			status = B_LINK_LIMIT;
2633			goto out;
2634		}
2635
2636		// add the name in front of the current path
2637		name[B_FILE_NAME_LENGTH - 1] = '\0';
2638		length = strlen(name);
2639		insert -= length;
2640		if (insert <= 0) {
2641			status = B_RESULT_NOT_REPRESENTABLE;
2642			goto out;
2643		}
2644		memcpy(path + insert, name, length);
2645		path[--insert] = '/';
2646	}
2647
2648	// the root dir will result in an empty path: fix it
2649	if (path[insert] == '\0')
2650		path[--insert] = '/';
2651
2652	TRACE(("  path is: %s\n", path + insert));
2653
2654	// move the path to the start of the buffer
2655	length = bufferSize - insert;
2656	memmove(buffer, path + insert, length);
2657
2658out:
2659	put_vnode(vnode);
2660	return status;
2661}
2662
2663
2664/*!	Checks the length of every path component, and adds a '.'
2665	if the path ends in a slash.
2666	The given path buffer must be able to store at least one
2667	additional character.
2668*/
2669static status_t
2670check_path(char* to)
2671{
2672	int32 length = 0;
2673
2674	// check length of every path component
2675
2676	while (*to) {
2677		char* begin;
2678		if (*to == '/')
2679			to++, length++;
2680
2681		begin = to;
2682		while (*to != '/' && *to)
2683			to++, length++;
2684
2685		if (to - begin > B_FILE_NAME_LENGTH)
2686			return B_NAME_TOO_LONG;
2687	}
2688
2689	if (length == 0)
2690		return B_ENTRY_NOT_FOUND;
2691
2692	// complete path if there is a slash at the end
2693
2694	if (*(to - 1) == '/') {
2695		if (length > B_PATH_NAME_LENGTH - 2)
2696			return B_NAME_TOO_LONG;
2697
2698		to[0] = '.';
2699		to[1] = '\0';
2700	}
2701
2702	return B_OK;
2703}
2704
2705
2706static struct file_descriptor*
2707get_fd_and_vnode(int fd, struct vnode** _vnode, bool kernel)
2708{
2709	struct file_descriptor* descriptor
2710		= get_fd(get_current_io_context(kernel), fd);
2711	if (descriptor == NULL)
2712		return NULL;
2713
2714	struct vnode* vnode = fd_vnode(descriptor);
2715	if (vnode == NULL) {
2716		put_fd(descriptor);
2717		return NULL;
2718	}
2719
2720	// ToDo: when we can close a file descriptor at any point, investigate
2721	//	if this is still valid to do (accessing the vnode without ref_count
2722	//	or locking)
2723	*_vnode = vnode;
2724	return descriptor;
2725}
2726
2727
2728static struct vnode*
2729get_vnode_from_fd(int fd, bool kernel)
2730{
2731	struct file_descriptor* descriptor;
2732	struct vnode* vnode;
2733
2734	descriptor = get_fd(get_current_io_context(kernel), fd);
2735	if (descriptor == NULL)
2736		return NULL;
2737
2738	vnode = fd_vnode(descriptor);
2739	if (vnode != NULL)
2740		inc_vnode_ref_count(vnode);
2741
2742	put_fd(descriptor);
2743	return vnode;
2744}
2745
2746
2747/*!	Gets the vnode from an FD + path combination. If \a fd is lower than zero,
2748	only the path will be considered. In this case, the \a path must not be
2749	NULL.
2750	If \a fd is a valid file descriptor, \a path may be NULL for directories,
2751	and should be NULL for files.
2752*/
2753static status_t
2754fd_and_path_to_vnode(int fd, char* path, bool traverseLeafLink,
2755	VnodePutter& _vnode, ino_t* _parentID, bool kernel)
2756{
2757	if (fd < 0 && !path)
2758		return B_BAD_VALUE;
2759
2760	if (path != NULL && *path == '\0')
2761		return B_ENTRY_NOT_FOUND;
2762
2763	if (fd < 0 || (path != NULL && path[0] == '/')) {
2764		// no FD or absolute path
2765		return path_to_vnode(path, traverseLeafLink, _vnode, _parentID, kernel);
2766	}
2767
2768	// FD only, or FD + relative path
2769	struct vnode* vnode = get_vnode_from_fd(fd, kernel);
2770	if (vnode == NULL)
2771		return B_FILE_ERROR;
2772
2773	if (path != NULL) {
2774		return vnode_path_to_vnode(vnode, path, traverseLeafLink, kernel,
2775			_vnode, _parentID);
2776	}
2777
2778	// there is no relative path to take into account
2779
2780	_vnode.SetTo(vnode);
2781	if (_parentID)
2782		*_parentID = -1;
2783
2784	return B_OK;
2785}
2786
2787
2788static int
2789get_new_fd(int type, struct fs_mount* mount, struct vnode* vnode,
2790	void* cookie, int openMode, bool kernel)
2791{
2792	struct file_descriptor* descriptor;
2793	int fd;
2794
2795	// If the vnode is locked, we don't allow creating a new file/directory
2796	// file_descriptor for it
2797	if (vnode && vnode->mandatory_locked_by != NULL
2798		&& (type == FDTYPE_FILE || type == FDTYPE_DIR))
2799		return B_BUSY;
2800
2801	if ((openMode & O_RDWR) != 0 && (openMode & O_WRONLY) != 0)
2802		return B_BAD_VALUE;
2803
2804	descriptor = alloc_fd();
2805	if (!descriptor)
2806		return B_NO_MEMORY;
2807
2808	if (vnode)
2809		descriptor->u.vnode = vnode;
2810	else
2811		descriptor->u.mount = mount;
2812	descriptor->cookie = cookie;
2813
2814	switch (type) {
2815		// vnode types
2816		case FDTYPE_FILE:
2817			descriptor->ops = &sFileOps;
2818			break;
2819		case FDTYPE_DIR:
2820			descriptor->ops = &sDirectoryOps;
2821			break;
2822		case FDTYPE_ATTR:
2823			descriptor->ops = &sAttributeOps;
2824			break;
2825		case FDTYPE_ATTR_DIR:
2826			descriptor->ops = &sAttributeDirectoryOps;
2827			break;
2828
2829		// mount types
2830		case FDTYPE_INDEX_DIR:
2831			descriptor->ops = &sIndexDirectoryOps;
2832			break;
2833		case FDTYPE_QUERY:
2834			descriptor->ops = &sQueryOps;
2835			break;
2836
2837		default:
2838			panic("get_new_fd() called with unknown type %d\n", type);
2839			break;
2840	}
2841	descriptor->type = type;
2842	descriptor->open_mode = openMode;
2843
2844	if (descriptor->ops->fd_seek != NULL) {
2845		// some kinds of files are not seekable
2846		switch (vnode->Type() & S_IFMT) {
2847			case S_IFIFO:
2848			case S_IFSOCK:
2849				ASSERT(descriptor->pos == -1);
2850				break;
2851
2852			// The Open Group Base Specs don't mention any file types besides pipes,
2853			// FIFOs, and sockets specially, so we allow seeking all others.
2854			default:
2855				descriptor->pos = 0;
2856				break;
2857		}
2858	}
2859
2860	io_context* context = get_current_io_context(kernel);
2861	fd = new_fd(context, descriptor);
2862	if (fd < 0) {
2863		descriptor->ops = NULL;
2864		put_fd(descriptor);
2865		return B_NO_MORE_FDS;
2866	}
2867
2868	mutex_lock(&context->io_mutex);
2869	fd_set_close_on_exec(context, fd, (openMode & O_CLOEXEC) != 0);
2870	mutex_unlock(&context->io_mutex);
2871
2872	return fd;
2873}
2874
2875
2876/*!	In-place normalizes \a path. It's otherwise semantically equivalent to
2877	vfs_normalize_path(). See there for more documentation.
2878*/
2879static status_t
2880normalize_path(char* path, size_t pathSize, bool traverseLink, bool kernel)
2881{
2882	VnodePutter dir;
2883	status_t error;
2884
2885	for (int i = 0; i < B_MAX_SYMLINKS; i++) {
2886		// get dir vnode + leaf name
2887		char leaf[B_FILE_NAME_LENGTH];
2888		error = vnode_and_path_to_dir_vnode(dir.Detach(), path, dir, leaf, kernel);
2889		if (error != B_OK)
2890			return error;
2891		strcpy(path, leaf);
2892
2893		// get file vnode, if we shall resolve links
2894		bool fileExists = false;
2895		VnodePutter fileVnode;
2896		if (traverseLink) {
2897			inc_vnode_ref_count(dir.Get());
2898			if (vnode_path_to_vnode(dir.Get(), path, false, kernel, fileVnode,
2899					NULL) == B_OK) {
2900				fileExists = true;
2901			}
2902		}
2903
2904		if (!fileExists || !traverseLink || !S_ISLNK(fileVnode->Type())) {
2905			// we're done -- construct the path
2906			bool hasLeaf = true;
2907			if (strcmp(leaf, ".") == 0 || strcmp(leaf, "..") == 0) {
2908				// special cases "." and ".." -- get the dir, forget the leaf
2909				error = vnode_path_to_vnode(dir.Detach(), leaf, false, kernel,
2910					dir, NULL);
2911				if (error != B_OK)
2912					return error;
2913				hasLeaf = false;
2914			}
2915
2916			// get the directory path
2917			error = dir_vnode_to_path(dir.Get(), path, B_PATH_NAME_LENGTH, kernel);
2918			if (error != B_OK)
2919				return error;
2920
2921			// append the leaf name
2922			if (hasLeaf) {
2923				// insert a directory separator if this is not the file system
2924				// root
2925				if ((strcmp(path, "/") != 0
2926					&& strlcat(path, "/", pathSize) >= pathSize)
2927					|| strlcat(path, leaf, pathSize) >= pathSize) {
2928					return B_NAME_TOO_LONG;
2929				}
2930			}
2931
2932			return B_OK;
2933		}
2934
2935		// read link
2936		if (HAS_FS_CALL(fileVnode, read_symlink)) {
2937			size_t bufferSize = B_PATH_NAME_LENGTH - 1;
2938			error = FS_CALL(fileVnode.Get(), read_symlink, path, &bufferSize);
2939			if (error != B_OK)
2940				return error;
2941			if (bufferSize < B_PATH_NAME_LENGTH)
2942				path[bufferSize] = '\0';
2943		} else
2944			return B_BAD_VALUE;
2945	}
2946
2947	return B_LINK_LIMIT;
2948}
2949
2950
2951static status_t
2952resolve_covered_parent(struct vnode* parent, dev_t* _device, ino_t* _node,
2953	struct io_context* ioContext)
2954{
2955	// Make sure the IO context root is not bypassed.
2956	if (parent == ioContext->root) {
2957		*_device = parent->device;
2958		*_node = parent->id;
2959		return B_OK;
2960	}
2961
2962	inc_vnode_ref_count(parent);
2963		// vnode_path_to_vnode() puts the node
2964
2965	// ".." is guaranteed not to be clobbered by this call
2966	VnodePutter vnode;
2967	status_t status = vnode_path_to_vnode(parent, (char*)"..", false,
2968		ioContext, vnode, NULL);
2969	if (status == B_OK) {
2970		*_device = vnode->device;
2971		*_node = vnode->id;
2972	}
2973
2974	return status;
2975}
2976
2977
2978#ifdef ADD_DEBUGGER_COMMANDS
2979
2980
2981static void
2982_dump_advisory_locking(advisory_locking* locking)
2983{
2984	if (locking == NULL)
2985		return;
2986
2987	kprintf("   lock:        %" B_PRId32, locking->lock);
2988	kprintf("   wait_sem:    %" B_PRId32, locking->wait_sem);
2989
2990	int32 index = 0;
2991	LockList::Iterator iterator = locking->locks.GetIterator();
2992	while (iterator.HasNext()) {
2993		struct advisory_lock* lock = iterator.Next();
2994
2995		kprintf("   [%2" B_PRId32 "] team:   %" B_PRId32 "\n", index++, lock->team);
2996		kprintf("        start:  %" B_PRIdOFF "\n", lock->start);
2997		kprintf("        end:    %" B_PRIdOFF "\n", lock->end);
2998		kprintf("        shared? %s\n", lock->shared ? "yes" : "no");
2999	}
3000}
3001
3002
3003static void
3004_dump_mount(struct fs_mount* mount)
3005{
3006	kprintf("MOUNT: %p\n", mount);
3007	kprintf(" id:            %" B_PRIdDEV "\n", mount->id);
3008	kprintf(" device_name:   %s\n", mount->device_name);
3009	kprintf(" root_vnode:    %p\n", mount->root_vnode);
3010	kprintf(" covers:        %p\n", mount->root_vnode->covers);
3011	kprintf(" partition:     %p\n", mount->partition);
3012	kprintf(" lock:          %p\n", &mount->lock);
3013	kprintf(" flags:        %s%s\n", mount->unmounting ? " unmounting" : "",
3014		mount->owns_file_device ? " owns_file_device" : "");
3015
3016	fs_volume* volume = mount->volume;
3017	while (volume != NULL) {
3018		kprintf(" volume %p:\n", volume);
3019		kprintf("  layer:            %" B_PRId32 "\n", volume->layer);
3020		kprintf("  private_volume:   %p\n", volume->private_volume);
3021		kprintf("  ops:              %p\n", volume->ops);
3022		kprintf("  file_system:      %p\n", volume->file_system);
3023		kprintf("  file_system_name: %s\n", volume->file_system_name);
3024		volume = volume->super_volume;
3025	}
3026
3027	set_debug_variable("_volume", (addr_t)mount->volume->private_volume);
3028	set_debug_variable("_root", (addr_t)mount->root_vnode);
3029	set_debug_variable("_covers", (addr_t)mount->root_vnode->covers);
3030	set_debug_variable("_partition", (addr_t)mount->partition);
3031}
3032
3033
3034static bool
3035debug_prepend_vnode_name_to_path(char* buffer, size_t& bufferSize,
3036	const char* name)
3037{
3038	bool insertSlash = buffer[bufferSize] != '\0';
3039	size_t nameLength = strlen(name);
3040
3041	if (bufferSize < nameLength + (insertSlash ? 1 : 0))
3042		return false;
3043
3044	if (insertSlash)
3045		buffer[--bufferSize] = '/';
3046
3047	bufferSize -= nameLength;
3048	memcpy(buffer + bufferSize, name, nameLength);
3049
3050	return true;
3051}
3052
3053
3054static bool
3055debug_prepend_vnode_id_to_path(char* buffer, size_t& bufferSize, dev_t devID,
3056	ino_t nodeID)
3057{
3058	if (bufferSize == 0)
3059		return false;
3060
3061	bool insertSlash = buffer[bufferSize] != '\0';
3062	if (insertSlash)
3063		buffer[--bufferSize] = '/';
3064
3065	size_t size = snprintf(buffer, bufferSize,
3066		"<%" B_PRIdDEV ",%" B_PRIdINO ">", devID, nodeID);
3067	if (size > bufferSize) {
3068		if (insertSlash)
3069			bufferSize++;
3070		return false;
3071	}
3072
3073	if (size < bufferSize)
3074		memmove(buffer + bufferSize - size, buffer, size);
3075
3076	bufferSize -= size;
3077	return true;
3078}
3079
3080
3081static char*
3082debug_resolve_vnode_path(struct vnode* vnode, char* buffer, size_t bufferSize,
3083	bool& _truncated)
3084{
3085	// null-terminate the path
3086	buffer[--bufferSize] = '\0';
3087
3088	while (true) {
3089		while (vnode->covers != NULL)
3090			vnode = vnode->covers;
3091
3092		if (vnode == sRoot) {
3093			_truncated = bufferSize == 0;
3094			if (!_truncated)
3095				buffer[--bufferSize] = '/';
3096			return buffer + bufferSize;
3097		}
3098
3099		// resolve the name
3100		ino_t dirID;
3101		const char* name = vnode->mount->entry_cache.DebugReverseLookup(
3102			vnode->id, dirID);
3103		if (name == NULL) {
3104			// Failed to resolve the name -- prepend "<dev,node>/".
3105			_truncated = !debug_prepend_vnode_id_to_path(buffer, bufferSize,
3106				vnode->mount->id, vnode->id);
3107			return buffer + bufferSize;
3108		}
3109
3110		// prepend the name
3111		if (!debug_prepend_vnode_name_to_path(buffer, bufferSize, name)) {
3112			_truncated = true;
3113			return buffer + bufferSize;
3114		}
3115
3116		// resolve the directory node
3117		struct vnode* nextVnode = lookup_vnode(vnode->mount->id, dirID);
3118		if (nextVnode == NULL) {
3119			_truncated = !debug_prepend_vnode_id_to_path(buffer, bufferSize,
3120				vnode->mount->id, dirID);
3121			return buffer + bufferSize;
3122		}
3123
3124		vnode = nextVnode;
3125	}
3126}
3127
3128
3129static void
3130_dump_vnode(struct vnode* vnode, bool printPath)
3131{
3132	kprintf("VNODE: %p\n", vnode);
3133	kprintf(" device:        %" B_PRIdDEV "\n", vnode->device);
3134	kprintf(" id:            %" B_PRIdINO "\n", vnode->id);
3135	kprintf(" ref_count:     %" B_PRId32 "\n", vnode->ref_count);
3136	kprintf(" private_node:  %p\n", vnode->private_node);
3137	kprintf(" mount:         %p\n", vnode->mount);
3138	kprintf(" covered_by:    %p\n", vnode->covered_by);
3139	kprintf(" covers:        %p\n", vnode->covers);
3140	kprintf(" cache:         %p\n", vnode->cache);
3141	kprintf(" type:          %#" B_PRIx32 "\n", vnode->Type());
3142	kprintf(" flags:         %s%s%s\n", vnode->IsRemoved() ? "r" : "-",
3143		vnode->IsBusy() ? "b" : "-", vnode->IsUnpublished() ? "u" : "-");
3144	kprintf(" advisory_lock: %p\n", vnode->advisory_locking);
3145
3146	_dump_advisory_locking(vnode->advisory_locking);
3147
3148	if (printPath) {
3149		void* buffer = debug_malloc(B_PATH_NAME_LENGTH);
3150		if (buffer != NULL) {
3151			bool truncated;
3152			char* path = debug_resolve_vnode_path(vnode, (char*)buffer,
3153				B_PATH_NAME_LENGTH, truncated);
3154			if (path != NULL) {
3155				kprintf(" path:          ");
3156				if (truncated)
3157					kputs("<truncated>/");
3158				kputs(path);
3159				kputs("\n");
3160			} else
3161				kprintf("Failed to resolve vnode path.\n");
3162
3163			debug_free(buffer);
3164		} else
3165			kprintf("Failed to allocate memory for constructing the path.\n");
3166	}
3167
3168	set_debug_variable("_node", (addr_t)vnode->private_node);
3169	set_debug_variable("_mount", (addr_t)vnode->mount);
3170	set_debug_variable("_covered_by", (addr_t)vnode->covered_by);
3171	set_debug_variable("_covers", (addr_t)vnode->covers);
3172	set_debug_variable("_adv_lock", (addr_t)vnode->advisory_locking);
3173}
3174
3175
3176static int
3177dump_mount(int argc, char** argv)
3178{
3179	if (argc != 2 || !strcmp(argv[1], "--help")) {
3180		kprintf("usage: %s [id|address]\n", argv[0]);
3181		return 0;
3182	}
3183
3184	ulong val = parse_expression(argv[1]);
3185	uint32 id = val;
3186
3187	struct fs_mount* mount = sMountsTable->Lookup(id);
3188	if (mount == NULL) {
3189		if (IS_USER_ADDRESS(id)) {
3190			kprintf("fs_mount not found\n");
3191			return 0;
3192		}
3193		mount = (fs_mount*)val;
3194	}
3195
3196	_dump_mount(mount);
3197	return 0;
3198}
3199
3200
3201static int
3202dump_mounts(int argc, char** argv)
3203{
3204	if (argc != 1) {
3205		kprintf("usage: %s\n", argv[0]);
3206		return 0;
3207	}
3208
3209	kprintf("%-*s    id %-*s   %-*s   %-*s   fs_name\n",
3210		B_PRINTF_POINTER_WIDTH, "address", B_PRINTF_POINTER_WIDTH, "root",
3211		B_PRINTF_POINTER_WIDTH, "covers", B_PRINTF_POINTER_WIDTH, "cookie");
3212
3213	struct fs_mount* mount;
3214
3215	MountTable::Iterator iterator(sMountsTable);
3216	while (iterator.HasNext()) {
3217		mount = iterator.Next();
3218		kprintf("%p%4" B_PRIdDEV " %p %p %p %s\n", mount, mount->id, mount->root_vnode,
3219			mount->root_vnode->covers, mount->volume->private_volume,
3220			mount->volume->file_system_name);
3221
3222		fs_volume* volume = mount->volume;
3223		while (volume->super_volume != NULL) {
3224			volume = volume->super_volume;
3225			kprintf("                                     %p %s\n",
3226				volume->private_volume, volume->file_system_name);
3227		}
3228	}
3229
3230	return 0;
3231}
3232
3233
3234static int
3235dump_vnode(int argc, char** argv)
3236{
3237	bool printPath = false;
3238	int argi = 1;
3239	if (argc >= 2 && strcmp(argv[argi], "-p") == 0) {
3240		printPath = true;
3241		argi++;
3242	}
3243
3244	if (argi >= argc || argi + 2 < argc) {
3245		print_debugger_command_usage(argv[0]);
3246		return 0;
3247	}
3248
3249	struct vnode* vnode = NULL;
3250
3251	if (argi + 1 == argc) {
3252		vnode = (struct vnode*)parse_expression(argv[argi]);
3253		if (IS_USER_ADDRESS(vnode)) {
3254			kprintf("invalid vnode address\n");
3255			return 0;
3256		}
3257		_dump_vnode(vnode, printPath);
3258		return 0;
3259	}
3260
3261	dev_t device = parse_expression(argv[argi]);
3262	ino_t id = parse_expression(argv[argi + 1]);
3263
3264	VnodeTable::Iterator iterator(sVnodeTable);
3265	while (iterator.HasNext()) {
3266		vnode = iterator.Next();
3267		if (vnode->id != id || vnode->device != device)
3268			continue;
3269
3270		_dump_vnode(vnode, printPath);
3271	}
3272
3273	return 0;
3274}
3275
3276
3277static int
3278dump_vnodes(int argc, char** argv)
3279{
3280	if (argc != 2 || !strcmp(argv[1], "--help")) {
3281		kprintf("usage: %s [device]\n", argv[0]);
3282		return 0;
3283	}
3284
3285	// restrict dumped nodes to a certain device if requested
3286	dev_t device = parse_expression(argv[1]);
3287
3288	struct vnode* vnode;
3289
3290	kprintf("%-*s   dev     inode  ref %-*s   %-*s   %-*s   flags\n",
3291		B_PRINTF_POINTER_WIDTH, "address", B_PRINTF_POINTER_WIDTH, "cache",
3292		B_PRINTF_POINTER_WIDTH, "fs-node", B_PRINTF_POINTER_WIDTH, "locking");
3293
3294	VnodeTable::Iterator iterator(sVnodeTable);
3295	while (iterator.HasNext()) {
3296		vnode = iterator.Next();
3297		if (vnode->device != device)
3298			continue;
3299
3300		kprintf("%p%4" B_PRIdDEV "%10" B_PRIdINO "%5" B_PRId32 " %p %p %p %s%s%s\n",
3301			vnode, vnode->device, vnode->id, vnode->ref_count, vnode->cache,
3302			vnode->private_node, vnode->advisory_locking,
3303			vnode->IsRemoved() ? "r" : "-", vnode->IsBusy() ? "b" : "-",
3304			vnode->IsUnpublished() ? "u" : "-");
3305	}
3306
3307	return 0;
3308}
3309
3310
3311static int
3312dump_vnode_caches(int argc, char** argv)
3313{
3314	struct vnode* vnode;
3315
3316	if (argc > 2 || !strcmp(argv[1], "--help")) {
3317		kprintf("usage: %s [device]\n", argv[0]);
3318		return 0;
3319	}
3320
3321	// restrict dumped nodes to a certain device if requested
3322	dev_t device = -1;
3323	if (argc > 1)
3324		device = parse_expression(argv[1]);
3325
3326	kprintf("%-*s   dev     inode %-*s       size   pages\n",
3327		B_PRINTF_POINTER_WIDTH, "address", B_PRINTF_POINTER_WIDTH, "cache");
3328
3329	VnodeTable::Iterator iterator(sVnodeTable);
3330	while (iterator.HasNext()) {
3331		vnode = iterator.Next();
3332		if (vnode->cache == NULL)
3333			continue;
3334		if (device != -1 && vnode->device != device)
3335			continue;
3336
3337		kprintf("%p%4" B_PRIdDEV "%10" B_PRIdINO " %p %8" B_PRIdOFF "%8" B_PRId32 "\n",
3338			vnode, vnode->device, vnode->id, vnode->cache,
3339			(vnode->cache->virtual_end + B_PAGE_SIZE - 1) / B_PAGE_SIZE,
3340			vnode->cache->page_count);
3341	}
3342
3343	return 0;
3344}
3345
3346
3347int
3348dump_io_context(int argc, char** argv)
3349{
3350	if (argc > 2 || !strcmp(argv[1], "--help")) {
3351		kprintf("usage: %s [team-id|address]\n", argv[0]);
3352		return 0;
3353	}
3354
3355	struct io_context* context = NULL;
3356
3357	if (argc > 1) {
3358		ulong num = parse_expression(argv[1]);
3359		if (IS_KERNEL_ADDRESS(num))
3360			context = (struct io_context*)num;
3361		else {
3362			Team* team = team_get_team_struct_locked(num);
3363			if (team == NULL) {
3364				kprintf("could not find team with ID %lu\n", num);
3365				return 0;
3366			}
3367			context = (struct io_context*)team->io_context;
3368		}
3369	} else
3370		context = get_current_io_context(true);
3371
3372	kprintf("I/O CONTEXT: %p\n", context);
3373	kprintf(" root vnode:\t%p\n", context->root);
3374	kprintf(" cwd vnode:\t%p\n", context->cwd);
3375	kprintf(" used fds:\t%" B_PRIu32 "\n", context->num_used_fds);
3376	kprintf(" max fds:\t%" B_PRIu32 "\n", context->table_size);
3377
3378	if (context->num_used_fds) {
3379		kprintf("   no.  type    %*s  ref  open  mode         pos    %*s\n",
3380			B_PRINTF_POINTER_WIDTH, "ops", B_PRINTF_POINTER_WIDTH, "cookie");
3381	}
3382
3383	for (uint32 i = 0; i < context->table_size; i++) {
3384		struct file_descriptor* fd = context->fds[i];
3385		if (fd == NULL)
3386			continue;
3387
3388		kprintf("  %3" B_PRIu32 ":  %4" B_PRId32 "  %p  %3" B_PRId32 "  %4"
3389			B_PRIu32 "  %4" B_PRIx32 "  %10" B_PRIdOFF "  %p  %s %p\n", i,
3390			fd->type, fd->ops, fd->ref_count, fd->open_count, fd->open_mode,
3391			fd->pos, fd->cookie,
3392			fd->type >= FDTYPE_INDEX && fd->type <= FDTYPE_QUERY
3393				? "mount" : "vnode",
3394			fd->u.vnode);
3395	}
3396
3397	kprintf(" used monitors:\t%" B_PRIu32 "\n", context->num_monitors);
3398	kprintf(" max monitors:\t%" B_PRIu32 "\n", context->max_monitors);
3399
3400	set_debug_variable("_cwd", (addr_t)context->cwd);
3401
3402	return 0;
3403}
3404
3405
3406int
3407dump_vnode_usage(int argc, char** argv)
3408{
3409	if (argc != 1) {
3410		kprintf("usage: %s\n", argv[0]);
3411		return 0;
3412	}
3413
3414	kprintf("Unused vnodes: %" B_PRIu32 " (max unused %" B_PRIu32 ")\n",
3415		sUnusedVnodes, kMaxUnusedVnodes);
3416
3417	uint32 count = sVnodeTable->CountElements();
3418
3419	kprintf("%" B_PRIu32 " vnodes total (%" B_PRIu32 " in use).\n", count,
3420		count - sUnusedVnodes);
3421	return 0;
3422}
3423
3424#endif	// ADD_DEBUGGER_COMMANDS
3425
3426
3427/*!	Clears memory specified by an iovec array.
3428*/
3429static void
3430zero_iovecs(const iovec* vecs, size_t vecCount, size_t bytes)
3431{
3432	for (size_t i = 0; i < vecCount && bytes > 0; i++) {
3433		size_t length = std::min(vecs[i].iov_len, bytes);
3434		memset(vecs[i].iov_base, 0, length);
3435		bytes -= length;
3436	}
3437}
3438
3439
3440/*!	Does the dirty work of combining the file_io_vecs with the iovecs
3441	and calls the file system hooks to read/write the request to disk.
3442*/
3443static status_t
3444common_file_io_vec_pages(struct vnode* vnode, void* cookie,
3445	const file_io_vec* fileVecs, size_t fileVecCount, const iovec* vecs,
3446	size_t vecCount, uint32* _vecIndex, size_t* _vecOffset, size_t* _numBytes,
3447	bool doWrite)
3448{
3449	if (fileVecCount == 0) {
3450		// There are no file vecs at this offset, so we're obviously trying
3451		// to access the file outside of its bounds
3452		return B_BAD_VALUE;
3453	}
3454
3455	size_t numBytes = *_numBytes;
3456	uint32 fileVecIndex;
3457	size_t vecOffset = *_vecOffset;
3458	uint32 vecIndex = *_vecIndex;
3459	status_t status;
3460	size_t size;
3461
3462	if (!doWrite && vecOffset == 0) {
3463		// now directly read the data from the device
3464		// the first file_io_vec can be read directly
3465		// TODO: we could also write directly
3466
3467		if (fileVecs[0].length < (off_t)numBytes)
3468			size = fileVecs[0].length;
3469		else
3470			size = numBytes;
3471
3472		if (fileVecs[0].offset >= 0) {
3473			status = FS_CALL(vnode, read_pages, cookie, fileVecs[0].offset,
3474				&vecs[vecIndex], vecCount - vecIndex, &size);
3475		} else {
3476			// sparse read
3477			zero_iovecs(&vecs[vecIndex], vecCount - vecIndex, size);
3478			status = B_OK;
3479		}
3480		if (status != B_OK)
3481			return status;
3482
3483		ASSERT((off_t)size <= fileVecs[0].length);
3484
3485		// If the file portion was contiguous, we're already done now
3486		if (size == numBytes)
3487			return B_OK;
3488
3489		// if we reached the end of the file, we can return as well
3490		if ((off_t)size != fileVecs[0].length) {
3491			*_numBytes = size;
3492			return B_OK;
3493		}
3494
3495		fileVecIndex = 1;
3496
3497		// first, find out where we have to continue in our iovecs
3498		for (; vecIndex < vecCount; vecIndex++) {
3499			if (size < vecs[vecIndex].iov_len)
3500				break;
3501
3502			size -= vecs[vecIndex].iov_len;
3503		}
3504
3505		vecOffset = size;
3506	} else {
3507		fileVecIndex = 0;
3508		size = 0;
3509	}
3510
3511	// Too bad, let's process the rest of the file_io_vecs
3512
3513	size_t totalSize = size;
3514	size_t bytesLeft = numBytes - size;
3515
3516	for (; fileVecIndex < fileVecCount; fileVecIndex++) {
3517		const file_io_vec &fileVec = fileVecs[fileVecIndex];
3518		off_t fileOffset = fileVec.offset;
3519		off_t fileLeft = min_c(fileVec.length, (off_t)bytesLeft);
3520
3521		TRACE(("FILE VEC [%" B_PRIu32 "] length %" B_PRIdOFF "\n", fileVecIndex,
3522			fileLeft));
3523
3524		// process the complete fileVec
3525		while (fileLeft > 0) {
3526			iovec tempVecs[MAX_TEMP_IO_VECS];
3527			uint32 tempCount = 0;
3528
3529			// size tracks how much of what is left of the current fileVec
3530			// (fileLeft) has been assigned to tempVecs
3531			size = 0;
3532
3533			// assign what is left of the current fileVec to the tempVecs
3534			for (size = 0; (off_t)size < fileLeft && vecIndex < vecCount
3535					&& tempCount < MAX_TEMP_IO_VECS;) {
3536				// try to satisfy one iovec per iteration (or as much as
3537				// possible)
3538
3539				// bytes left of the current iovec
3540				size_t vecLeft = vecs[vecIndex].iov_len - vecOffset;
3541				if (vecLeft == 0) {
3542					vecOffset = 0;
3543					vecIndex++;
3544					continue;
3545				}
3546
3547				TRACE(("fill vec %" B_PRIu32 ", offset = %lu, size = %lu\n",
3548					vecIndex, vecOffset, size));
3549
3550				// actually available bytes
3551				size_t tempVecSize = min_c(vecLeft, fileLeft - size);
3552
3553				tempVecs[tempCount].iov_base
3554					= (void*)((addr_t)vecs[vecIndex].iov_base + vecOffset);
3555				tempVecs[tempCount].iov_len = tempVecSize;
3556				tempCount++;
3557
3558				size += tempVecSize;
3559				vecOffset += tempVecSize;
3560			}
3561
3562			size_t bytes = size;
3563
3564			if (fileOffset == -1) {
3565				if (doWrite) {
3566					panic("sparse write attempt: vnode %p", vnode);
3567					status = B_IO_ERROR;
3568				} else {
3569					// sparse read
3570					zero_iovecs(tempVecs, tempCount, bytes);
3571					status = B_OK;
3572				}
3573			} else if (doWrite) {
3574				status = FS_CALL(vnode, write_pages, cookie, fileOffset,
3575					tempVecs, tempCount, &bytes);
3576			} else {
3577				status = FS_CALL(vnode, read_pages, cookie, fileOffset,
3578					tempVecs, tempCount, &bytes);
3579			}
3580			if (status != B_OK)
3581				return status;
3582
3583			totalSize += bytes;
3584			bytesLeft -= size;
3585			if (fileOffset >= 0)
3586				fileOffset += size;
3587			fileLeft -= size;
3588			//dprintf("-> file left = %Lu\n", fileLeft);
3589
3590			if (size != bytes || vecIndex >= vecCount) {
3591				// there are no more bytes or iovecs, let's bail out
3592				*_numBytes = totalSize;
3593				return B_OK;
3594			}
3595		}
3596	}
3597
3598	*_vecIndex = vecIndex;
3599	*_vecOffset = vecOffset;
3600	*_numBytes = totalSize;
3601	return B_OK;
3602}
3603
3604
3605static bool
3606is_user_in_group(gid_t gid)
3607{
3608	if (gid == getegid())
3609		return true;
3610
3611	gid_t groups[NGROUPS_MAX];
3612	int groupCount = getgroups(NGROUPS_MAX, groups);
3613	for (int i = 0; i < groupCount; i++) {
3614		if (gid == groups[i])
3615			return true;
3616	}
3617
3618	return false;
3619}
3620
3621
3622static status_t
3623free_io_context(io_context* context)
3624{
3625	uint32 i;
3626
3627	TIOC(FreeIOContext(context));
3628
3629	if (context->root)
3630		put_vnode(context->root);
3631
3632	if (context->cwd)
3633		put_vnode(context->cwd);
3634
3635	mutex_lock(&context->io_mutex);
3636
3637	for (i = 0; i < context->table_size; i++) {
3638		if (struct file_descriptor* descriptor = context->fds[i]) {
3639			close_fd(context, descriptor);
3640			put_fd(descriptor);
3641		}
3642	}
3643
3644	mutex_destroy(&context->io_mutex);
3645
3646	remove_node_monitors(context);
3647	free(context->fds);
3648	free(context);
3649
3650	return B_OK;
3651}
3652
3653
3654static status_t
3655resize_monitor_table(struct io_context* context, const int newSize)
3656{
3657	int	status = B_OK;
3658
3659	if (newSize <= 0 || newSize > MAX_NODE_MONITORS)
3660		return B_BAD_VALUE;
3661
3662	mutex_lock(&context->io_mutex);
3663
3664	if ((size_t)newSize < context->num_monitors) {
3665		status = B_BUSY;
3666		goto out;
3667	}
3668	context->max_monitors = newSize;
3669
3670out:
3671	mutex_unlock(&context->io_mutex);
3672	return status;
3673}
3674
3675
3676//	#pragma mark - public API for file systems
3677
3678
3679extern "C" status_t
3680new_vnode(fs_volume* volume, ino_t vnodeID, void* privateNode,
3681	fs_vnode_ops* ops)
3682{
3683	FUNCTION(("new_vnode(volume = %p (%" B_PRId32 "), vnodeID = %" B_PRId64
3684		", node = %p)\n", volume, volume->id, vnodeID, privateNode));
3685
3686	if (privateNode == NULL)
3687		return B_BAD_VALUE;
3688
3689	int32 tries = BUSY_VNODE_RETRIES;
3690restart:
3691	// create the node
3692	bool nodeCreated;
3693	struct vnode* vnode;
3694	status_t status = create_new_vnode_and_lock(volume->id, vnodeID, vnode,
3695		nodeCreated);
3696	if (status != B_OK)
3697		return status;
3698
3699	WriteLocker nodeLocker(sVnodeLock, true);
3700		// create_new_vnode_and_lock() has locked for us
3701
3702	if (!nodeCreated && vnode->IsBusy()) {
3703		nodeLocker.Unlock();
3704		if (!retry_busy_vnode(tries, volume->id, vnodeID))
3705			return B_BUSY;
3706		goto restart;
3707	}
3708
3709	// file system integrity check:
3710	// test if the vnode already exists and bail out if this is the case!
3711	if (!nodeCreated) {
3712		panic("vnode %" B_PRIdDEV ":%" B_PRIdINO " already exists (node = %p, "
3713			"vnode->node = %p)!", volume->id, vnodeID, privateNode,
3714			vnode->private_node);
3715		return B_ERROR;
3716	}
3717
3718	vnode->private_node = privateNode;
3719	vnode->ops = ops;
3720	vnode->SetUnpublished(true);
3721
3722	TRACE(("returns: %s\n", strerror(status)));
3723
3724	return status;
3725}
3726
3727
3728extern "C" status_t
3729publish_vnode(fs_volume* volume, ino_t vnodeID, void* privateNode,
3730	fs_vnode_ops* ops, int type, uint32 flags)
3731{
3732	FUNCTION(("publish_vnode()\n"));
3733
3734	int32 tries = BUSY_VNODE_RETRIES;
3735restart:
3736	WriteLocker locker(sVnodeLock);
3737
3738	struct vnode* vnode = lookup_vnode(volume->id, vnodeID);
3739
3740	bool nodeCreated = false;
3741	if (vnode == NULL) {
3742		if (privateNode == NULL)
3743			return B_BAD_VALUE;
3744
3745		// create the node
3746		locker.Unlock();
3747			// create_new_vnode_and_lock() will re-lock for us on success
3748		status_t status = create_new_vnode_and_lock(volume->id, vnodeID, vnode,
3749			nodeCreated);
3750		if (status != B_OK)
3751			return status;
3752
3753		locker.SetTo(sVnodeLock, true);
3754	}
3755
3756	if (nodeCreated) {
3757		vnode->private_node = privateNode;
3758		vnode->ops = ops;
3759		vnode->SetUnpublished(true);
3760	} else if (vnode->IsBusy() && vnode->IsUnpublished()
3761		&& vnode->private_node == privateNode && vnode->ops == ops) {
3762		// already known, but not published
3763	} else if (vnode->IsBusy()) {
3764		locker.Unlock();
3765		if (!retry_busy_vnode(tries, volume->id, vnodeID))
3766			return B_BUSY;
3767		goto restart;
3768	} else
3769		return B_BAD_VALUE;
3770
3771	bool publishSpecialSubNode = false;
3772
3773	vnode->SetType(type);
3774	vnode->SetRemoved((flags & B_VNODE_PUBLISH_REMOVED) != 0);
3775	publishSpecialSubNode = is_special_node_type(type)
3776		&& (flags & B_VNODE_DONT_CREATE_SPECIAL_SUB_NODE) == 0;
3777
3778	status_t status = B_OK;
3779
3780	// create sub vnodes, if necessary
3781	if (volume->sub_volume != NULL || publishSpecialSubNode) {
3782		locker.Unlock();
3783
3784		fs_volume* subVolume = volume;
3785		if (volume->sub_volume != NULL) {
3786			while (status == B_OK && subVolume->sub_volume != NULL) {
3787				subVolume = subVolume->sub_volume;
3788				status = subVolume->ops->create_sub_vnode(subVolume, vnodeID,
3789					vnode);
3790			}
3791		}
3792
3793		if (status == B_OK && publishSpecialSubNode)
3794			status = create_special_sub_node(vnode, flags);
3795
3796		if (status != B_OK) {
3797			// error -- clean up the created sub vnodes
3798			while (subVolume->super_volume != volume) {
3799				subVolume = subVolume->super_volume;
3800				subVolume->ops->delete_sub_vnode(subVolume, vnode);
3801			}
3802		}
3803
3804		if (status == B_OK) {
3805			ReadLocker vnodesReadLocker(sVnodeLock);
3806			AutoLocker<Vnode> nodeLocker(vnode);
3807			vnode->SetBusy(false);
3808			vnode->SetUnpublished(false);
3809		} else {
3810			locker.Lock();
3811			sVnodeTable->Remove(vnode);
3812			remove_vnode_from_mount_list(vnode, vnode->mount);
3813			object_cache_free(sVnodeCache, vnode, 0);
3814		}
3815	} else {
3816		// we still hold the write lock -- mark the node unbusy and published
3817		vnode->SetBusy(false);
3818		vnode->SetUnpublished(false);
3819	}
3820
3821	TRACE(("returns: %s\n", strerror(status)));
3822
3823	return status;
3824}
3825
3826
3827extern "C" status_t
3828get_vnode(fs_volume* volume, ino_t vnodeID, void** _privateNode)
3829{
3830	struct vnode* vnode;
3831
3832	if (volume == NULL)
3833		return B_BAD_VALUE;
3834
3835	status_t status = get_vnode(volume->id, vnodeID, &vnode, true, true);
3836	if (status != B_OK)
3837		return status;
3838
3839	// If this is a layered FS, we need to get the node cookie for the requested
3840	// layer.
3841	if (HAS_FS_CALL(vnode, get_super_vnode)) {
3842		fs_vnode resolvedNode;
3843		status_t status = FS_CALL(vnode, get_super_vnode, volume,
3844			&resolvedNode);
3845		if (status != B_OK) {
3846			panic("get_vnode(): Failed to get super node for vnode %p, "
3847				"volume: %p", vnode, volume);
3848			put_vnode(vnode);
3849			return status;
3850		}
3851
3852		if (_privateNode != NULL)
3853			*_privateNode = resolvedNode.private_node;
3854	} else if (_privateNode != NULL)
3855		*_privateNode = vnode->private_node;
3856
3857	return B_OK;
3858}
3859
3860
3861extern "C" status_t
3862acquire_vnode(fs_volume* volume, ino_t vnodeID)
3863{
3864	ReadLocker nodeLocker(sVnodeLock);
3865
3866	struct vnode* vnode = lookup_vnode(volume->id, vnodeID);
3867	if (vnode == NULL)
3868		return B_BAD_VALUE;
3869
3870	inc_vnode_ref_count(vnode);
3871	return B_OK;
3872}
3873
3874
3875extern "C" status_t
3876put_vnode(fs_volume* volume, ino_t vnodeID)
3877{
3878	struct vnode* vnode;
3879
3880	rw_lock_read_lock(&sVnodeLock);
3881	vnode = lookup_vnode(volume->id, vnodeID);
3882	rw_lock_read_unlock(&sVnodeLock);
3883
3884	if (vnode == NULL)
3885		return B_BAD_VALUE;
3886
3887	dec_vnode_ref_count(vnode, false, true);
3888	return B_OK;
3889}
3890
3891
3892extern "C" status_t
3893remove_vnode(fs_volume* volume, ino_t vnodeID)
3894{
3895	ReadLocker locker(sVnodeLock);
3896
3897	struct vnode* vnode = lookup_vnode(volume->id, vnodeID);
3898	if (vnode == NULL)
3899		return B_ENTRY_NOT_FOUND;
3900
3901	if (vnode->covered_by != NULL || vnode->covers != NULL) {
3902		// this vnode is in use
3903		return B_BUSY;
3904	}
3905
3906	vnode->Lock();
3907
3908	vnode->SetRemoved(true);
3909	bool removeUnpublished = false;
3910
3911	if (vnode->IsUnpublished()) {
3912		// prepare the vnode for deletion
3913		removeUnpublished = true;
3914		vnode->SetBusy(true);
3915	}
3916
3917	vnode->Unlock();
3918	locker.Unlock();
3919
3920	if (removeUnpublished) {
3921		// If the vnode hasn't been published yet, we delete it here
3922		atomic_add(&vnode->ref_count, -1);
3923		free_vnode(vnode, true);
3924	}
3925
3926	return B_OK;
3927}
3928
3929
3930extern "C" status_t
3931unremove_vnode(fs_volume* volume, ino_t vnodeID)
3932{
3933	struct vnode* vnode;
3934
3935	rw_lock_read_lock(&sVnodeLock);
3936
3937	vnode = lookup_vnode(volume->id, vnodeID);
3938	if (vnode) {
3939		AutoLocker<Vnode> nodeLocker(vnode);
3940		vnode->SetRemoved(false);
3941	}
3942
3943	rw_lock_read_unlock(&sVnodeLock);
3944	return B_OK;
3945}
3946
3947
3948extern "C" status_t
3949get_vnode_removed(fs_volume* volume, ino_t vnodeID, bool* _removed)
3950{
3951	ReadLocker _(sVnodeLock);
3952
3953	if (struct vnode* vnode = lookup_vnode(volume->id, vnodeID)) {
3954		if (_removed != NULL)
3955			*_removed = vnode->IsRemoved();
3956		return B_OK;
3957	}
3958
3959	return B_BAD_VALUE;
3960}
3961
3962
3963extern "C" fs_volume*
3964volume_for_vnode(fs_vnode* _vnode)
3965{
3966	if (_vnode == NULL)
3967		return NULL;
3968
3969	struct vnode* vnode = static_cast<struct vnode*>(_vnode);
3970	return vnode->mount->volume;
3971}
3972
3973
3974extern "C" status_t
3975check_access_permissions(int accessMode, mode_t mode, gid_t nodeGroupID,
3976	uid_t nodeUserID)
3977{
3978	// get node permissions
3979	int userPermissions = (mode & S_IRWXU) >> 6;
3980	int groupPermissions = (mode & S_IRWXG) >> 3;
3981	int otherPermissions = mode & S_IRWXO;
3982
3983	// get the node permissions for this uid/gid
3984	int permissions = 0;
3985	uid_t uid = geteuid();
3986
3987	if (uid == 0) {
3988		// user is root
3989		// root has always read/write permission, but at least one of the
3990		// X bits must be set for execute permission
3991		permissions = userPermissions | groupPermissions | otherPermissions
3992			| S_IROTH | S_IWOTH;
3993		if (S_ISDIR(mode))
3994			permissions |= S_IXOTH;
3995	} else if (uid == nodeUserID) {
3996		// user is node owner
3997		permissions = userPermissions;
3998	} else if (is_user_in_group(nodeGroupID)) {
3999		// user is in owning group
4000		permissions = groupPermissions;
4001	} else {
4002		// user is one of the others
4003		permissions = otherPermissions;
4004	}
4005
4006	return (accessMode & ~permissions) == 0 ? B_OK : B_PERMISSION_DENIED;
4007}
4008
4009
4010#if 0
4011extern "C" status_t
4012read_pages(int fd, off_t pos, const iovec* vecs, size_t count,
4013	size_t* _numBytes)
4014{
4015	struct file_descriptor* descriptor;
4016	struct vnode* vnode;
4017
4018	descriptor = get_fd_and_vnode(fd, &vnode, true);
4019	if (descriptor == NULL)
4020		return B_FILE_ERROR;
4021
4022	status_t status = vfs_read_pages(vnode, descriptor->cookie, pos, vecs,
4023		count, 0, _numBytes);
4024
4025	put_fd(descriptor);
4026	return status;
4027}
4028
4029
4030extern "C" status_t
4031write_pages(int fd, off_t pos, const iovec* vecs, size_t count,
4032	size_t* _numBytes)
4033{
4034	struct file_descriptor* descriptor;
4035	struct vnode* vnode;
4036
4037	descriptor = get_fd_and_vnode(fd, &vnode, true);
4038	if (descriptor == NULL)
4039		return B_FILE_ERROR;
4040
4041	status_t status = vfs_write_pages(vnode, descriptor->cookie, pos, vecs,
4042		count, 0, _numBytes);
4043
4044	put_fd(descriptor);
4045	return status;
4046}
4047#endif
4048
4049
4050extern "C" status_t
4051read_file_io_vec_pages(int fd, const file_io_vec* fileVecs, size_t fileVecCount,
4052	const iovec* vecs, size_t vecCount, uint32* _vecIndex, size_t* _vecOffset,
4053	size_t* _bytes)
4054{
4055	struct vnode* vnode;
4056	FileDescriptorPutter descriptor(get_fd_and_vnode(fd, &vnode, true));
4057	if (!descriptor.IsSet())
4058		return B_FILE_ERROR;
4059
4060	status_t status = common_file_io_vec_pages(vnode, descriptor->cookie,
4061		fileVecs, fileVecCount, vecs, vecCount, _vecIndex, _vecOffset, _bytes,
4062		false);
4063
4064	return status;
4065}
4066
4067
4068extern "C" status_t
4069write_file_io_vec_pages(int fd, const file_io_vec* fileVecs, size_t fileVecCount,
4070	const iovec* vecs, size_t vecCount, uint32* _vecIndex, size_t* _vecOffset,
4071	size_t* _bytes)
4072{
4073	struct vnode* vnode;
4074	FileDescriptorPutter descriptor(get_fd_and_vnode(fd, &vnode, true));
4075	if (!descriptor.IsSet())
4076		return B_FILE_ERROR;
4077
4078	status_t status = common_file_io_vec_pages(vnode, descriptor->cookie,
4079		fileVecs, fileVecCount, vecs, vecCount, _vecIndex, _vecOffset, _bytes,
4080		true);
4081
4082	return status;
4083}
4084
4085
4086extern "C" status_t
4087entry_cache_add(dev_t mountID, ino_t dirID, const char* name, ino_t nodeID)
4088{
4089	// lookup mount -- the caller is required to make sure that the mount
4090	// won't go away
4091	ReadLocker locker(sMountLock);
4092	struct fs_mount* mount = find_mount(mountID);
4093	if (mount == NULL)
4094		return B_BAD_VALUE;
4095	locker.Unlock();
4096
4097	return mount->entry_cache.Add(dirID, name, nodeID, false);
4098}
4099
4100
4101extern "C" status_t
4102entry_cache_add_missing(dev_t mountID, ino_t dirID, const char* name)
4103{
4104	// lookup mount -- the caller is required to make sure that the mount
4105	// won't go away
4106	ReadLocker locker(sMountLock);
4107	struct fs_mount* mount = find_mount(mountID);
4108	if (mount == NULL)
4109		return B_BAD_VALUE;
4110	locker.Unlock();
4111
4112	return mount->entry_cache.Add(dirID, name, -1, true);
4113}
4114
4115
4116extern "C" status_t
4117entry_cache_remove(dev_t mountID, ino_t dirID, const char* name)
4118{
4119	// lookup mount -- the caller is required to make sure that the mount
4120	// won't go away
4121	ReadLocker locker(sMountLock);
4122	struct fs_mount* mount = find_mount(mountID);
4123	if (mount == NULL)
4124		return B_BAD_VALUE;
4125	locker.Unlock();
4126
4127	return mount->entry_cache.Remove(dirID, name);
4128}
4129
4130
4131//	#pragma mark - private VFS API
4132//	Functions the VFS exports for other parts of the kernel
4133
4134
4135/*! Acquires another reference to the vnode that has to be released
4136	by calling vfs_put_vnode().
4137*/
4138void
4139vfs_acquire_vnode(struct vnode* vnode)
4140{
4141	inc_vnode_ref_count(vnode);
4142}
4143
4144
4145/*! This is currently called from file_cache_create() only.
4146	It's probably a temporary solution as long as devfs requires that
4147	fs_read_pages()/fs_write_pages() are called with the standard
4148	open cookie and not with a device cookie.
4149	If that's done differently, remove this call; it has no other
4150	purpose.
4151*/
4152extern "C" status_t
4153vfs_get_cookie_from_fd(int fd, void** _cookie)
4154{
4155	struct file_descriptor* descriptor;
4156
4157	descriptor = get_fd(get_current_io_context(true), fd);
4158	if (descriptor == NULL)
4159		return B_FILE_ERROR;
4160
4161	*_cookie = descriptor->cookie;
4162	return B_OK;
4163}
4164
4165
4166extern "C" status_t
4167vfs_get_vnode_from_fd(int fd, bool kernel, struct vnode** vnode)
4168{
4169	*vnode = get_vnode_from_fd(fd, kernel);
4170
4171	if (*vnode == NULL)
4172		return B_FILE_ERROR;
4173
4174	return B_NO_ERROR;
4175}
4176
4177
4178extern "C" status_t
4179vfs_get_vnode_from_path(const char* path, bool kernel, struct vnode** _vnode)
4180{
4181	TRACE(("vfs_get_vnode_from_path: entry. path = '%s', kernel %d\n",
4182		path, kernel));
4183
4184	KPath pathBuffer;
4185	if (pathBuffer.InitCheck() != B_OK)
4186		return B_NO_MEMORY;
4187
4188	char* buffer = pathBuffer.LockBuffer();
4189	strlcpy(buffer, path, pathBuffer.BufferSize());
4190
4191	VnodePutter vnode;
4192	status_t status = path_to_vnode(buffer, true, vnode, NULL, kernel);
4193	if (status != B_OK)
4194		return status;
4195
4196	*_vnode = vnode.Detach();
4197	return B_OK;
4198}
4199
4200
4201extern "C" status_t
4202vfs_get_vnode(dev_t mountID, ino_t vnodeID, bool canWait, struct vnode** _vnode)
4203{
4204	struct vnode* vnode = NULL;
4205
4206	status_t status = get_vnode(mountID, vnodeID, &vnode, canWait, false);
4207	if (status != B_OK)
4208		return status;
4209
4210	*_vnode = vnode;
4211	return B_OK;
4212}
4213
4214
4215extern "C" status_t
4216vfs_entry_ref_to_vnode(dev_t mountID, ino_t directoryID,
4217	const char* name, struct vnode** _vnode)
4218{
4219	VnodePutter vnode;
4220	status_t status = entry_ref_to_vnode(mountID, directoryID, name, false, true, vnode);
4221	*_vnode = vnode.Detach();
4222	return status;
4223}
4224
4225
4226extern "C" void
4227vfs_vnode_to_node_ref(struct vnode* vnode, dev_t* _mountID, ino_t* _vnodeID)
4228{
4229	*_mountID = vnode->device;
4230	*_vnodeID = vnode->id;
4231}
4232
4233
4234/*!
4235	Helper function abstracting the process of "converting" a given
4236	vnode-pointer to a fs_vnode-pointer.
4237	Currently only used in bindfs.
4238*/
4239extern "C" fs_vnode*
4240vfs_fsnode_for_vnode(struct vnode* vnode)
4241{
4242	return vnode;
4243}
4244
4245
4246/*!
4247	Calls fs_open() on the given vnode and returns a new
4248	file descriptor for it
4249*/
4250int
4251vfs_open_vnode(struct vnode* vnode, int openMode, bool kernel)
4252{
4253	return open_vnode(vnode, openMode, kernel);
4254}
4255
4256
4257/*!	Looks up a vnode with the given mount and vnode ID.
4258	Must only be used with "in-use" vnodes as it doesn't grab a reference
4259	to the node.
4260	It's currently only be used by file_cache_create().
4261*/
4262extern "C" status_t
4263vfs_lookup_vnode(dev_t mountID, ino_t vnodeID, struct vnode** _vnode)
4264{
4265	rw_lock_read_lock(&sVnodeLock);
4266	struct vnode* vnode = lookup_vnode(mountID, vnodeID);
4267	rw_lock_read_unlock(&sVnodeLock);
4268
4269	if (vnode == NULL)
4270		return B_ERROR;
4271
4272	*_vnode = vnode;
4273	return B_OK;
4274}
4275
4276
4277extern "C" status_t
4278vfs_get_fs_node_from_path(fs_volume* volume, const char* path,
4279	bool traverseLeafLink, bool kernel, void** _node)
4280{
4281	TRACE(("vfs_get_fs_node_from_path(volume = %p, path = \"%s\", kernel %d)\n",
4282		volume, path, kernel));
4283
4284	KPath pathBuffer;
4285	if (pathBuffer.InitCheck() != B_OK)
4286		return B_NO_MEMORY;
4287
4288	fs_mount* mount;
4289	status_t status = get_mount(volume->id, &mount);
4290	if (status != B_OK)
4291		return status;
4292
4293	char* buffer = pathBuffer.LockBuffer();
4294	strlcpy(buffer, path, pathBuffer.BufferSize());
4295
4296	VnodePutter vnode;
4297
4298	if (buffer[0] == '/')
4299		status = path_to_vnode(buffer, traverseLeafLink, vnode, NULL, kernel);
4300	else {
4301		inc_vnode_ref_count(mount->root_vnode);
4302			// vnode_path_to_vnode() releases a reference to the starting vnode
4303		status = vnode_path_to_vnode(mount->root_vnode, buffer, traverseLeafLink,
4304			kernel, vnode, NULL);
4305	}
4306
4307	put_mount(mount);
4308
4309	if (status != B_OK)
4310		return status;
4311
4312	if (vnode->device != volume->id) {
4313		// wrong mount ID - must not gain access on foreign file system nodes
4314		return B_BAD_VALUE;
4315	}
4316
4317	// Use get_vnode() to resolve the cookie for the right layer.
4318	status = get_vnode(volume, vnode->id, _node);
4319
4320	return status;
4321}
4322
4323
4324status_t
4325vfs_read_stat(int fd, const char* path, bool traverseLeafLink,
4326	struct stat* stat, bool kernel)
4327{
4328	status_t status;
4329
4330	if (path != NULL) {
4331		// path given: get the stat of the node referred to by (fd, path)
4332		KPath pathBuffer(path);
4333		if (pathBuffer.InitCheck() != B_OK)
4334			return B_NO_MEMORY;
4335
4336		status = common_path_read_stat(fd, pathBuffer.LockBuffer(),
4337			traverseLeafLink, stat, kernel);
4338	} else {
4339		// no path given: get the FD and use the FD operation
4340		FileDescriptorPutter descriptor
4341			(get_fd(get_current_io_context(kernel), fd));
4342		if (!descriptor.IsSet())
4343			return B_FILE_ERROR;
4344
4345		if (descriptor->ops->fd_read_stat)
4346			status = descriptor->ops->fd_read_stat(descriptor.Get(), stat);
4347		else
4348			status = B_UNSUPPORTED;
4349	}
4350
4351	return status;
4352}
4353
4354
4355/*!	Finds the full path to the file that contains the module \a moduleName,
4356	puts it into \a pathBuffer, and returns B_OK for success.
4357	If \a pathBuffer was too small, it returns \c B_BUFFER_OVERFLOW,
4358	\c B_ENTRY_NOT_FOUNT if no file could be found.
4359	\a pathBuffer is clobbered in any case and must not be relied on if this
4360	functions returns unsuccessfully.
4361	\a basePath and \a pathBuffer must not point to the same space.
4362*/
4363status_t
4364vfs_get_module_path(const char* basePath, const char* moduleName,
4365	char* pathBuffer, size_t bufferSize)
4366{
4367	status_t status;
4368	size_t length;
4369	char* path;
4370
4371	if (bufferSize == 0
4372		|| strlcpy(pathBuffer, basePath, bufferSize) >= bufferSize)
4373		return B_BUFFER_OVERFLOW;
4374
4375	VnodePutter dir;
4376	status = path_to_vnode(pathBuffer, true, dir, NULL, true);
4377	if (status != B_OK)
4378		return status;
4379
4380	// the path buffer had been clobbered by the above call
4381	length = strlcpy(pathBuffer, basePath, bufferSize);
4382	if (pathBuffer[length - 1] != '/')
4383		pathBuffer[length++] = '/';
4384
4385	path = pathBuffer + length;
4386	bufferSize -= length;
4387
4388	VnodePutter file;
4389	while (moduleName) {
4390		char* nextPath = strchr(moduleName, '/');
4391		if (nextPath == NULL)
4392			length = strlen(moduleName);
4393		else {
4394			length = nextPath - moduleName;
4395			nextPath++;
4396		}
4397
4398		if (length + 1 >= bufferSize)
4399			return B_BUFFER_OVERFLOW;
4400
4401		memcpy(path, moduleName, length);
4402		path[length] = '\0';
4403		moduleName = nextPath;
4404
4405		// vnode_path_to_vnode() assumes ownership of the passed dir
4406		status = vnode_path_to_vnode(dir.Detach(), path, true, true, file, NULL);
4407		if (status != B_OK)
4408			return status;
4409
4410		if (S_ISDIR(file->Type())) {
4411			// goto the next directory
4412			path[length] = '/';
4413			path[length + 1] = '\0';
4414			path += length + 1;
4415			bufferSize -= length + 1;
4416
4417			dir.SetTo(file.Detach());
4418		} else if (S_ISREG(file->Type())) {
4419			// it's a file so it should be what we've searched for
4420			return B_OK;
4421		} else {
4422			TRACE(("vfs_get_module_path(): something is strange here: "
4423				"0x%08" B_PRIx32 "...\n", file->Type()));
4424			return B_ERROR;
4425		}
4426	}
4427
4428	// if we got here, the moduleName just pointed to a directory, not to
4429	// a real module - what should we do in this case?
4430	return B_ENTRY_NOT_FOUND;
4431}
4432
4433
4434/*!	\brief Normalizes a given path.
4435
4436	The path must refer to an existing or non-existing entry in an existing
4437	directory, that is chopping off the leaf component the remaining path must
4438	refer to an existing directory.
4439
4440	The returned will be canonical in that it will be absolute, will not
4441	contain any "." or ".." components or duplicate occurrences of '/'s,
4442	and none of the directory components will by symbolic links.
4443
4444	Any two paths referring to the same entry, will result in the same
4445	normalized path (well, that is pretty much the definition of `normalized',
4446	isn't it :-).
4447
4448	\param path The path to be normalized.
4449	\param buffer The buffer into which the normalized path will be written.
4450		   May be the same one as \a path.
4451	\param bufferSize The size of \a buffer.
4452	\param traverseLink If \c true, the function also resolves leaf symlinks.
4453	\param kernel \c true, if the IO context of the kernel shall be used,
4454		   otherwise that of the team this thread belongs to. Only relevant,
4455		   if the path is relative (to get the CWD).
4456	\return \c B_OK if everything went fine, another error code otherwise.
4457*/
4458status_t
4459vfs_normalize_path(const char* path, char* buffer, size_t bufferSize,
4460	bool traverseLink, bool kernel)
4461{
4462	if (!path || !buffer || bufferSize < 1)
4463		return B_BAD_VALUE;
4464
4465	if (path != buffer) {
4466		if (strlcpy(buffer, path, bufferSize) >= bufferSize)
4467			return B_BUFFER_OVERFLOW;
4468	}
4469
4470	return normalize_path(buffer, bufferSize, traverseLink, kernel);
4471}
4472
4473
4474/*!	\brief Gets the parent of the passed in node.
4475
4476	Gets the parent of the passed in node, and correctly resolves covered
4477	nodes.
4478*/
4479extern "C" status_t
4480vfs_resolve_parent(struct vnode* parent, dev_t* device, ino_t* node)
4481{
4482	return resolve_covered_parent(parent, device, node,
4483		get_current_io_context(true));
4484}
4485
4486
4487/*!	\brief Creates a special node in the file system.
4488
4489	The caller gets a reference to the newly created node (which is passed
4490	back through \a _createdVnode) and is responsible for releasing it.
4491
4492	\param path The path where to create the entry for the node. Can be \c NULL,
4493		in which case the node is created without an entry in the root FS -- it
4494		will automatically be deleted when the last reference has been released.
4495	\param subVnode The definition of the subnode. Can be \c NULL, in which case
4496		the target file system will just create the node with its standard
4497		operations. Depending on the type of the node a subnode might be created
4498		automatically, though.
4499	\param mode The type and permissions for the node to be created.
4500	\param flags Flags to be passed to the creating FS.
4501	\param kernel \c true, if called in the kernel context (relevant only if
4502		\a path is not \c NULL and not absolute).
4503	\param _superVnode Pointer to a pre-allocated structure to be filled by the
4504		file system creating the node, with the private data pointer and
4505		operations for the super node. Can be \c NULL.
4506	\param _createVnode Pointer to pre-allocated storage where to store the
4507		pointer to the newly created node.
4508	\return \c B_OK, if everything went fine, another error code otherwise.
4509*/
4510status_t
4511vfs_create_special_node(const char* path, fs_vnode* subVnode, mode_t mode,
4512	uint32 flags, bool kernel, fs_vnode* _superVnode,
4513	struct vnode** _createdVnode)
4514{
4515	VnodePutter dirNode;
4516	char _leaf[B_FILE_NAME_LENGTH];
4517	char* leaf = NULL;
4518
4519	if (path) {
4520		// We've got a path. Get the dir vnode and the leaf name.
4521		KPath tmpPathBuffer;
4522		if (tmpPathBuffer.InitCheck() != B_OK)
4523			return B_NO_MEMORY;
4524
4525		char* tmpPath = tmpPathBuffer.LockBuffer();
4526		if (strlcpy(tmpPath, path, B_PATH_NAME_LENGTH) >= B_PATH_NAME_LENGTH)
4527			return B_NAME_TOO_LONG;
4528
4529		// get the dir vnode and the leaf name
4530		leaf = _leaf;
4531		status_t error = path_to_dir_vnode(tmpPath, dirNode, leaf, kernel);
4532		if (error != B_OK)
4533			return error;
4534	} else {
4535		// No path. Create the node in the root FS.
4536		dirNode.SetTo(sRoot);
4537		inc_vnode_ref_count(dirNode.Get());
4538	}
4539
4540	// check support for creating special nodes
4541	if (!HAS_FS_CALL(dirNode, create_special_node))
4542		return B_UNSUPPORTED;
4543
4544	// create the node
4545	fs_vnode superVnode;
4546	ino_t nodeID;
4547	status_t status = FS_CALL(dirNode.Get(), create_special_node, leaf, subVnode,
4548		mode, flags, _superVnode != NULL ? _superVnode : &superVnode, &nodeID);
4549	if (status != B_OK)
4550		return status;
4551
4552	// lookup the node
4553	rw_lock_read_lock(&sVnodeLock);
4554	*_createdVnode = lookup_vnode(dirNode->mount->id, nodeID);
4555	rw_lock_read_unlock(&sVnodeLock);
4556
4557	if (*_createdVnode == NULL) {
4558		panic("vfs_create_special_node(): lookup of node failed");
4559		return B_ERROR;
4560	}
4561
4562	return B_OK;
4563}
4564
4565
4566extern "C" void
4567vfs_put_vnode(struct vnode* vnode)
4568{
4569	put_vnode(vnode);
4570}
4571
4572
4573extern "C" status_t
4574vfs_get_cwd(dev_t* _mountID, ino_t* _vnodeID)
4575{
4576	// Get current working directory from io context
4577	struct io_context* context = get_current_io_context(false);
4578	status_t status = B_OK;
4579
4580	mutex_lock(&context->io_mutex);
4581
4582	if (context->cwd != NULL) {
4583		*_mountID = context->cwd->device;
4584		*_vnodeID = context->cwd->id;
4585	} else
4586		status = B_ERROR;
4587
4588	mutex_unlock(&context->io_mutex);
4589	return status;
4590}
4591
4592
4593status_t
4594vfs_unmount(dev_t mountID, uint32 flags)
4595{
4596	return fs_unmount(NULL, mountID, flags, true);
4597}
4598
4599
4600extern "C" status_t
4601vfs_disconnect_vnode(dev_t mountID, ino_t vnodeID)
4602{
4603	struct vnode* vnode;
4604
4605	status_t status = get_vnode(mountID, vnodeID, &vnode, true, true);
4606	if (status != B_OK)
4607		return status;
4608
4609	disconnect_mount_or_vnode_fds(vnode->mount, vnode);
4610	put_vnode(vnode);
4611	return B_OK;
4612}
4613
4614
4615extern "C" void
4616vfs_free_unused_vnodes(int32 level)
4617{
4618	vnode_low_resource_handler(NULL,
4619		B_KERNEL_RESOURCE_PAGES | B_KERNEL_RESOURCE_MEMORY
4620			| B_KERNEL_RESOURCE_ADDRESS_SPACE,
4621		level);
4622}
4623
4624
4625extern "C" bool
4626vfs_can_page(struct vnode* vnode, void* cookie)
4627{
4628	FUNCTION(("vfs_canpage: vnode %p\n", vnode));
4629
4630	if (HAS_FS_CALL(vnode, can_page))
4631		return FS_CALL(vnode, can_page, cookie);
4632	return false;
4633}
4634
4635
4636extern "C" status_t
4637vfs_read_pages(struct vnode* vnode, void* cookie, off_t pos,
4638	const generic_io_vec* vecs, size_t count, uint32 flags,
4639	generic_size_t* _numBytes)
4640{
4641	FUNCTION(("vfs_read_pages: vnode %p, vecs %p, pos %" B_PRIdOFF "\n", vnode,
4642		vecs, pos));
4643
4644#if VFS_PAGES_IO_TRACING
4645	generic_size_t bytesRequested = *_numBytes;
4646#endif
4647
4648	IORequest request;
4649	status_t status = request.Init(pos, vecs, count, *_numBytes, false, flags);
4650	if (status == B_OK) {
4651		status = vfs_vnode_io(vnode, cookie, &request);
4652		if (status == B_OK)
4653			status = request.Wait();
4654		*_numBytes = request.TransferredBytes();
4655	}
4656
4657	TPIO(ReadPages(vnode, cookie, pos, vecs, count, flags, bytesRequested,
4658		status, *_numBytes));
4659
4660	return status;
4661}
4662
4663
4664extern "C" status_t
4665vfs_write_pages(struct vnode* vnode, void* cookie, off_t pos,
4666	const generic_io_vec* vecs, size_t count, uint32 flags,
4667	generic_size_t* _numBytes)
4668{
4669	FUNCTION(("vfs_write_pages: vnode %p, vecs %p, pos %" B_PRIdOFF "\n", vnode,
4670		vecs, pos));
4671
4672#if VFS_PAGES_IO_TRACING
4673	generic_size_t bytesRequested = *_numBytes;
4674#endif
4675
4676	IORequest request;
4677	status_t status = request.Init(pos, vecs, count, *_numBytes, true, flags);
4678	if (status == B_OK) {
4679		status = vfs_vnode_io(vnode, cookie, &request);
4680		if (status == B_OK)
4681			status = request.Wait();
4682		*_numBytes = request.TransferredBytes();
4683	}
4684
4685	TPIO(WritePages(vnode, cookie, pos, vecs, count, flags, bytesRequested,
4686		status, *_numBytes));
4687
4688	return status;
4689}
4690
4691
4692/*!	Gets the vnode's VMCache object. If it didn't have one, it will be
4693	created if \a allocate is \c true.
4694	In case it's successful, it will also grab a reference to the cache
4695	it returns.
4696*/
4697extern "C" status_t
4698vfs_get_vnode_cache(struct vnode* vnode, VMCache** _cache, bool allocate)
4699{
4700	if (vnode->cache != NULL) {
4701		vnode->cache->AcquireRef();
4702		*_cache = vnode->cache;
4703		return B_OK;
4704	}
4705
4706	rw_lock_read_lock(&sVnodeLock);
4707	vnode->Lock();
4708
4709	status_t status = B_OK;
4710
4711	// The cache could have been created in the meantime
4712	if (vnode->cache == NULL) {
4713		if (allocate) {
4714			// TODO: actually the vnode needs to be busy already here, or
4715			//	else this won't work...
4716			bool wasBusy = vnode->IsBusy();
4717			vnode->SetBusy(true);
4718
4719			vnode->Unlock();
4720			rw_lock_read_unlock(&sVnodeLock);
4721
4722			status = vm_create_vnode_cache(vnode, &vnode->cache);
4723
4724			rw_lock_read_lock(&sVnodeLock);
4725			vnode->Lock();
4726			vnode->SetBusy(wasBusy);
4727		} else
4728			status = B_BAD_VALUE;
4729	}
4730
4731	vnode->Unlock();
4732	rw_lock_read_unlock(&sVnodeLock);
4733
4734	if (status == B_OK) {
4735		vnode->cache->AcquireRef();
4736		*_cache = vnode->cache;
4737	}
4738
4739	return status;
4740}
4741
4742
4743/*!	Sets the vnode's VMCache object, for subsystems that want to manage
4744	their own.
4745	In case it's successful, it will also grab a reference to the cache
4746	it returns.
4747*/
4748extern "C" status_t
4749vfs_set_vnode_cache(struct vnode* vnode, VMCache* _cache)
4750{
4751	rw_lock_read_lock(&sVnodeLock);
4752	vnode->Lock();
4753
4754	status_t status = B_OK;
4755	if (vnode->cache != NULL) {
4756		status = B_NOT_ALLOWED;
4757	} else {
4758		vnode->cache = _cache;
4759		_cache->AcquireRef();
4760	}
4761
4762	vnode->Unlock();
4763	rw_lock_read_unlock(&sVnodeLock);
4764	return status;
4765}
4766
4767
4768status_t
4769vfs_get_file_map(struct vnode* vnode, off_t offset, size_t size,
4770	file_io_vec* vecs, size_t* _count)
4771{
4772	FUNCTION(("vfs_get_file_map: vnode %p, vecs %p, offset %" B_PRIdOFF
4773		", size = %" B_PRIuSIZE "\n", vnode, vecs, offset, size));
4774
4775	return FS_CALL(vnode, get_file_map, offset, size, vecs, _count);
4776}
4777
4778
4779status_t
4780vfs_stat_vnode(struct vnode* vnode, struct stat* stat)
4781{
4782	status_t status = FS_CALL(vnode, read_stat, stat);
4783
4784	// fill in the st_dev and st_ino fields
4785	if (status == B_OK) {
4786		stat->st_dev = vnode->device;
4787		stat->st_ino = vnode->id;
4788		// the rdev field must stay unset for non-special files
4789		if (!S_ISBLK(stat->st_mode) && !S_ISCHR(stat->st_mode))
4790			stat->st_rdev = -1;
4791	}
4792
4793	return status;
4794}
4795
4796
4797status_t
4798vfs_stat_node_ref(dev_t device, ino_t inode, struct stat* stat)
4799{
4800	struct vnode* vnode;
4801	status_t status = get_vnode(device, inode, &vnode, true, false);
4802	if (status != B_OK)
4803		return status;
4804
4805	status = vfs_stat_vnode(vnode, stat);
4806
4807	put_vnode(vnode);
4808	return status;
4809}
4810
4811
4812status_t
4813vfs_get_vnode_name(struct vnode* vnode, char* name, size_t nameSize)
4814{
4815	return get_vnode_name(vnode, NULL, name, nameSize, true);
4816}
4817
4818
4819status_t
4820vfs_entry_ref_to_path(dev_t device, ino_t inode, const char* leaf,
4821	bool kernel, char* path, size_t pathLength)
4822{
4823	VnodePutter vnode;
4824	status_t status;
4825
4826	// filter invalid leaf names
4827	if (leaf != NULL && (leaf[0] == '\0' || strchr(leaf, '/')))
4828		return B_BAD_VALUE;
4829
4830	// get the vnode matching the dir's node_ref
4831	if (leaf && (strcmp(leaf, ".") == 0 || strcmp(leaf, "..") == 0)) {
4832		// special cases "." and "..": we can directly get the vnode of the
4833		// referenced directory
4834		status = entry_ref_to_vnode(device, inode, leaf, false, kernel, vnode);
4835		leaf = NULL;
4836	} else {
4837		struct vnode* temp = NULL;
4838		status = get_vnode(device, inode, &temp, true, false);
4839		vnode.SetTo(temp);
4840	}
4841	if (status != B_OK)
4842		return status;
4843
4844	// get the directory path
4845	status = dir_vnode_to_path(vnode.Get(), path, pathLength, kernel);
4846	vnode.Unset();
4847		// we don't need the vnode anymore
4848	if (status != B_OK)
4849		return status;
4850
4851	// append the leaf name
4852	if (leaf) {
4853		// insert a directory separator if this is not the file system root
4854		if ((strcmp(path, "/") && strlcat(path, "/", pathLength)
4855				>= pathLength)
4856			|| strlcat(path, leaf, pathLength) >= pathLength) {
4857			return B_NAME_TOO_LONG;
4858		}
4859	}
4860
4861	return B_OK;
4862}
4863
4864
4865/*!	If the given descriptor locked its vnode, that lock will be released. */
4866void
4867vfs_unlock_vnode_if_locked(struct file_descriptor* descriptor)
4868{
4869	struct vnode* vnode = fd_vnode(descriptor);
4870
4871	if (vnode != NULL && vnode->mandatory_locked_by == descriptor)
4872		vnode->mandatory_locked_by = NULL;
4873}
4874
4875
4876/*!	Releases any POSIX locks on the file descriptor. */
4877status_t
4878vfs_release_posix_lock(io_context* context, struct file_descriptor* descriptor)
4879{
4880	struct vnode* vnode = descriptor->u.vnode;
4881	if (vnode == NULL)
4882		return B_OK;
4883
4884	if (HAS_FS_CALL(vnode, release_lock))
4885		return FS_CALL(vnode, release_lock, descriptor->cookie, NULL);
4886
4887	return release_advisory_lock(vnode, context, NULL, NULL);
4888}
4889
4890
4891/*!	Closes all file descriptors of the specified I/O context that
4892	have the O_CLOEXEC flag set.
4893*/
4894void
4895vfs_exec_io_context(io_context* context)
4896{
4897	uint32 i;
4898
4899	for (i = 0; i < context->table_size; i++) {
4900		mutex_lock(&context->io_mutex);
4901
4902		struct file_descriptor* descriptor = context->fds[i];
4903		bool remove = false;
4904
4905		if (descriptor != NULL && fd_close_on_exec(context, i)) {
4906			context->fds[i] = NULL;
4907			context->num_used_fds--;
4908
4909			remove = true;
4910		}
4911
4912		mutex_unlock(&context->io_mutex);
4913
4914		if (remove) {
4915			close_fd(context, descriptor);
4916			put_fd(descriptor);
4917		}
4918	}
4919}
4920
4921
4922/*! Sets up a new io_control structure, and inherits the properties
4923	of the parent io_control if it is given.
4924*/
4925io_context*
4926vfs_new_io_context(io_context* parentContext, bool purgeCloseOnExec)
4927{
4928	io_context* context = (io_context*)malloc(sizeof(io_context));
4929	if (context == NULL)
4930		return NULL;
4931
4932	TIOC(NewIOContext(context, parentContext));
4933
4934	memset(context, 0, sizeof(io_context));
4935	context->ref_count = 1;
4936
4937	MutexLocker parentLocker;
4938
4939	size_t tableSize;
4940	if (parentContext != NULL) {
4941		parentLocker.SetTo(parentContext->io_mutex, false);
4942		tableSize = parentContext->table_size;
4943	} else
4944		tableSize = DEFAULT_FD_TABLE_SIZE;
4945
4946	// allocate space for FDs and their close-on-exec flag
4947	context->fds = (file_descriptor**)malloc(
4948		sizeof(struct file_descriptor*) * tableSize
4949		+ sizeof(struct select_info**) * tableSize
4950		+ (tableSize + 7) / 8);
4951	if (context->fds == NULL) {
4952		free(context);
4953		return NULL;
4954	}
4955
4956	context->select_infos = (select_info**)(context->fds + tableSize);
4957	context->fds_close_on_exec = (uint8*)(context->select_infos + tableSize);
4958
4959	memset(context->fds, 0, sizeof(struct file_descriptor*) * tableSize
4960		+ sizeof(struct select_info**) * tableSize
4961		+ (tableSize + 7) / 8);
4962
4963	mutex_init(&context->io_mutex, "I/O context");
4964
4965	// Copy all parent file descriptors
4966
4967	if (parentContext != NULL) {
4968		size_t i;
4969
4970		mutex_lock(&sIOContextRootLock);
4971		context->root = parentContext->root;
4972		if (context->root)
4973			inc_vnode_ref_count(context->root);
4974		mutex_unlock(&sIOContextRootLock);
4975
4976		context->cwd = parentContext->cwd;
4977		if (context->cwd)
4978			inc_vnode_ref_count(context->cwd);
4979
4980		if (parentContext->inherit_fds) {
4981			for (i = 0; i < tableSize; i++) {
4982				struct file_descriptor* descriptor = parentContext->fds[i];
4983
4984				if (descriptor != NULL
4985					&& (descriptor->open_mode & O_DISCONNECTED) == 0) {
4986					bool closeOnExec = fd_close_on_exec(parentContext, i);
4987					if (closeOnExec && purgeCloseOnExec)
4988						continue;
4989
4990					TFD(InheritFD(context, i, descriptor, parentContext));
4991
4992					context->fds[i] = descriptor;
4993					context->num_used_fds++;
4994					atomic_add(&descriptor->ref_count, 1);
4995					atomic_add(&descriptor->open_count, 1);
4996
4997					if (closeOnExec)
4998						fd_set_close_on_exec(context, i, true);
4999				}
5000			}
5001		}
5002
5003		parentLocker.Unlock();
5004	} else {
5005		context->root = sRoot;
5006		context->cwd = sRoot;
5007
5008		if (context->root)
5009			inc_vnode_ref_count(context->root);
5010
5011		if (context->cwd)
5012			inc_vnode_ref_count(context->cwd);
5013	}
5014
5015	context->table_size = tableSize;
5016	context->inherit_fds = parentContext != NULL;
5017
5018	list_init(&context->node_monitors);
5019	context->max_monitors = DEFAULT_NODE_MONITORS;
5020
5021	return context;
5022}
5023
5024
5025void
5026vfs_get_io_context(io_context* context)
5027{
5028	atomic_add(&context->ref_count, 1);
5029}
5030
5031
5032void
5033vfs_put_io_context(io_context* context)
5034{
5035	if (atomic_add(&context->ref_count, -1) == 1)
5036		free_io_context(context);
5037}
5038
5039
5040status_t
5041vfs_resize_fd_table(struct io_context* context, uint32 newSize)
5042{
5043	if (newSize == 0 || newSize > MAX_FD_TABLE_SIZE)
5044		return B_BAD_VALUE;
5045
5046	TIOC(ResizeIOContext(context, newSize));
5047
5048	MutexLocker _(context->io_mutex);
5049
5050	uint32 oldSize = context->table_size;
5051	int oldCloseOnExitBitmapSize = (oldSize + 7) / 8;
5052	int newCloseOnExitBitmapSize = (newSize + 7) / 8;
5053
5054	// If the tables shrink, make sure none of the fds being dropped are in use.
5055	if (newSize < oldSize) {
5056		for (uint32 i = oldSize; i-- > newSize;) {
5057			if (context->fds[i])
5058				return B_BUSY;
5059		}
5060	}
5061
5062	// store pointers to the old tables
5063	file_descriptor** oldFDs = context->fds;
5064	select_info** oldSelectInfos = context->select_infos;
5065	uint8* oldCloseOnExecTable = context->fds_close_on_exec;
5066
5067	// allocate new tables
5068	file_descriptor** newFDs = (file_descriptor**)malloc(
5069		sizeof(struct file_descriptor*) * newSize
5070		+ sizeof(struct select_infos**) * newSize
5071		+ newCloseOnExitBitmapSize);
5072	if (newFDs == NULL)
5073		return B_NO_MEMORY;
5074
5075	context->fds = newFDs;
5076	context->select_infos = (select_info**)(context->fds + newSize);
5077	context->fds_close_on_exec = (uint8*)(context->select_infos + newSize);
5078	context->table_size = newSize;
5079
5080	// copy entries from old tables
5081	uint32 toCopy = min_c(oldSize, newSize);
5082
5083	memcpy(context->fds, oldFDs, sizeof(void*) * toCopy);
5084	memcpy(context->select_infos, oldSelectInfos, sizeof(void*) * toCopy);
5085	memcpy(context->fds_close_on_exec, oldCloseOnExecTable,
5086		min_c(oldCloseOnExitBitmapSize, newCloseOnExitBitmapSize));
5087
5088	// clear additional entries, if the tables grow
5089	if (newSize > oldSize) {
5090		memset(context->fds + oldSize, 0, sizeof(void*) * (newSize - oldSize));
5091		memset(context->select_infos + oldSize, 0,
5092			sizeof(void*) * (newSize - oldSize));
5093		memset(context->fds_close_on_exec + oldCloseOnExitBitmapSize, 0,
5094			newCloseOnExitBitmapSize - oldCloseOnExitBitmapSize);
5095	}
5096
5097	free(oldFDs);
5098
5099	return B_OK;
5100}
5101
5102
5103/*!	\brief Resolves a vnode to the vnode it is covered by, if any.
5104
5105	Given an arbitrary vnode (identified by mount and node ID), the function
5106	checks, whether the vnode is covered by another vnode. If it is, the
5107	function returns the mount and node ID of the covering vnode. Otherwise
5108	it simply returns the supplied mount and node ID.
5109
5110	In case of error (e.g. the supplied node could not be found) the variables
5111	for storing the resolved mount and node ID remain untouched and an error
5112	code is returned.
5113
5114	\param mountID The mount ID of the vnode in question.
5115	\param nodeID The node ID of the vnode in question.
5116	\param resolvedMountID Pointer to storage for the resolved mount ID.
5117	\param resolvedNodeID Pointer to storage for the resolved node ID.
5118	\return
5119	- \c B_OK, if everything went fine,
5120	- another error code, if something went wrong.
5121*/
5122status_t
5123vfs_resolve_vnode_to_covering_vnode(dev_t mountID, ino_t nodeID,
5124	dev_t* resolvedMountID, ino_t* resolvedNodeID)
5125{
5126	// get the node
5127	struct vnode* node;
5128	status_t error = get_vnode(mountID, nodeID, &node, true, false);
5129	if (error != B_OK)
5130		return error;
5131
5132	// resolve the node
5133	if (Vnode* coveringNode = get_covering_vnode(node)) {
5134		put_vnode(node);
5135		node = coveringNode;
5136	}
5137
5138	// set the return values
5139	*resolvedMountID = node->device;
5140	*resolvedNodeID = node->id;
5141
5142	put_vnode(node);
5143
5144	return B_OK;
5145}
5146
5147
5148status_t
5149vfs_get_mount_point(dev_t mountID, dev_t* _mountPointMountID,
5150	ino_t* _mountPointNodeID)
5151{
5152	ReadLocker nodeLocker(sVnodeLock);
5153	ReadLocker mountLocker(sMountLock);
5154
5155	struct fs_mount* mount = find_mount(mountID);
5156	if (mount == NULL)
5157		return B_BAD_VALUE;
5158
5159	Vnode* mountPoint = mount->covers_vnode;
5160
5161	*_mountPointMountID = mountPoint->device;
5162	*_mountPointNodeID = mountPoint->id;
5163
5164	return B_OK;
5165}
5166
5167
5168status_t
5169vfs_bind_mount_directory(dev_t mountID, ino_t nodeID, dev_t coveredMountID,
5170	ino_t coveredNodeID)
5171{
5172	// get the vnodes
5173	Vnode* vnode;
5174	status_t error = get_vnode(mountID, nodeID, &vnode, true, false);
5175	if (error != B_OK)
5176		return B_BAD_VALUE;
5177	VnodePutter vnodePutter(vnode);
5178
5179	Vnode* coveredVnode;
5180	error = get_vnode(coveredMountID, coveredNodeID, &coveredVnode, true,
5181		false);
5182	if (error != B_OK)
5183		return B_BAD_VALUE;
5184	VnodePutter coveredVnodePutter(coveredVnode);
5185
5186	// establish the covered/covering links
5187	WriteLocker locker(sVnodeLock);
5188
5189	if (vnode->covers != NULL || coveredVnode->covered_by != NULL
5190		|| vnode->mount->unmounting || coveredVnode->mount->unmounting) {
5191		return B_BUSY;
5192	}
5193
5194	vnode->covers = coveredVnode;
5195	vnode->SetCovering(true);
5196
5197	coveredVnode->covered_by = vnode;
5198	coveredVnode->SetCovered(true);
5199
5200	// the vnodes do now reference each other
5201	inc_vnode_ref_count(vnode);
5202	inc_vnode_ref_count(coveredVnode);
5203
5204	return B_OK;
5205}
5206
5207
5208int
5209vfs_getrlimit(int resource, struct rlimit* rlp)
5210{
5211	if (!rlp)
5212		return B_BAD_ADDRESS;
5213
5214	switch (resource) {
5215		case RLIMIT_NOFILE:
5216		{
5217			struct io_context* context = get_current_io_context(false);
5218			MutexLocker _(context->io_mutex);
5219
5220			rlp->rlim_cur = context->table_size;
5221			rlp->rlim_max = MAX_FD_TABLE_SIZE;
5222			return 0;
5223		}
5224
5225		case RLIMIT_NOVMON:
5226		{
5227			struct io_context* context = get_current_io_context(false);
5228			MutexLocker _(context->io_mutex);
5229
5230			rlp->rlim_cur = context->max_monitors;
5231			rlp->rlim_max = MAX_NODE_MONITORS;
5232			return 0;
5233		}
5234
5235		default:
5236			return B_BAD_VALUE;
5237	}
5238}
5239
5240
5241int
5242vfs_setrlimit(int resource, const struct rlimit* rlp)
5243{
5244	if (!rlp)
5245		return B_BAD_ADDRESS;
5246
5247	switch (resource) {
5248		case RLIMIT_NOFILE:
5249			/* TODO: check getuid() */
5250			if (rlp->rlim_max != RLIM_SAVED_MAX
5251				&& rlp->rlim_max != MAX_FD_TABLE_SIZE)
5252				return B_NOT_ALLOWED;
5253
5254			return vfs_resize_fd_table(get_current_io_context(false),
5255				rlp->rlim_cur);
5256
5257		case RLIMIT_NOVMON:
5258			/* TODO: check getuid() */
5259			if (rlp->rlim_max != RLIM_SAVED_MAX
5260				&& rlp->rlim_max != MAX_NODE_MONITORS)
5261				return B_NOT_ALLOWED;
5262
5263			return resize_monitor_table(get_current_io_context(false),
5264				rlp->rlim_cur);
5265
5266		default:
5267			return B_BAD_VALUE;
5268	}
5269}
5270
5271
5272status_t
5273vfs_init(kernel_args* args)
5274{
5275	vnode::StaticInit();
5276
5277	sVnodeTable = new(std::nothrow) VnodeTable();
5278	if (sVnodeTable == NULL || sVnodeTable->Init(VNODE_HASH_TABLE_SIZE) != B_OK)
5279		panic("vfs_init: error creating vnode hash table\n");
5280
5281	struct vnode dummy_vnode;
5282	list_init_etc(&sUnusedVnodeList, offset_of_member(dummy_vnode, unused_link));
5283
5284	struct fs_mount dummyMount;
5285	sMountsTable = new(std::nothrow) MountTable();
5286	if (sMountsTable == NULL
5287			|| sMountsTable->Init(MOUNTS_HASH_TABLE_SIZE) != B_OK)
5288		panic("vfs_init: error creating mounts hash table\n");
5289
5290	sPathNameCache = create_object_cache("vfs path names",
5291		B_PATH_NAME_LENGTH + 1, 8, NULL, NULL, NULL);
5292	if (sPathNameCache == NULL)
5293		panic("vfs_init: error creating path name object_cache\n");
5294
5295	sVnodeCache = create_object_cache("vfs vnodes",
5296		sizeof(struct vnode), 8, NULL, NULL, NULL);
5297	if (sVnodeCache == NULL)
5298		panic("vfs_init: error creating vnode object_cache\n");
5299
5300	sFileDescriptorCache = create_object_cache("vfs fds",
5301		sizeof(file_descriptor), 8, NULL, NULL, NULL);
5302	if (sFileDescriptorCache == NULL)
5303		panic("vfs_init: error creating file descriptor object_cache\n");
5304
5305	node_monitor_init();
5306
5307	sRoot = NULL;
5308
5309	recursive_lock_init(&sMountOpLock, "vfs_mount_op_lock");
5310
5311	if (block_cache_init() != B_OK)
5312		return B_ERROR;
5313
5314#ifdef ADD_DEBUGGER_COMMANDS
5315	// add some debugger commands
5316	add_debugger_command_etc("vnode", &dump_vnode,
5317		"Print info about the specified vnode",
5318		"[ \"-p\" ] ( <vnode> | <devID> <nodeID> )\n"
5319		"Prints information about the vnode specified by address <vnode> or\n"
5320		"<devID>, <vnodeID> pair. If \"-p\" is given, a path of the vnode is\n"
5321		"constructed and printed. It might not be possible to construct a\n"
5322		"complete path, though.\n",
5323		0);
5324	add_debugger_command("vnodes", &dump_vnodes,
5325		"list all vnodes (from the specified device)");
5326	add_debugger_command("vnode_caches", &dump_vnode_caches,
5327		"list all vnode caches");
5328	add_debugger_command("mount", &dump_mount,
5329		"info about the specified fs_mount");
5330	add_debugger_command("mounts", &dump_mounts, "list all fs_mounts");
5331	add_debugger_command("io_context", &dump_io_context,
5332		"info about the I/O context");
5333	add_debugger_command("vnode_usage", &dump_vnode_usage,
5334		"info about vnode usage");
5335#endif
5336
5337	register_low_resource_handler(&vnode_low_resource_handler, NULL,
5338		B_KERNEL_RESOURCE_PAGES | B_KERNEL_RESOURCE_MEMORY
5339			| B_KERNEL_RESOURCE_ADDRESS_SPACE,
5340		0);
5341
5342	fifo_init();
5343	file_map_init();
5344
5345	return file_cache_init();
5346}
5347
5348
5349//	#pragma mark - fd_ops implementations
5350
5351
5352/*!
5353	Calls fs_open() on the given vnode and returns a new
5354	file descriptor for it
5355*/
5356static int
5357open_vnode(struct vnode* vnode, int openMode, bool kernel)
5358{
5359	void* cookie;
5360	status_t status = FS_CALL(vnode, open, openMode, &cookie);
5361	if (status != B_OK)
5362		return status;
5363
5364	int fd = get_new_fd(FDTYPE_FILE, NULL, vnode, cookie, openMode, kernel);
5365	if (fd < 0) {
5366		FS_CALL(vnode, close, cookie);
5367		FS_CALL(vnode, free_cookie, cookie);
5368	}
5369	return fd;
5370}
5371
5372
5373/*!
5374	Calls fs_open() on the given vnode and returns a new
5375	file descriptor for it
5376*/
5377static int
5378create_vnode(struct vnode* directory, const char* name, int openMode,
5379	int perms, bool kernel)
5380{
5381	bool traverse = ((openMode & (O_NOTRAVERSE | O_NOFOLLOW)) == 0);
5382	status_t status = B_ERROR;
5383	VnodePutter vnode, dirPutter;
5384	void* cookie;
5385	ino_t newID;
5386	char clonedName[B_FILE_NAME_LENGTH + 1];
5387
5388	// This is somewhat tricky: If the entry already exists, the FS responsible
5389	// for the directory might not necessarily also be the one responsible for
5390	// the node the entry refers to (e.g. in case of mount points or FIFOs). So
5391	// we can actually never call the create() hook without O_EXCL. Instead we
5392	// try to look the entry up first. If it already exists, we just open the
5393	// node (unless O_EXCL), otherwise we call create() with O_EXCL. This
5394	// introduces a race condition, since someone else might have created the
5395	// entry in the meantime. We hope the respective FS returns the correct
5396	// error code and retry (up to 3 times) again.
5397
5398	for (int i = 0; i < 3 && status != B_OK; i++) {
5399		bool create = false;
5400
5401		// look the node up
5402		{
5403			struct vnode* entry = NULL;
5404			status = lookup_dir_entry(directory, name, &entry);
5405			vnode.SetTo(entry);
5406		}
5407		if (status == B_OK) {
5408			if ((openMode & O_EXCL) != 0)
5409				return B_FILE_EXISTS;
5410
5411			// If the node is a symlink, we have to follow it, unless
5412			// O_NOTRAVERSE is set.
5413			if (S_ISLNK(vnode->Type()) && traverse) {
5414				vnode.Unset();
5415				if (strlcpy(clonedName, name, B_FILE_NAME_LENGTH)
5416						>= B_FILE_NAME_LENGTH) {
5417					return B_NAME_TOO_LONG;
5418				}
5419
5420				inc_vnode_ref_count(directory);
5421				dirPutter.Unset();
5422				status = vnode_path_to_vnode(directory, clonedName, true,
5423					kernel, vnode, NULL, clonedName);
5424				if (status != B_OK) {
5425					// vnode is not found, but maybe it has a parent and we can create it from
5426					// there. In that case, vnode_path_to_vnode has set vnode to the latest
5427					// directory found in the path
5428					if (status == B_ENTRY_NOT_FOUND) {
5429						directory = vnode.Detach();
5430						dirPutter.SetTo(directory);
5431						name = clonedName;
5432						create = true;
5433					} else
5434						return status;
5435				}
5436			}
5437
5438			if (!create) {
5439				if ((openMode & O_NOFOLLOW) != 0 && S_ISLNK(vnode->Type()))
5440					return B_LINK_LIMIT;
5441
5442				int fd = open_vnode(vnode.Get(), openMode & ~O_CREAT, kernel);
5443				// on success keep the vnode reference for the FD
5444				if (fd >= 0)
5445					vnode.Detach();
5446
5447				return fd;
5448			}
5449		}
5450
5451		// it doesn't exist yet -- try to create it
5452
5453		if (!HAS_FS_CALL(directory, create))
5454			return B_READ_ONLY_DEVICE;
5455
5456		status = FS_CALL(directory, create, name, openMode | O_EXCL, perms,
5457			&cookie, &newID);
5458		if (status != B_OK
5459			&& ((openMode & O_EXCL) != 0 || status != B_FILE_EXISTS)) {
5460			return status;
5461		}
5462	}
5463
5464	if (status != B_OK)
5465		return status;
5466
5467	// the node has been created successfully
5468
5469	rw_lock_read_lock(&sVnodeLock);
5470	vnode.SetTo(lookup_vnode(directory->device, newID));
5471	rw_lock_read_unlock(&sVnodeLock);
5472
5473	if (!vnode.IsSet()) {
5474		panic("vfs: fs_create() returned success but there is no vnode, "
5475			"mount ID %" B_PRIdDEV "!\n", directory->device);
5476		return B_BAD_VALUE;
5477	}
5478
5479	int fd = get_new_fd(FDTYPE_FILE, NULL, vnode.Get(), cookie, openMode, kernel);
5480	if (fd >= 0) {
5481		vnode.Detach();
5482		return fd;
5483	}
5484
5485	status = fd;
5486
5487	// something went wrong, clean up
5488
5489	FS_CALL(vnode.Get(), close, cookie);
5490	FS_CALL(vnode.Get(), free_cookie, cookie);
5491
5492	FS_CALL(directory, unlink, name);
5493
5494	return status;
5495}
5496
5497
5498/*! Calls fs open_dir() on the given vnode and returns a new
5499	file descriptor for it
5500*/
5501static int
5502open_dir_vnode(struct vnode* vnode, bool kernel)
5503{
5504	if (!HAS_FS_CALL(vnode, open_dir))
5505		return B_UNSUPPORTED;
5506
5507	void* cookie;
5508	status_t status = FS_CALL(vnode, open_dir, &cookie);
5509	if (status != B_OK)
5510		return status;
5511
5512	// directory is opened, create a fd
5513	status = get_new_fd(FDTYPE_DIR, NULL, vnode, cookie, O_CLOEXEC, kernel);
5514	if (status >= 0)
5515		return status;
5516
5517	FS_CALL(vnode, close_dir, cookie);
5518	FS_CALL(vnode, free_dir_cookie, cookie);
5519
5520	return status;
5521}
5522
5523
5524/*! Calls fs open_attr_dir() on the given vnode and returns a new
5525	file descriptor for it.
5526	Used by attr_dir_open(), and attr_dir_open_fd().
5527*/
5528static int
5529open_attr_dir_vnode(struct vnode* vnode, bool kernel)
5530{
5531	if (!HAS_FS_CALL(vnode, open_attr_dir))
5532		return B_UNSUPPORTED;
5533
5534	void* cookie;
5535	status_t status = FS_CALL(vnode, open_attr_dir, &cookie);
5536	if (status != B_OK)
5537		return status;
5538
5539	// directory is opened, create a fd
5540	status = get_new_fd(FDTYPE_ATTR_DIR, NULL, vnode, cookie, O_CLOEXEC,
5541		kernel);
5542	if (status >= 0)
5543		return status;
5544
5545	FS_CALL(vnode, close_attr_dir, cookie);
5546	FS_CALL(vnode, free_attr_dir_cookie, cookie);
5547
5548	return status;
5549}
5550
5551
5552static int
5553file_create_entry_ref(dev_t mountID, ino_t directoryID, const char* name,
5554	int openMode, int perms, bool kernel)
5555{
5556	FUNCTION(("file_create_entry_ref: name = '%s', omode %x, perms %d, "
5557		"kernel %d\n", name, openMode, perms, kernel));
5558
5559	// get directory to put the new file in
5560	struct vnode* directory;
5561	status_t status = get_vnode(mountID, directoryID, &directory, true, false);
5562	if (status != B_OK)
5563		return status;
5564
5565	status = create_vnode(directory, name, openMode, perms, kernel);
5566	put_vnode(directory);
5567
5568	return status;
5569}
5570
5571
5572static int
5573file_create(int fd, char* path, int openMode, int perms, bool kernel)
5574{
5575	FUNCTION(("file_create: path '%s', omode %x, perms %d, kernel %d\n", path,
5576		openMode, perms, kernel));
5577
5578	// get directory to put the new file in
5579	char name[B_FILE_NAME_LENGTH];
5580	VnodePutter directory;
5581	status_t status = fd_and_path_to_dir_vnode(fd, path, directory, name,
5582		kernel);
5583	if (status < 0)
5584		return status;
5585
5586	return create_vnode(directory.Get(), name, openMode, perms, kernel);
5587}
5588
5589
5590static int
5591file_open_entry_ref(dev_t mountID, ino_t directoryID, const char* name,
5592	int openMode, bool kernel)
5593{
5594	if (name == NULL || *name == '\0')
5595		return B_BAD_VALUE;
5596
5597	FUNCTION(("file_open_entry_ref(ref = (%" B_PRId32 ", %" B_PRId64 ", %s), "
5598		"openMode = %d)\n", mountID, directoryID, name, openMode));
5599
5600	bool traverse = (openMode & (O_NOTRAVERSE | O_NOFOLLOW)) == 0;
5601
5602	// get the vnode matching the entry_ref
5603	VnodePutter vnode;
5604	status_t status = entry_ref_to_vnode(mountID, directoryID, name, traverse,
5605		kernel, vnode);
5606	if (status != B_OK)
5607		return status;
5608
5609	if ((openMode & O_NOFOLLOW) != 0 && S_ISLNK(vnode->Type()))
5610		return B_LINK_LIMIT;
5611
5612	int newFD = open_vnode(vnode.Get(), openMode, kernel);
5613	if (newFD >= 0) {
5614		cache_node_opened(vnode.Get(), FDTYPE_FILE, vnode->cache, mountID,
5615			directoryID, vnode->id, name);
5616
5617		// The vnode reference has been transferred to the FD
5618		vnode.Detach();
5619	}
5620
5621	return newFD;
5622}
5623
5624
5625static int
5626file_open(int fd, char* path, int openMode, bool kernel)
5627{
5628	bool traverse = (openMode & (O_NOTRAVERSE | O_NOFOLLOW)) == 0;
5629
5630	FUNCTION(("file_open: fd: %d, entry path = '%s', omode %d, kernel %d\n",
5631		fd, path, openMode, kernel));
5632
5633	// get the vnode matching the vnode + path combination
5634	VnodePutter vnode;
5635	ino_t parentID;
5636	status_t status = fd_and_path_to_vnode(fd, path, traverse, vnode,
5637		&parentID, kernel);
5638	if (status != B_OK)
5639		return status;
5640
5641	if ((openMode & O_NOFOLLOW) != 0 && S_ISLNK(vnode->Type()))
5642		return B_LINK_LIMIT;
5643
5644	// open the vnode
5645	int newFD = open_vnode(vnode.Get(), openMode, kernel);
5646	if (newFD >= 0) {
5647		cache_node_opened(vnode.Get(), FDTYPE_FILE, vnode->cache,
5648			vnode->device, parentID, vnode->id, NULL);
5649
5650		// The vnode reference has been transferred to the FD
5651		vnode.Detach();
5652	}
5653
5654	return newFD;
5655}
5656
5657
5658static status_t
5659file_close(struct file_descriptor* descriptor)
5660{
5661	struct vnode* vnode = descriptor->u.vnode;
5662	status_t status = B_OK;
5663
5664	FUNCTION(("file_close(descriptor = %p)\n", descriptor));
5665
5666	cache_node_closed(vnode, FDTYPE_FILE, vnode->cache, vnode->device,
5667		vnode->id);
5668	if (HAS_FS_CALL(vnode, close)) {
5669		status = FS_CALL(vnode, close, descriptor->cookie);
5670	}
5671
5672	if (status == B_OK) {
5673		// remove all outstanding locks for this team
5674		if (HAS_FS_CALL(vnode, release_lock))
5675			status = FS_CALL(vnode, release_lock, descriptor->cookie, NULL);
5676		else
5677			status = release_advisory_lock(vnode, NULL, descriptor, NULL);
5678	}
5679	return status;
5680}
5681
5682
5683static void
5684file_free_fd(struct file_descriptor* descriptor)
5685{
5686	struct vnode* vnode = descriptor->u.vnode;
5687
5688	if (vnode != NULL) {
5689		FS_CALL(vnode, free_cookie, descriptor->cookie);
5690		put_vnode(vnode);
5691	}
5692}
5693
5694
5695static status_t
5696file_read(struct file_descriptor* descriptor, off_t pos, void* buffer,
5697	size_t* length)
5698{
5699	struct vnode* vnode = descriptor->u.vnode;
5700	FUNCTION(("file_read: buf %p, pos %" B_PRIdOFF ", len %p = %ld\n", buffer,
5701		pos, length, *length));
5702
5703	if (S_ISDIR(vnode->Type()))
5704		return B_IS_A_DIRECTORY;
5705	if (pos != -1 && descriptor->pos == -1)
5706		return ESPIPE;
5707
5708	return FS_CALL(vnode, read, descriptor->cookie, pos, buffer, length);
5709}
5710
5711
5712static status_t
5713file_write(struct file_descriptor* descriptor, off_t pos, const void* buffer,
5714	size_t* length)
5715{
5716	struct vnode* vnode = descriptor->u.vnode;
5717	FUNCTION(("file_write: buf %p, pos %" B_PRIdOFF ", len %p\n", buffer, pos,
5718		length));
5719
5720	if (S_ISDIR(vnode->Type()))
5721		return B_IS_A_DIRECTORY;
5722	if (pos != -1 && descriptor->pos == -1)
5723		return ESPIPE;
5724
5725	if (!HAS_FS_CALL(vnode, write))
5726		return B_READ_ONLY_DEVICE;
5727
5728	return FS_CALL(vnode, write, descriptor->cookie, pos, buffer, length);
5729}
5730
5731
5732static off_t
5733file_seek(struct file_descriptor* descriptor, off_t pos, int seekType)
5734{
5735	struct vnode* vnode = descriptor->u.vnode;
5736	off_t offset;
5737	bool isDevice = false;
5738
5739	FUNCTION(("file_seek(pos = %" B_PRIdOFF ", seekType = %d)\n", pos,
5740		seekType));
5741
5742	if (descriptor->pos == -1)
5743		return ESPIPE;
5744
5745	switch (vnode->Type() & S_IFMT) {
5746		// drivers publish block devices as chr, so pick both
5747		case S_IFBLK:
5748		case S_IFCHR:
5749			isDevice = true;
5750			break;
5751	}
5752
5753	switch (seekType) {
5754		case SEEK_SET:
5755			offset = 0;
5756			break;
5757		case SEEK_CUR:
5758			offset = descriptor->pos;
5759			break;
5760		case SEEK_END:
5761		{
5762			// stat() the node
5763			if (!HAS_FS_CALL(vnode, read_stat))
5764				return B_UNSUPPORTED;
5765
5766			struct stat stat;
5767			status_t status = FS_CALL(vnode, read_stat, &stat);
5768			if (status != B_OK)
5769				return status;
5770
5771			offset = stat.st_size;
5772
5773			if (offset == 0 && isDevice) {
5774				// stat() on regular drivers doesn't report size
5775				device_geometry geometry;
5776
5777				if (HAS_FS_CALL(vnode, ioctl)) {
5778					status = FS_CALL(vnode, ioctl, descriptor->cookie,
5779						B_GET_GEOMETRY, &geometry, sizeof(geometry));
5780					if (status == B_OK)
5781						offset = (off_t)geometry.bytes_per_sector
5782							* geometry.sectors_per_track
5783							* geometry.cylinder_count
5784							* geometry.head_count;
5785				}
5786			}
5787
5788			break;
5789		}
5790		case SEEK_DATA:
5791		case SEEK_HOLE:
5792		{
5793			status_t status = B_BAD_VALUE;
5794			if (HAS_FS_CALL(vnode, ioctl)) {
5795				offset = pos;
5796				status = FS_CALL(vnode, ioctl, descriptor->cookie,
5797					seekType == SEEK_DATA ? FIOSEEKDATA : FIOSEEKHOLE,
5798					&offset, sizeof(offset));
5799				if (status == B_OK) {
5800					if (offset > pos)
5801						offset -= pos;
5802					break;
5803				}
5804			}
5805			if (status != B_BAD_VALUE && status != B_DEV_INVALID_IOCTL)
5806				return status;
5807
5808			// basic implementation with stat() the node
5809			if (!HAS_FS_CALL(vnode, read_stat) || isDevice)
5810				return B_BAD_VALUE;
5811
5812			struct stat stat;
5813			status = FS_CALL(vnode, read_stat, &stat);
5814			if (status != B_OK)
5815				return status;
5816
5817			off_t end = stat.st_size;
5818			if (pos >= end)
5819				return ENXIO;
5820			offset = seekType == SEEK_HOLE ? end - pos : 0;
5821			break;
5822		}
5823		default:
5824			return B_BAD_VALUE;
5825	}
5826
5827	// assumes off_t is 64 bits wide
5828	if (offset > 0 && LONGLONG_MAX - offset < pos)
5829		return B_BUFFER_OVERFLOW;
5830
5831	pos += offset;
5832	if (pos < 0)
5833		return B_BAD_VALUE;
5834
5835	return descriptor->pos = pos;
5836}
5837
5838
5839static status_t
5840file_select(struct file_descriptor* descriptor, uint8 event,
5841	struct selectsync* sync)
5842{
5843	FUNCTION(("file_select(%p, %u, %p)\n", descriptor, event, sync));
5844
5845	struct vnode* vnode = descriptor->u.vnode;
5846
5847	// If the FS has no select() hook, notify select() now.
5848	if (!HAS_FS_CALL(vnode, select)) {
5849		if (!SELECT_TYPE_IS_OUTPUT_ONLY(event))
5850			return notify_select_event(sync, event);
5851		else
5852			return B_OK;
5853	}
5854
5855	return FS_CALL(vnode, select, descriptor->cookie, event, sync);
5856}
5857
5858
5859static status_t
5860file_deselect(struct file_descriptor* descriptor, uint8 event,
5861	struct selectsync* sync)
5862{
5863	struct vnode* vnode = descriptor->u.vnode;
5864
5865	if (!HAS_FS_CALL(vnode, deselect))
5866		return B_OK;
5867
5868	return FS_CALL(vnode, deselect, descriptor->cookie, event, sync);
5869}
5870
5871
5872static status_t
5873dir_create_entry_ref(dev_t mountID, ino_t parentID, const char* name, int perms,
5874	bool kernel)
5875{
5876	struct vnode* vnode;
5877	status_t status;
5878
5879	if (name == NULL || *name == '\0')
5880		return B_BAD_VALUE;
5881
5882	FUNCTION(("dir_create_entry_ref(dev = %" B_PRId32 ", ino = %" B_PRId64 ", "
5883		"name = '%s', perms = %d)\n", mountID, parentID, name, perms));
5884
5885	status = get_vnode(mountID, parentID, &vnode, true, false);
5886	if (status != B_OK)
5887		return status;
5888
5889	if (HAS_FS_CALL(vnode, create_dir))
5890		status = FS_CALL(vnode, create_dir, name, perms);
5891	else
5892		status = B_READ_ONLY_DEVICE;
5893
5894	put_vnode(vnode);
5895	return status;
5896}
5897
5898
5899static status_t
5900dir_create(int fd, char* path, int perms, bool kernel)
5901{
5902	char filename[B_FILE_NAME_LENGTH];
5903	status_t status;
5904
5905	FUNCTION(("dir_create: path '%s', perms %d, kernel %d\n", path, perms,
5906		kernel));
5907
5908	VnodePutter vnode;
5909	status = fd_and_path_to_dir_vnode(fd, path, vnode, filename, kernel);
5910	if (status < 0)
5911		return status;
5912
5913	if (HAS_FS_CALL(vnode, create_dir)) {
5914		status = FS_CALL(vnode.Get(), create_dir, filename, perms);
5915	} else
5916		status = B_READ_ONLY_DEVICE;
5917
5918	return status;
5919}
5920
5921
5922static int
5923dir_open_entry_ref(dev_t mountID, ino_t parentID, const char* name, bool kernel)
5924{
5925	FUNCTION(("dir_open_entry_ref()\n"));
5926
5927	if (name && name[0] == '\0')
5928		return B_BAD_VALUE;
5929
5930	// get the vnode matching the entry_ref/node_ref
5931	VnodePutter vnode;
5932	status_t status;
5933	if (name) {
5934		status = entry_ref_to_vnode(mountID, parentID, name, true, kernel,
5935			vnode);
5936	} else {
5937		struct vnode* temp = NULL;
5938		status = get_vnode(mountID, parentID, &temp, true, false);
5939		vnode.SetTo(temp);
5940	}
5941	if (status != B_OK)
5942		return status;
5943
5944	int newFD = open_dir_vnode(vnode.Get(), kernel);
5945	if (newFD >= 0) {
5946		cache_node_opened(vnode.Get(), FDTYPE_DIR, vnode->cache, mountID, parentID,
5947			vnode->id, name);
5948
5949		// The vnode reference has been transferred to the FD
5950		vnode.Detach();
5951	}
5952
5953	return newFD;
5954}
5955
5956
5957static int
5958dir_open(int fd, char* path, bool kernel)
5959{
5960	FUNCTION(("dir_open: fd: %d, entry path = '%s', kernel %d\n", fd, path,
5961		kernel));
5962
5963	// get the vnode matching the vnode + path combination
5964	VnodePutter vnode;
5965	ino_t parentID;
5966	status_t status = fd_and_path_to_vnode(fd, path, true, vnode, &parentID,
5967		kernel);
5968	if (status != B_OK)
5969		return status;
5970
5971	// open the dir
5972	int newFD = open_dir_vnode(vnode.Get(), kernel);
5973	if (newFD >= 0) {
5974		cache_node_opened(vnode.Get(), FDTYPE_DIR, vnode->cache, vnode->device,
5975			parentID, vnode->id, NULL);
5976
5977		// The vnode reference has been transferred to the FD
5978		vnode.Detach();
5979	}
5980
5981	return newFD;
5982}
5983
5984
5985static status_t
5986dir_close(struct file_descriptor* descriptor)
5987{
5988	struct vnode* vnode = descriptor->u.vnode;
5989
5990	FUNCTION(("dir_close(descriptor = %p)\n", descriptor));
5991
5992	cache_node_closed(vnode, FDTYPE_DIR, vnode->cache, vnode->device,
5993		vnode->id);
5994	if (HAS_FS_CALL(vnode, close_dir))
5995		return FS_CALL(vnode, close_dir, descriptor->cookie);
5996
5997	return B_OK;
5998}
5999
6000
6001static void
6002dir_free_fd(struct file_descriptor* descriptor)
6003{
6004	struct vnode* vnode = descriptor->u.vnode;
6005
6006	if (vnode != NULL) {
6007		FS_CALL(vnode, free_dir_cookie, descriptor->cookie);
6008		put_vnode(vnode);
6009	}
6010}
6011
6012
6013static status_t
6014dir_read(struct io_context* ioContext, struct file_descriptor* descriptor,
6015	struct dirent* buffer, size_t bufferSize, uint32* _count)
6016{
6017	return dir_read(ioContext, descriptor->u.vnode, descriptor->cookie, buffer,
6018		bufferSize, _count);
6019}
6020
6021
6022static status_t
6023fix_dirent(struct vnode* parent, struct dirent* entry,
6024	struct io_context* ioContext)
6025{
6026	// set d_pdev and d_pino
6027	entry->d_pdev = parent->device;
6028	entry->d_pino = parent->id;
6029
6030	// If this is the ".." entry and the directory covering another vnode,
6031	// we need to replace d_dev and d_ino with the actual values.
6032	if (strcmp(entry->d_name, "..") == 0 && parent->IsCovering()) {
6033		return resolve_covered_parent(parent, &entry->d_dev, &entry->d_ino,
6034			ioContext);
6035	}
6036
6037	// resolve covered vnodes
6038	ReadLocker _(&sVnodeLock);
6039
6040	struct vnode* vnode = lookup_vnode(entry->d_dev, entry->d_ino);
6041	if (vnode != NULL && vnode->covered_by != NULL) {
6042		do {
6043			vnode = vnode->covered_by;
6044		} while (vnode->covered_by != NULL);
6045
6046		entry->d_dev = vnode->device;
6047		entry->d_ino = vnode->id;
6048	}
6049
6050	return B_OK;
6051}
6052
6053
6054static status_t
6055dir_read(struct io_context* ioContext, struct vnode* vnode, void* cookie,
6056	struct dirent* buffer, size_t bufferSize, uint32* _count)
6057{
6058	if (!HAS_FS_CALL(vnode, read_dir))
6059		return B_UNSUPPORTED;
6060
6061	status_t error = FS_CALL(vnode, read_dir, cookie, buffer, bufferSize,
6062		_count);
6063	if (error != B_OK)
6064		return error;
6065
6066	// we need to adjust the read dirents
6067	uint32 count = *_count;
6068	for (uint32 i = 0; i < count; i++) {
6069		error = fix_dirent(vnode, buffer, ioContext);
6070		if (error != B_OK)
6071			return error;
6072
6073		buffer = (struct dirent*)((uint8*)buffer + buffer->d_reclen);
6074	}
6075
6076	return error;
6077}
6078
6079
6080static status_t
6081dir_rewind(struct file_descriptor* descriptor)
6082{
6083	struct vnode* vnode = descriptor->u.vnode;
6084
6085	if (HAS_FS_CALL(vnode, rewind_dir)) {
6086		return FS_CALL(vnode, rewind_dir, descriptor->cookie);
6087	}
6088
6089	return B_UNSUPPORTED;
6090}
6091
6092
6093static status_t
6094dir_remove(int fd, char* path, bool kernel)
6095{
6096	char name[B_FILE_NAME_LENGTH];
6097	status_t status;
6098
6099	if (path != NULL) {
6100		// we need to make sure our path name doesn't stop with "/", ".",
6101		// or ".."
6102		char* lastSlash;
6103		while ((lastSlash = strrchr(path, '/')) != NULL) {
6104			char* leaf = lastSlash + 1;
6105			if (!strcmp(leaf, ".."))
6106				return B_NOT_ALLOWED;
6107
6108			// omit multiple slashes
6109			while (lastSlash > path && lastSlash[-1] == '/')
6110				lastSlash--;
6111
6112			if (leaf[0]
6113				&& strcmp(leaf, ".")) {
6114				break;
6115			}
6116			// "name/" -> "name", or "name/." -> "name"
6117			lastSlash[0] = '\0';
6118		}
6119
6120		if (!strcmp(path, ".") || !strcmp(path, ".."))
6121			return B_NOT_ALLOWED;
6122	}
6123
6124	VnodePutter directory;
6125	status = fd_and_path_to_dir_vnode(fd, path, directory, name, kernel);
6126	if (status != B_OK)
6127		return status;
6128
6129	if (HAS_FS_CALL(directory, remove_dir))
6130		status = FS_CALL(directory.Get(), remove_dir, name);
6131	else
6132		status = B_READ_ONLY_DEVICE;
6133
6134	return status;
6135}
6136
6137
6138static status_t
6139common_ioctl(struct file_descriptor* descriptor, ulong op, void* buffer,
6140	size_t length)
6141{
6142	struct vnode* vnode = descriptor->u.vnode;
6143
6144	if (HAS_FS_CALL(vnode, ioctl))
6145		return FS_CALL(vnode, ioctl, descriptor->cookie, op, buffer, length);
6146
6147	return B_DEV_INVALID_IOCTL;
6148}
6149
6150
6151static status_t
6152common_fcntl(int fd, int op, size_t argument, bool kernel)
6153{
6154	struct flock flock;
6155
6156	FUNCTION(("common_fcntl(fd = %d, op = %d, argument = %lx, %s)\n",
6157		fd, op, argument, kernel ? "kernel" : "user"));
6158
6159	struct io_context* context = get_current_io_context(kernel);
6160
6161	FileDescriptorPutter descriptor(get_fd(context, fd));
6162	if (!descriptor.IsSet())
6163		return B_FILE_ERROR;
6164
6165	struct vnode* vnode = fd_vnode(descriptor.Get());
6166
6167	status_t status = B_OK;
6168
6169	if (op == F_SETLK || op == F_SETLKW || op == F_GETLK) {
6170		if (descriptor->type != FDTYPE_FILE)
6171			status = B_BAD_VALUE;
6172		else if (kernel)
6173			memcpy(&flock, (struct flock*)argument, sizeof(struct flock));
6174		else if (user_memcpy(&flock, (struct flock*)argument,
6175				sizeof(struct flock)) != B_OK)
6176			status = B_BAD_ADDRESS;
6177		if (status != B_OK)
6178			return status;
6179	}
6180
6181	switch (op) {
6182		case F_SETFD:
6183		{
6184			// Set file descriptor flags
6185
6186			// O_CLOEXEC is the only flag available at this time
6187			mutex_lock(&context->io_mutex);
6188			fd_set_close_on_exec(context, fd, (argument & FD_CLOEXEC) != 0);
6189			mutex_unlock(&context->io_mutex);
6190
6191			status = B_OK;
6192			break;
6193		}
6194
6195		case F_GETFD:
6196		{
6197			// Get file descriptor flags
6198			mutex_lock(&context->io_mutex);
6199			status = fd_close_on_exec(context, fd) ? FD_CLOEXEC : 0;
6200			mutex_unlock(&context->io_mutex);
6201			break;
6202		}
6203
6204		case F_SETFL:
6205			// Set file descriptor open mode
6206
6207			// we only accept changes to O_APPEND and O_NONBLOCK
6208			argument &= O_APPEND | O_NONBLOCK;
6209			if (descriptor->ops->fd_set_flags != NULL) {
6210				status = descriptor->ops->fd_set_flags(descriptor.Get(), argument);
6211			} else if (vnode != NULL && HAS_FS_CALL(vnode, set_flags)) {
6212				status = FS_CALL(vnode, set_flags, descriptor->cookie,
6213					(int)argument);
6214			} else
6215				status = B_UNSUPPORTED;
6216
6217			if (status == B_OK) {
6218				// update this descriptor's open_mode field
6219				descriptor->open_mode = (descriptor->open_mode
6220					& ~(O_APPEND | O_NONBLOCK)) | argument;
6221			}
6222
6223			break;
6224
6225		case F_GETFL:
6226			// Get file descriptor open mode
6227			status = descriptor->open_mode;
6228			break;
6229
6230		case F_DUPFD:
6231		case F_DUPFD_CLOEXEC:
6232		{
6233			status = new_fd_etc(context, descriptor.Get(), (int)argument);
6234			if (status >= 0) {
6235				mutex_lock(&context->io_mutex);
6236				fd_set_close_on_exec(context, status, op == F_DUPFD_CLOEXEC);
6237				mutex_unlock(&context->io_mutex);
6238
6239				atomic_add(&descriptor->ref_count, 1);
6240			}
6241			break;
6242		}
6243
6244		case F_GETLK:
6245			if (vnode != NULL) {
6246				struct flock normalizedLock;
6247
6248				memcpy(&normalizedLock, &flock, sizeof(struct flock));
6249				status = normalize_flock(descriptor.Get(), &normalizedLock);
6250				if (status != B_OK)
6251					break;
6252
6253				if (HAS_FS_CALL(vnode, test_lock)) {
6254					status = FS_CALL(vnode, test_lock, descriptor->cookie,
6255						&normalizedLock);
6256				} else
6257					status = test_advisory_lock(vnode, &normalizedLock);
6258				if (status == B_OK) {
6259					if (normalizedLock.l_type == F_UNLCK) {
6260						// no conflicting lock found, copy back the same struct
6261						// we were given except change type to F_UNLCK
6262						flock.l_type = F_UNLCK;
6263						if (kernel) {
6264							memcpy((struct flock*)argument, &flock,
6265								sizeof(struct flock));
6266						} else {
6267							status = user_memcpy((struct flock*)argument,
6268								&flock, sizeof(struct flock));
6269						}
6270					} else {
6271						// a conflicting lock was found, copy back its range and
6272						// type
6273						if (normalizedLock.l_len == OFF_MAX)
6274							normalizedLock.l_len = 0;
6275
6276						if (kernel) {
6277							memcpy((struct flock*)argument,
6278								&normalizedLock, sizeof(struct flock));
6279						} else {
6280							status = user_memcpy((struct flock*)argument,
6281								&normalizedLock, sizeof(struct flock));
6282						}
6283					}
6284				}
6285			} else
6286				status = B_BAD_VALUE;
6287			break;
6288
6289		case F_SETLK:
6290		case F_SETLKW:
6291			status = normalize_flock(descriptor.Get(), &flock);
6292			if (status != B_OK)
6293				break;
6294
6295			if (vnode == NULL) {
6296				status = B_BAD_VALUE;
6297			} else if (flock.l_type == F_UNLCK) {
6298				if (HAS_FS_CALL(vnode, release_lock)) {
6299					status = FS_CALL(vnode, release_lock, descriptor->cookie,
6300						&flock);
6301				} else {
6302					status = release_advisory_lock(vnode, context, NULL,
6303						&flock);
6304				}
6305			} else {
6306				// the open mode must match the lock type
6307				if (((descriptor->open_mode & O_RWMASK) == O_RDONLY
6308						&& flock.l_type == F_WRLCK)
6309					|| ((descriptor->open_mode & O_RWMASK) == O_WRONLY
6310						&& flock.l_type == F_RDLCK))
6311					status = B_FILE_ERROR;
6312				else {
6313					if (HAS_FS_CALL(vnode, acquire_lock)) {
6314						status = FS_CALL(vnode, acquire_lock,
6315							descriptor->cookie, &flock, op == F_SETLKW);
6316					} else {
6317						status = acquire_advisory_lock(vnode, context, NULL,
6318							&flock, op == F_SETLKW);
6319					}
6320				}
6321			}
6322			break;
6323
6324		// ToDo: add support for more ops?
6325
6326		default:
6327			status = B_BAD_VALUE;
6328	}
6329
6330	return status;
6331}
6332
6333
6334static status_t
6335common_sync(int fd, bool kernel)
6336{
6337	FUNCTION(("common_fsync: entry. fd %d kernel %d\n", fd, kernel));
6338
6339	struct vnode* vnode;
6340	FileDescriptorPutter descriptor(get_fd_and_vnode(fd, &vnode, kernel));
6341	if (!descriptor.IsSet())
6342		return B_FILE_ERROR;
6343
6344	status_t status;
6345	if (HAS_FS_CALL(vnode, fsync))
6346		status = FS_CALL_NO_PARAMS(vnode, fsync);
6347	else
6348		status = B_UNSUPPORTED;
6349
6350	return status;
6351}
6352
6353
6354static status_t
6355common_lock_node(int fd, bool kernel)
6356{
6357	struct vnode* vnode;
6358	FileDescriptorPutter descriptor(get_fd_and_vnode(fd, &vnode, kernel));
6359	if (!descriptor.IsSet())
6360		return B_FILE_ERROR;
6361
6362	status_t status = B_OK;
6363
6364	// We need to set the locking atomically - someone
6365	// else might set one at the same time
6366	if (atomic_pointer_test_and_set(&vnode->mandatory_locked_by,
6367			descriptor.Get(), (file_descriptor*)NULL) != NULL)
6368		status = B_BUSY;
6369
6370	return status;
6371}
6372
6373
6374static status_t
6375common_unlock_node(int fd, bool kernel)
6376{
6377	struct vnode* vnode;
6378	FileDescriptorPutter descriptor(get_fd_and_vnode(fd, &vnode, kernel));
6379	if (!descriptor.IsSet())
6380		return B_FILE_ERROR;
6381
6382	status_t status = B_OK;
6383
6384	// We need to set the locking atomically - someone
6385	// else might set one at the same time
6386	if (atomic_pointer_test_and_set(&vnode->mandatory_locked_by,
6387			(file_descriptor*)NULL, descriptor.Get()) != descriptor.Get())
6388		status = B_BAD_VALUE;
6389
6390	return status;
6391}
6392
6393
6394static status_t
6395common_preallocate(int fd, off_t offset, off_t length, bool kernel)
6396{
6397	if (offset < 0 || length == 0)
6398		return B_BAD_VALUE;
6399	if (offset > OFF_MAX - length)
6400		return B_FILE_TOO_LARGE;
6401
6402	struct vnode* vnode;
6403	FileDescriptorPutter descriptor(get_fd_and_vnode(fd, &vnode, kernel));
6404	if (!descriptor.IsSet() || (descriptor->open_mode & O_RWMASK) == O_RDONLY)
6405		return B_FILE_ERROR;
6406
6407	switch (vnode->Type() & S_IFMT) {
6408		case S_IFIFO:
6409		case S_IFSOCK:
6410			return ESPIPE;
6411
6412		case S_IFBLK:
6413		case S_IFCHR:
6414		case S_IFDIR:
6415		case S_IFLNK:
6416			return B_DEVICE_NOT_FOUND;
6417
6418		case S_IFREG:
6419			break;
6420	}
6421
6422	status_t status = B_OK;
6423	if (HAS_FS_CALL(vnode, preallocate)) {
6424		status = FS_CALL(vnode, preallocate, offset, length);
6425	} else {
6426		status = HAS_FS_CALL(vnode, write)
6427			? B_UNSUPPORTED : B_READ_ONLY_DEVICE;
6428	}
6429
6430	return status;
6431}
6432
6433
6434static status_t
6435common_read_link(int fd, char* path, char* buffer, size_t* _bufferSize,
6436	bool kernel)
6437{
6438	VnodePutter vnode;
6439	status_t status;
6440
6441	status = fd_and_path_to_vnode(fd, path, false, vnode, NULL, kernel);
6442	if (status != B_OK)
6443		return status;
6444
6445	if (HAS_FS_CALL(vnode, read_symlink)) {
6446		status = FS_CALL(vnode.Get(), read_symlink, buffer, _bufferSize);
6447	} else
6448		status = B_BAD_VALUE;
6449
6450	return status;
6451}
6452
6453
6454static status_t
6455common_create_symlink(int fd, char* path, const char* toPath, int mode,
6456	bool kernel)
6457{
6458	// path validity checks have to be in the calling function!
6459	char name[B_FILE_NAME_LENGTH];
6460	status_t status;
6461
6462	FUNCTION(("common_create_symlink(fd = %d, path = %s, toPath = %s, "
6463		"mode = %d, kernel = %d)\n", fd, path, toPath, mode, kernel));
6464
6465	VnodePutter vnode;
6466	status = fd_and_path_to_dir_vnode(fd, path, vnode, name, kernel);
6467	if (status != B_OK)
6468		return status;
6469
6470	if (HAS_FS_CALL(vnode, create_symlink))
6471		status = FS_CALL(vnode.Get(), create_symlink, name, toPath, mode);
6472	else {
6473		status = HAS_FS_CALL(vnode, write)
6474			? B_UNSUPPORTED : B_READ_ONLY_DEVICE;
6475	}
6476
6477	return status;
6478}
6479
6480
6481static status_t
6482common_create_link(int pathFD, char* path, int toFD, char* toPath,
6483	bool traverseLeafLink, bool kernel)
6484{
6485	// path validity checks have to be in the calling function!
6486
6487	FUNCTION(("common_create_link(path = %s, toPath = %s, kernel = %d)\n", path,
6488		toPath, kernel));
6489
6490	char name[B_FILE_NAME_LENGTH];
6491	VnodePutter directory;
6492	status_t status = fd_and_path_to_dir_vnode(pathFD, path, directory, name,
6493		kernel);
6494	if (status != B_OK)
6495		return status;
6496
6497	VnodePutter vnode;
6498	status = fd_and_path_to_vnode(toFD, toPath, traverseLeafLink, vnode, NULL,
6499		kernel);
6500	if (status != B_OK)
6501		return status;
6502
6503	if (directory->mount != vnode->mount)
6504		return B_CROSS_DEVICE_LINK;
6505
6506	if (HAS_FS_CALL(directory, link))
6507		status = FS_CALL(directory.Get(), link, name, vnode.Get());
6508	else
6509		status = B_READ_ONLY_DEVICE;
6510
6511	return status;
6512}
6513
6514
6515static status_t
6516common_unlink(int fd, char* path, bool kernel)
6517{
6518	char filename[B_FILE_NAME_LENGTH];
6519	status_t status;
6520
6521	FUNCTION(("common_unlink: fd: %d, path '%s', kernel %d\n", fd, path,
6522		kernel));
6523
6524	VnodePutter vnode;
6525	status = fd_and_path_to_dir_vnode(fd, path, vnode, filename, kernel);
6526	if (status < 0)
6527		return status;
6528
6529	if (HAS_FS_CALL(vnode, unlink))
6530		status = FS_CALL(vnode.Get(), unlink, filename);
6531	else
6532		status = B_READ_ONLY_DEVICE;
6533
6534	return status;
6535}
6536
6537
6538static status_t
6539common_access(int fd, char* path, int mode, bool effectiveUserGroup, bool kernel)
6540{
6541	status_t status;
6542
6543	// TODO: honor effectiveUserGroup argument
6544
6545	VnodePutter vnode;
6546	status = fd_and_path_to_vnode(fd, path, true, vnode, NULL, kernel);
6547	if (status != B_OK)
6548		return status;
6549
6550	if (HAS_FS_CALL(vnode, access))
6551		status = FS_CALL(vnode.Get(), access, mode);
6552	else
6553		status = B_OK;
6554
6555	return status;
6556}
6557
6558
6559static status_t
6560common_rename(int fd, char* path, int newFD, char* newPath, bool kernel)
6561{
6562	status_t status;
6563
6564	FUNCTION(("common_rename(fd = %d, path = %s, newFD = %d, newPath = %s, "
6565		"kernel = %d)\n", fd, path, newFD, newPath, kernel));
6566
6567	VnodePutter fromVnode;
6568	char fromName[B_FILE_NAME_LENGTH];
6569	status = fd_and_path_to_dir_vnode(fd, path, fromVnode, fromName, kernel);
6570	if (status != B_OK)
6571		return status;
6572
6573	VnodePutter toVnode;
6574	char toName[B_FILE_NAME_LENGTH];
6575	status = fd_and_path_to_dir_vnode(newFD, newPath, toVnode, toName, kernel);
6576	if (status != B_OK)
6577		return status;
6578
6579	if (fromVnode->device != toVnode->device)
6580		return B_CROSS_DEVICE_LINK;
6581
6582	if (fromVnode.Get() == toVnode.Get() && !strcmp(fromName, toName))
6583		return B_OK;
6584
6585	if (fromName[0] == '\0' || toName[0] == '\0'
6586		|| !strcmp(fromName, ".") || !strcmp(fromName, "..")
6587		|| !strcmp(toName, ".") || !strcmp(toName, "..")) {
6588		return B_BAD_VALUE;
6589	}
6590
6591	if (HAS_FS_CALL(fromVnode, rename))
6592		status = FS_CALL(fromVnode.Get(), rename, fromName, toVnode.Get(), toName);
6593	else
6594		status = B_READ_ONLY_DEVICE;
6595
6596	return status;
6597}
6598
6599
6600static status_t
6601common_read_stat(struct file_descriptor* descriptor, struct stat* stat)
6602{
6603	struct vnode* vnode = descriptor->u.vnode;
6604
6605	FUNCTION(("common_read_stat: stat %p\n", stat));
6606
6607	// TODO: remove this once all file systems properly set them!
6608	stat->st_crtim.tv_nsec = 0;
6609	stat->st_ctim.tv_nsec = 0;
6610	stat->st_mtim.tv_nsec = 0;
6611	stat->st_atim.tv_nsec = 0;
6612
6613	return vfs_stat_vnode(vnode, stat);
6614}
6615
6616
6617static status_t
6618common_write_stat(struct file_descriptor* descriptor, const struct stat* stat,
6619	int statMask)
6620{
6621	struct vnode* vnode = descriptor->u.vnode;
6622
6623	FUNCTION(("common_write_stat(vnode = %p, stat = %p, statMask = %d)\n",
6624		vnode, stat, statMask));
6625
6626	if ((descriptor->open_mode & O_RWMASK) == O_RDONLY
6627		&& (statMask & B_STAT_SIZE) != 0) {
6628		return B_BAD_VALUE;
6629	}
6630
6631	if (!HAS_FS_CALL(vnode, write_stat))
6632		return B_READ_ONLY_DEVICE;
6633
6634	return FS_CALL(vnode, write_stat, stat, statMask);
6635}
6636
6637
6638static status_t
6639common_path_read_stat(int fd, char* path, bool traverseLeafLink,
6640	struct stat* stat, bool kernel)
6641{
6642	FUNCTION(("common_path_read_stat: fd: %d, path '%s', stat %p,\n", fd, path,
6643		stat));
6644
6645	VnodePutter vnode;
6646	status_t status = fd_and_path_to_vnode(fd, path, traverseLeafLink, vnode,
6647		NULL, kernel);
6648	if (status != B_OK)
6649		return status;
6650
6651	status = vfs_stat_vnode(vnode.Get(), stat);
6652
6653	return status;
6654}
6655
6656
6657static status_t
6658common_path_write_stat(int fd, char* path, bool traverseLeafLink,
6659	const struct stat* stat, int statMask, bool kernel)
6660{
6661	FUNCTION(("common_write_stat: fd: %d, path '%s', stat %p, stat_mask %d, "
6662		"kernel %d\n", fd, path, stat, statMask, kernel));
6663
6664	VnodePutter vnode;
6665	status_t status = fd_and_path_to_vnode(fd, path, traverseLeafLink, vnode,
6666		NULL, kernel);
6667	if (status != B_OK)
6668		return status;
6669
6670	if (HAS_FS_CALL(vnode, write_stat))
6671		status = FS_CALL(vnode.Get(), write_stat, stat, statMask);
6672	else
6673		status = B_READ_ONLY_DEVICE;
6674
6675	return status;
6676}
6677
6678
6679static int
6680attr_dir_open(int fd, char* path, bool traverseLeafLink, bool kernel)
6681{
6682	FUNCTION(("attr_dir_open(fd = %d, path = '%s', kernel = %d)\n", fd, path,
6683		kernel));
6684
6685	VnodePutter vnode;
6686	status_t status = fd_and_path_to_vnode(fd, path, traverseLeafLink, vnode,
6687		NULL, kernel);
6688	if (status != B_OK)
6689		return status;
6690
6691	status = open_attr_dir_vnode(vnode.Get(), kernel);
6692	if (status >= 0)
6693		vnode.Detach();
6694
6695	return status;
6696}
6697
6698
6699static status_t
6700attr_dir_close(struct file_descriptor* descriptor)
6701{
6702	struct vnode* vnode = descriptor->u.vnode;
6703
6704	FUNCTION(("attr_dir_close(descriptor = %p)\n", descriptor));
6705
6706	if (HAS_FS_CALL(vnode, close_attr_dir))
6707		return FS_CALL(vnode, close_attr_dir, descriptor->cookie);
6708
6709	return B_OK;
6710}
6711
6712
6713static void
6714attr_dir_free_fd(struct file_descriptor* descriptor)
6715{
6716	struct vnode* vnode = descriptor->u.vnode;
6717
6718	if (vnode != NULL) {
6719		FS_CALL(vnode, free_attr_dir_cookie, descriptor->cookie);
6720		put_vnode(vnode);
6721	}
6722}
6723
6724
6725static status_t
6726attr_dir_read(struct io_context* ioContext, struct file_descriptor* descriptor,
6727	struct dirent* buffer, size_t bufferSize, uint32* _count)
6728{
6729	struct vnode* vnode = descriptor->u.vnode;
6730
6731	FUNCTION(("attr_dir_read(descriptor = %p)\n", descriptor));
6732
6733	if (HAS_FS_CALL(vnode, read_attr_dir))
6734		return FS_CALL(vnode, read_attr_dir, descriptor->cookie, buffer,
6735			bufferSize, _count);
6736
6737	return B_UNSUPPORTED;
6738}
6739
6740
6741static status_t
6742attr_dir_rewind(struct file_descriptor* descriptor)
6743{
6744	struct vnode* vnode = descriptor->u.vnode;
6745
6746	FUNCTION(("attr_dir_rewind(descriptor = %p)\n", descriptor));
6747
6748	if (HAS_FS_CALL(vnode, rewind_attr_dir))
6749		return FS_CALL(vnode, rewind_attr_dir, descriptor->cookie);
6750
6751	return B_UNSUPPORTED;
6752}
6753
6754
6755static int
6756attr_create(int fd, char* path, const char* name, uint32 type,
6757	int openMode, bool kernel)
6758{
6759	if (name == NULL || *name == '\0')
6760		return B_BAD_VALUE;
6761
6762	bool traverse = (openMode & (O_NOTRAVERSE | O_NOFOLLOW)) == 0;
6763	VnodePutter vnode;
6764	status_t status = fd_and_path_to_vnode(fd, path, traverse, vnode, NULL,
6765		kernel);
6766	if (status != B_OK)
6767		return status;
6768
6769	if ((openMode & O_NOFOLLOW) != 0 && S_ISLNK(vnode->Type()))
6770		return B_LINK_LIMIT;
6771
6772	if (!HAS_FS_CALL(vnode, create_attr))
6773		return B_READ_ONLY_DEVICE;
6774
6775	void* cookie;
6776	status = FS_CALL(vnode.Get(), create_attr, name, type, openMode, &cookie);
6777	if (status != B_OK)
6778		return status;
6779
6780	fd = get_new_fd(FDTYPE_ATTR, NULL, vnode.Get(), cookie, openMode, kernel);
6781	if (fd >= 0) {
6782		vnode.Detach();
6783		return fd;
6784	}
6785
6786	status = fd;
6787
6788	FS_CALL(vnode.Get(), close_attr, cookie);
6789	FS_CALL(vnode.Get(), free_attr_cookie, cookie);
6790
6791	FS_CALL(vnode.Get(), remove_attr, name);
6792
6793	return status;
6794}
6795
6796
6797static int
6798attr_open(int fd, char* path, const char* name, int openMode, bool kernel)
6799{
6800	if (name == NULL || *name == '\0')
6801		return B_BAD_VALUE;
6802
6803	bool traverse = (openMode & (O_NOTRAVERSE | O_NOFOLLOW)) == 0;
6804	VnodePutter vnode;
6805	status_t status = fd_and_path_to_vnode(fd, path, traverse, vnode, NULL,
6806		kernel);
6807	if (status != B_OK)
6808		return status;
6809
6810	if ((openMode & O_NOFOLLOW) != 0 && S_ISLNK(vnode->Type()))
6811		return B_LINK_LIMIT;
6812
6813	if (!HAS_FS_CALL(vnode, open_attr))
6814		return B_UNSUPPORTED;
6815
6816	void* cookie;
6817	status = FS_CALL(vnode.Get(), open_attr, name, openMode, &cookie);
6818	if (status != B_OK)
6819		return status;
6820
6821	// now we only need a file descriptor for this attribute and we're done
6822	fd = get_new_fd(FDTYPE_ATTR, NULL, vnode.Get(), cookie, openMode, kernel);
6823	if (fd >= 0) {
6824		vnode.Detach();
6825		return fd;
6826	}
6827
6828	status = fd;
6829
6830	FS_CALL(vnode.Get(), close_attr, cookie);
6831	FS_CALL(vnode.Get(), free_attr_cookie, cookie);
6832
6833	return status;
6834}
6835
6836
6837static status_t
6838attr_close(struct file_descriptor* descriptor)
6839{
6840	struct vnode* vnode = descriptor->u.vnode;
6841
6842	FUNCTION(("attr_close(descriptor = %p)\n", descriptor));
6843
6844	if (HAS_FS_CALL(vnode, close_attr))
6845		return FS_CALL(vnode, close_attr, descriptor->cookie);
6846
6847	return B_OK;
6848}
6849
6850
6851static void
6852attr_free_fd(struct file_descriptor* descriptor)
6853{
6854	struct vnode* vnode = descriptor->u.vnode;
6855
6856	if (vnode != NULL) {
6857		FS_CALL(vnode, free_attr_cookie, descriptor->cookie);
6858		put_vnode(vnode);
6859	}
6860}
6861
6862
6863static status_t
6864attr_read(struct file_descriptor* descriptor, off_t pos, void* buffer,
6865	size_t* length)
6866{
6867	struct vnode* vnode = descriptor->u.vnode;
6868
6869	FUNCTION(("attr_read: buf %p, pos %" B_PRIdOFF ", len %p = %ld\n", buffer,
6870		pos, length, *length));
6871
6872	if (!HAS_FS_CALL(vnode, read_attr))
6873		return B_UNSUPPORTED;
6874
6875	return FS_CALL(vnode, read_attr, descriptor->cookie, pos, buffer, length);
6876}
6877
6878
6879static status_t
6880attr_write(struct file_descriptor* descriptor, off_t pos, const void* buffer,
6881	size_t* length)
6882{
6883	struct vnode* vnode = descriptor->u.vnode;
6884
6885	FUNCTION(("attr_write: buf %p, pos %" B_PRIdOFF ", len %p\n", buffer, pos,
6886		length));
6887
6888	if (!HAS_FS_CALL(vnode, write_attr))
6889		return B_UNSUPPORTED;
6890
6891	return FS_CALL(vnode, write_attr, descriptor->cookie, pos, buffer, length);
6892}
6893
6894
6895static off_t
6896attr_seek(struct file_descriptor* descriptor, off_t pos, int seekType)
6897{
6898	off_t offset;
6899
6900	switch (seekType) {
6901		case SEEK_SET:
6902			offset = 0;
6903			break;
6904		case SEEK_CUR:
6905			offset = descriptor->pos;
6906			break;
6907		case SEEK_END:
6908		{
6909			struct vnode* vnode = descriptor->u.vnode;
6910			if (!HAS_FS_CALL(vnode, read_stat))
6911				return B_UNSUPPORTED;
6912
6913			struct stat stat;
6914			status_t status = FS_CALL(vnode, read_attr_stat, descriptor->cookie,
6915				&stat);
6916			if (status != B_OK)
6917				return status;
6918
6919			offset = stat.st_size;
6920			break;
6921		}
6922		default:
6923			return B_BAD_VALUE;
6924	}
6925
6926	// assumes off_t is 64 bits wide
6927	if (offset > 0 && LONGLONG_MAX - offset < pos)
6928		return B_BUFFER_OVERFLOW;
6929
6930	pos += offset;
6931	if (pos < 0)
6932		return B_BAD_VALUE;
6933
6934	return descriptor->pos = pos;
6935}
6936
6937
6938static status_t
6939attr_read_stat(struct file_descriptor* descriptor, struct stat* stat)
6940{
6941	struct vnode* vnode = descriptor->u.vnode;
6942
6943	FUNCTION(("attr_read_stat: stat 0x%p\n", stat));
6944
6945	if (!HAS_FS_CALL(vnode, read_attr_stat))
6946		return B_UNSUPPORTED;
6947
6948	return FS_CALL(vnode, read_attr_stat, descriptor->cookie, stat);
6949}
6950
6951
6952static status_t
6953attr_write_stat(struct file_descriptor* descriptor, const struct stat* stat,
6954	int statMask)
6955{
6956	struct vnode* vnode = descriptor->u.vnode;
6957
6958	FUNCTION(("attr_write_stat: stat = %p, statMask %d\n", stat, statMask));
6959
6960	if (!HAS_FS_CALL(vnode, write_attr_stat))
6961		return B_READ_ONLY_DEVICE;
6962
6963	return FS_CALL(vnode, write_attr_stat, descriptor->cookie, stat, statMask);
6964}
6965
6966
6967static status_t
6968attr_remove(int fd, const char* name, bool kernel)
6969{
6970	if (name == NULL || *name == '\0')
6971		return B_BAD_VALUE;
6972
6973	FUNCTION(("attr_remove: fd = %d, name = \"%s\", kernel %d\n", fd, name,
6974		kernel));
6975
6976	struct vnode* vnode;
6977	FileDescriptorPutter descriptor(get_fd_and_vnode(fd, &vnode, kernel));
6978	if (!descriptor.IsSet())
6979		return B_FILE_ERROR;
6980
6981	status_t status;
6982	if (HAS_FS_CALL(vnode, remove_attr))
6983		status = FS_CALL(vnode, remove_attr, name);
6984	else
6985		status = B_READ_ONLY_DEVICE;
6986
6987	return status;
6988}
6989
6990
6991static status_t
6992attr_rename(int fromFD, const char* fromName, int toFD, const char* toName,
6993	bool kernel)
6994{
6995	if (fromName == NULL || *fromName == '\0' || toName == NULL
6996		|| *toName == '\0')
6997		return B_BAD_VALUE;
6998
6999	FUNCTION(("attr_rename: from fd = %d, from name = \"%s\", to fd = %d, to "
7000		"name = \"%s\", kernel %d\n", fromFD, fromName, toFD, toName, kernel));
7001
7002	struct vnode* fromVnode;
7003	FileDescriptorPutter fromDescriptor(get_fd_and_vnode(fromFD, &fromVnode, kernel));
7004	if (!fromDescriptor.IsSet())
7005		return B_FILE_ERROR;
7006
7007	struct vnode* toVnode;
7008	FileDescriptorPutter toDescriptor(get_fd_and_vnode(toFD, &toVnode, kernel));
7009	if (!toDescriptor.IsSet())
7010		return B_FILE_ERROR;
7011
7012	// are the files on the same volume?
7013	if (fromVnode->device != toVnode->device)
7014		return B_CROSS_DEVICE_LINK;
7015
7016	status_t status;
7017	if (HAS_FS_CALL(fromVnode, rename_attr)) {
7018		status = FS_CALL(fromVnode, rename_attr, fromName, toVnode, toName);
7019	} else
7020		status = B_READ_ONLY_DEVICE;
7021
7022	return status;
7023}
7024
7025
7026static int
7027index_dir_open(dev_t mountID, bool kernel)
7028{
7029	struct fs_mount* mount;
7030	void* cookie;
7031
7032	FUNCTION(("index_dir_open(mountID = %" B_PRId32 ", kernel = %d)\n", mountID,
7033		kernel));
7034
7035	status_t status = get_mount(mountID, &mount);
7036	if (status != B_OK)
7037		return status;
7038
7039	if (!HAS_FS_MOUNT_CALL(mount, open_index_dir)) {
7040		status = B_UNSUPPORTED;
7041		goto error;
7042	}
7043
7044	status = FS_MOUNT_CALL(mount, open_index_dir, &cookie);
7045	if (status != B_OK)
7046		goto error;
7047
7048	// get fd for the index directory
7049	int fd;
7050	fd = get_new_fd(FDTYPE_INDEX_DIR, mount, NULL, cookie, O_CLOEXEC, kernel);
7051	if (fd >= 0)
7052		return fd;
7053
7054	// something went wrong
7055	FS_MOUNT_CALL(mount, close_index_dir, cookie);
7056	FS_MOUNT_CALL(mount, free_index_dir_cookie, cookie);
7057
7058	status = fd;
7059
7060error:
7061	put_mount(mount);
7062	return status;
7063}
7064
7065
7066static status_t
7067index_dir_close(struct file_descriptor* descriptor)
7068{
7069	struct fs_mount* mount = descriptor->u.mount;
7070
7071	FUNCTION(("index_dir_close(descriptor = %p)\n", descriptor));
7072
7073	if (HAS_FS_MOUNT_CALL(mount, close_index_dir))
7074		return FS_MOUNT_CALL(mount, close_index_dir, descriptor->cookie);
7075
7076	return B_OK;
7077}
7078
7079
7080static void
7081index_dir_free_fd(struct file_descriptor* descriptor)
7082{
7083	struct fs_mount* mount = descriptor->u.mount;
7084
7085	if (mount != NULL) {
7086		FS_MOUNT_CALL(mount, free_index_dir_cookie, descriptor->cookie);
7087		put_mount(mount);
7088	}
7089}
7090
7091
7092static status_t
7093index_dir_read(struct io_context* ioContext, struct file_descriptor* descriptor,
7094	struct dirent* buffer, size_t bufferSize, uint32* _count)
7095{
7096	struct fs_mount* mount = descriptor->u.mount;
7097
7098	if (HAS_FS_MOUNT_CALL(mount, read_index_dir)) {
7099		return FS_MOUNT_CALL(mount, read_index_dir, descriptor->cookie, buffer,
7100			bufferSize, _count);
7101	}
7102
7103	return B_UNSUPPORTED;
7104}
7105
7106
7107static status_t
7108index_dir_rewind(struct file_descriptor* descriptor)
7109{
7110	struct fs_mount* mount = descriptor->u.mount;
7111
7112	if (HAS_FS_MOUNT_CALL(mount, rewind_index_dir))
7113		return FS_MOUNT_CALL(mount, rewind_index_dir, descriptor->cookie);
7114
7115	return B_UNSUPPORTED;
7116}
7117
7118
7119static status_t
7120index_create(dev_t mountID, const char* name, uint32 type, uint32 flags,
7121	bool kernel)
7122{
7123	FUNCTION(("index_create(mountID = %" B_PRId32 ", name = %s, kernel = %d)\n",
7124		mountID, name, kernel));
7125
7126	struct fs_mount* mount;
7127	status_t status = get_mount(mountID, &mount);
7128	if (status != B_OK)
7129		return status;
7130
7131	if (!HAS_FS_MOUNT_CALL(mount, create_index)) {
7132		status = B_READ_ONLY_DEVICE;
7133		goto out;
7134	}
7135
7136	status = FS_MOUNT_CALL(mount, create_index, name, type, flags);
7137
7138out:
7139	put_mount(mount);
7140	return status;
7141}
7142
7143
7144#if 0
7145static status_t
7146index_read_stat(struct file_descriptor* descriptor, struct stat* stat)
7147{
7148	struct vnode* vnode = descriptor->u.vnode;
7149
7150	// ToDo: currently unused!
7151	FUNCTION(("index_read_stat: stat 0x%p\n", stat));
7152	if (!HAS_FS_CALL(vnode, read_index_stat))
7153		return B_UNSUPPORTED;
7154
7155	return B_UNSUPPORTED;
7156	//return FS_CALL(vnode, read_index_stat, descriptor->cookie, stat);
7157}
7158
7159
7160static void
7161index_free_fd(struct file_descriptor* descriptor)
7162{
7163	struct vnode* vnode = descriptor->u.vnode;
7164
7165	if (vnode != NULL) {
7166		FS_CALL(vnode, free_index_cookie, descriptor->cookie);
7167		put_vnode(vnode);
7168	}
7169}
7170#endif
7171
7172
7173static status_t
7174index_name_read_stat(dev_t mountID, const char* name, struct stat* stat,
7175	bool kernel)
7176{
7177	FUNCTION(("index_remove(mountID = %" B_PRId32 ", name = %s, kernel = %d)\n",
7178		mountID, name, kernel));
7179
7180	struct fs_mount* mount;
7181	status_t status = get_mount(mountID, &mount);
7182	if (status != B_OK)
7183		return status;
7184
7185	if (!HAS_FS_MOUNT_CALL(mount, read_index_stat)) {
7186		status = B_UNSUPPORTED;
7187		goto out;
7188	}
7189
7190	status = FS_MOUNT_CALL(mount, read_index_stat, name, stat);
7191
7192out:
7193	put_mount(mount);
7194	return status;
7195}
7196
7197
7198static status_t
7199index_remove(dev_t mountID, const char* name, bool kernel)
7200{
7201	FUNCTION(("index_remove(mountID = %" B_PRId32 ", name = %s, kernel = %d)\n",
7202		mountID, name, kernel));
7203
7204	struct fs_mount* mount;
7205	status_t status = get_mount(mountID, &mount);
7206	if (status != B_OK)
7207		return status;
7208
7209	if (!HAS_FS_MOUNT_CALL(mount, remove_index)) {
7210		status = B_READ_ONLY_DEVICE;
7211		goto out;
7212	}
7213
7214	status = FS_MOUNT_CALL(mount, remove_index, name);
7215
7216out:
7217	put_mount(mount);
7218	return status;
7219}
7220
7221
7222/*!	TODO: the query FS API is still the pretty much the same as in R5.
7223		It would be nice if the FS would find some more kernel support
7224		for them.
7225		For example, query parsing should be moved into the kernel.
7226*/
7227static int
7228query_open(dev_t device, const char* query, uint32 flags, port_id port,
7229	int32 token, bool kernel)
7230{
7231	struct fs_mount* mount;
7232	void* cookie;
7233
7234	FUNCTION(("query_open(device = %" B_PRId32 ", query = \"%s\", kernel = %d)\n",
7235		device, query, kernel));
7236
7237	status_t status = get_mount(device, &mount);
7238	if (status != B_OK)
7239		return status;
7240
7241	if (!HAS_FS_MOUNT_CALL(mount, open_query)) {
7242		status = B_UNSUPPORTED;
7243		goto error;
7244	}
7245
7246	status = FS_MOUNT_CALL(mount, open_query, query, flags, port, token,
7247		&cookie);
7248	if (status != B_OK)
7249		goto error;
7250
7251	// get fd for the index directory
7252	int fd;
7253	fd = get_new_fd(FDTYPE_QUERY, mount, NULL, cookie, O_CLOEXEC, kernel);
7254	if (fd >= 0)
7255		return fd;
7256
7257	status = fd;
7258
7259	// something went wrong
7260	FS_MOUNT_CALL(mount, close_query, cookie);
7261	FS_MOUNT_CALL(mount, free_query_cookie, cookie);
7262
7263error:
7264	put_mount(mount);
7265	return status;
7266}
7267
7268
7269static status_t
7270query_close(struct file_descriptor* descriptor)
7271{
7272	struct fs_mount* mount = descriptor->u.mount;
7273
7274	FUNCTION(("query_close(descriptor = %p)\n", descriptor));
7275
7276	if (HAS_FS_MOUNT_CALL(mount, close_query))
7277		return FS_MOUNT_CALL(mount, close_query, descriptor->cookie);
7278
7279	return B_OK;
7280}
7281
7282
7283static void
7284query_free_fd(struct file_descriptor* descriptor)
7285{
7286	struct fs_mount* mount = descriptor->u.mount;
7287
7288	if (mount != NULL) {
7289		FS_MOUNT_CALL(mount, free_query_cookie, descriptor->cookie);
7290		put_mount(mount);
7291	}
7292}
7293
7294
7295static status_t
7296query_read(struct io_context* ioContext, struct file_descriptor* descriptor,
7297	struct dirent* buffer, size_t bufferSize, uint32* _count)
7298{
7299	struct fs_mount* mount = descriptor->u.mount;
7300
7301	if (HAS_FS_MOUNT_CALL(mount, read_query)) {
7302		return FS_MOUNT_CALL(mount, read_query, descriptor->cookie, buffer,
7303			bufferSize, _count);
7304	}
7305
7306	return B_UNSUPPORTED;
7307}
7308
7309
7310static status_t
7311query_rewind(struct file_descriptor* descriptor)
7312{
7313	struct fs_mount* mount = descriptor->u.mount;
7314
7315	if (HAS_FS_MOUNT_CALL(mount, rewind_query))
7316		return FS_MOUNT_CALL(mount, rewind_query, descriptor->cookie);
7317
7318	return B_UNSUPPORTED;
7319}
7320
7321
7322//	#pragma mark - General File System functions
7323
7324
7325static dev_t
7326fs_mount(char* path, const char* device, const char* fsName, uint32 flags,
7327	const char* args, bool kernel)
7328{
7329	struct ::fs_mount* mount;
7330	status_t status = B_OK;
7331	fs_volume* volume = NULL;
7332	int32 layer = 0;
7333	Vnode* coveredNode = NULL;
7334
7335	FUNCTION(("fs_mount: path = '%s', device = '%s', fs_name = '%s', flags = %#"
7336		B_PRIx32 ", args = '%s'\n", path, device, fsName, flags, args));
7337
7338	// The path is always safe, we just have to make sure that fsName is
7339	// almost valid - we can't make any assumptions about args, though.
7340	// A NULL fsName is OK, if a device was given and the FS is not virtual.
7341	// We'll get it from the DDM later.
7342	if (fsName == NULL) {
7343		if (!device || flags & B_MOUNT_VIRTUAL_DEVICE)
7344			return B_BAD_VALUE;
7345	} else if (fsName[0] == '\0')
7346		return B_BAD_VALUE;
7347
7348	RecursiveLocker mountOpLocker(sMountOpLock);
7349
7350	// Helper to delete a newly created file device on failure.
7351	// Not exactly beautiful, but helps to keep the code below cleaner.
7352	struct FileDeviceDeleter {
7353		FileDeviceDeleter() : id(-1) {}
7354		~FileDeviceDeleter()
7355		{
7356			KDiskDeviceManager::Default()->DeleteFileDevice(id);
7357		}
7358
7359		partition_id id;
7360	} fileDeviceDeleter;
7361
7362	// If the file system is not a "virtual" one, the device argument should
7363	// point to a real file/device (if given at all).
7364	// get the partition
7365	KDiskDeviceManager* ddm = KDiskDeviceManager::Default();
7366	KPartition* partition = NULL;
7367	KPath normalizedDevice;
7368	bool newlyCreatedFileDevice = false;
7369
7370	if (!(flags & B_MOUNT_VIRTUAL_DEVICE) && device != NULL) {
7371		// normalize the device path
7372		status = normalizedDevice.SetTo(device, true);
7373		if (status != B_OK)
7374			return status;
7375
7376		// get a corresponding partition from the DDM
7377		partition = ddm->RegisterPartition(normalizedDevice.Path());
7378		if (partition == NULL) {
7379			// Partition not found: This either means, the user supplied
7380			// an invalid path, or the path refers to an image file. We try
7381			// to let the DDM create a file device for the path.
7382			partition_id deviceID = ddm->CreateFileDevice(
7383				normalizedDevice.Path(), &newlyCreatedFileDevice);
7384			if (deviceID >= 0) {
7385				partition = ddm->RegisterPartition(deviceID);
7386				if (newlyCreatedFileDevice)
7387					fileDeviceDeleter.id = deviceID;
7388			}
7389		}
7390
7391		if (!partition) {
7392			TRACE(("fs_mount(): Partition `%s' not found.\n",
7393				normalizedDevice.Path()));
7394			return B_ENTRY_NOT_FOUND;
7395		}
7396
7397		device = normalizedDevice.Path();
7398			// correct path to file device
7399	}
7400	PartitionRegistrar partitionRegistrar(partition, true);
7401
7402	// Write lock the partition's device. For the time being, we keep the lock
7403	// until we're done mounting -- not nice, but ensure, that no-one is
7404	// interfering.
7405	// TODO: Just mark the partition busy while mounting!
7406	KDiskDevice* diskDevice = NULL;
7407	if (partition) {
7408		diskDevice = ddm->WriteLockDevice(partition->Device()->ID());
7409		if (!diskDevice) {
7410			TRACE(("fs_mount(): Failed to lock disk device!\n"));
7411			return B_ERROR;
7412		}
7413	}
7414
7415	DeviceWriteLocker writeLocker(diskDevice, true);
7416		// this takes over the write lock acquired before
7417
7418	if (partition != NULL) {
7419		// make sure, that the partition is not busy
7420		if (partition->IsBusy()) {
7421			TRACE(("fs_mount(): Partition is busy.\n"));
7422			return B_BUSY;
7423		}
7424
7425		// if no FS name had been supplied, we get it from the partition
7426		if (fsName == NULL) {
7427			KDiskSystem* diskSystem = partition->DiskSystem();
7428			if (!diskSystem) {
7429				TRACE(("fs_mount(): No FS name was given, and the DDM didn't "
7430					"recognize it.\n"));
7431				return B_BAD_VALUE;
7432			}
7433
7434			if (!diskSystem->IsFileSystem()) {
7435				TRACE(("fs_mount(): No FS name was given, and the DDM found a "
7436					"partitioning system.\n"));
7437				return B_BAD_VALUE;
7438			}
7439
7440			// The disk system name will not change, and the KDiskSystem
7441			// object will not go away while the disk device is locked (and
7442			// the partition has a reference to it), so this is safe.
7443			fsName = diskSystem->Name();
7444		}
7445	}
7446
7447	mount = new(std::nothrow) (struct ::fs_mount);
7448	if (mount == NULL)
7449		return B_NO_MEMORY;
7450
7451	mount->device_name = strdup(device);
7452		// "device" can be NULL
7453
7454	status = mount->entry_cache.Init();
7455	if (status != B_OK)
7456		goto err1;
7457
7458	// initialize structure
7459	mount->id = sNextMountID++;
7460	mount->partition = NULL;
7461	mount->root_vnode = NULL;
7462	mount->covers_vnode = NULL;
7463	mount->unmounting = false;
7464	mount->owns_file_device = false;
7465	mount->volume = NULL;
7466
7467	// build up the volume(s)
7468	while (true) {
7469		char* layerFSName = get_file_system_name_for_layer(fsName, layer);
7470		if (layerFSName == NULL) {
7471			if (layer == 0) {
7472				status = B_NO_MEMORY;
7473				goto err1;
7474			}
7475
7476			break;
7477		}
7478		MemoryDeleter layerFSNameDeleter(layerFSName);
7479
7480		volume = (fs_volume*)malloc(sizeof(fs_volume));
7481		if (volume == NULL) {
7482			status = B_NO_MEMORY;
7483			goto err1;
7484		}
7485
7486		volume->id = mount->id;
7487		volume->partition = partition != NULL ? partition->ID() : -1;
7488		volume->layer = layer++;
7489		volume->private_volume = NULL;
7490		volume->ops = NULL;
7491		volume->sub_volume = NULL;
7492		volume->super_volume = NULL;
7493		volume->file_system = NULL;
7494		volume->file_system_name = NULL;
7495
7496		volume->file_system_name = get_file_system_name(layerFSName);
7497		if (volume->file_system_name == NULL) {
7498			status = B_NO_MEMORY;
7499			free(volume);
7500			goto err1;
7501		}
7502
7503		volume->file_system = get_file_system(layerFSName);
7504		if (volume->file_system == NULL) {
7505			status = B_DEVICE_NOT_FOUND;
7506			free(volume->file_system_name);
7507			free(volume);
7508			goto err1;
7509		}
7510
7511		if (mount->volume == NULL)
7512			mount->volume = volume;
7513		else {
7514			volume->super_volume = mount->volume;
7515			mount->volume->sub_volume = volume;
7516			mount->volume = volume;
7517		}
7518	}
7519
7520	// insert mount struct into list before we call FS's mount() function
7521	// so that vnodes can be created for this mount
7522	rw_lock_write_lock(&sMountLock);
7523	sMountsTable->Insert(mount);
7524	rw_lock_write_unlock(&sMountLock);
7525
7526	ino_t rootID;
7527
7528	if (!sRoot) {
7529		// we haven't mounted anything yet
7530		if (strcmp(path, "/") != 0) {
7531			status = B_ERROR;
7532			goto err2;
7533		}
7534
7535		status = mount->volume->file_system->mount(mount->volume, device, flags,
7536			args, &rootID);
7537		if (status != B_OK || mount->volume->ops == NULL)
7538			goto err2;
7539	} else {
7540		{
7541			VnodePutter temp;
7542			status = path_to_vnode(path, true, temp, NULL, kernel);
7543			coveredNode = temp.Detach();
7544		}
7545		if (status != B_OK)
7546			goto err2;
7547
7548		mount->covers_vnode = coveredNode;
7549
7550		// make sure covered_vnode is a directory
7551		if (!S_ISDIR(coveredNode->Type())) {
7552			status = B_NOT_A_DIRECTORY;
7553			goto err3;
7554		}
7555
7556		if (coveredNode->IsCovered()) {
7557			// this is already a covered vnode
7558			status = B_BUSY;
7559			goto err3;
7560		}
7561
7562		// mount it/them
7563		fs_volume* volume = mount->volume;
7564		while (volume) {
7565			status = volume->file_system->mount(volume, device, flags, args,
7566				&rootID);
7567			if (status != B_OK || volume->ops == NULL) {
7568				if (status == B_OK && volume->ops == NULL)
7569					panic("fs_mount: mount() succeeded but ops is NULL!");
7570				if (volume->sub_volume)
7571					goto err4;
7572				goto err3;
7573			}
7574
7575			volume = volume->super_volume;
7576		}
7577
7578		volume = mount->volume;
7579		while (volume) {
7580			if (volume->ops->all_layers_mounted != NULL)
7581				volume->ops->all_layers_mounted(volume);
7582			volume = volume->super_volume;
7583		}
7584	}
7585
7586	// the root node is supposed to be owned by the file system - it must
7587	// exist at this point
7588	rw_lock_write_lock(&sVnodeLock);
7589	mount->root_vnode = lookup_vnode(mount->id, rootID);
7590	if (mount->root_vnode == NULL || mount->root_vnode->ref_count != 1) {
7591		panic("fs_mount: file system does not own its root node!\n");
7592		status = B_ERROR;
7593		rw_lock_write_unlock(&sVnodeLock);
7594		goto err4;
7595	}
7596
7597	// set up the links between the root vnode and the vnode it covers
7598	if (coveredNode != NULL) {
7599		if (coveredNode->IsCovered()) {
7600			// the vnode is covered now
7601			status = B_BUSY;
7602			rw_lock_write_unlock(&sVnodeLock);
7603			goto err4;
7604		}
7605
7606		mount->root_vnode->covers = coveredNode;
7607		mount->root_vnode->SetCovering(true);
7608
7609		coveredNode->covered_by = mount->root_vnode;
7610		coveredNode->SetCovered(true);
7611	}
7612	rw_lock_write_unlock(&sVnodeLock);
7613
7614	if (!sRoot) {
7615		sRoot = mount->root_vnode;
7616		mutex_lock(&sIOContextRootLock);
7617		get_current_io_context(true)->root = sRoot;
7618		mutex_unlock(&sIOContextRootLock);
7619		inc_vnode_ref_count(sRoot);
7620	}
7621
7622	// supply the partition (if any) with the mount cookie and mark it mounted
7623	if (partition) {
7624		partition->SetMountCookie(mount->volume->private_volume);
7625		partition->SetVolumeID(mount->id);
7626
7627		// keep a partition reference as long as the partition is mounted
7628		partitionRegistrar.Detach();
7629		mount->partition = partition;
7630		mount->owns_file_device = newlyCreatedFileDevice;
7631		fileDeviceDeleter.id = -1;
7632	}
7633
7634	notify_mount(mount->id,
7635		coveredNode != NULL ? coveredNode->device : -1,
7636		coveredNode ? coveredNode->id : -1);
7637
7638	return mount->id;
7639
7640err4:
7641	FS_MOUNT_CALL_NO_PARAMS(mount, unmount);
7642err3:
7643	if (coveredNode != NULL)
7644		put_vnode(coveredNode);
7645err2:
7646	rw_lock_write_lock(&sMountLock);
7647	sMountsTable->Remove(mount);
7648	rw_lock_write_unlock(&sMountLock);
7649err1:
7650	delete mount;
7651
7652	return status;
7653}
7654
7655
7656static status_t
7657fs_unmount(char* path, dev_t mountID, uint32 flags, bool kernel)
7658{
7659	struct fs_mount* mount;
7660	status_t err;
7661
7662	FUNCTION(("fs_unmount(path '%s', dev %" B_PRId32 ", kernel %d\n", path,
7663		mountID, kernel));
7664
7665	VnodePutter pathVnode;
7666	if (path != NULL) {
7667		err = path_to_vnode(path, true, pathVnode, NULL, kernel);
7668		if (err != B_OK)
7669			return B_ENTRY_NOT_FOUND;
7670	}
7671
7672	RecursiveLocker mountOpLocker(sMountOpLock);
7673	ReadLocker mountLocker(sMountLock);
7674
7675	mount = find_mount(path != NULL ? pathVnode->device : mountID);
7676	if (mount == NULL) {
7677		panic("fs_unmount: find_mount() failed on root vnode @%p of mount\n",
7678			pathVnode.Get());
7679	}
7680
7681	mountLocker.Unlock();
7682
7683	if (path != NULL) {
7684		if (mount->root_vnode != pathVnode.Get()) {
7685			// not mountpoint
7686			return B_BAD_VALUE;
7687		}
7688
7689		pathVnode.Unset();
7690	}
7691
7692	// if the volume is associated with a partition, lock the device of the
7693	// partition as long as we are unmounting
7694	KDiskDeviceManager* ddm = KDiskDeviceManager::Default();
7695	KPartition* partition = mount->partition;
7696	KDiskDevice* diskDevice = NULL;
7697	if (partition != NULL) {
7698		if (partition->Device() == NULL) {
7699			dprintf("fs_unmount(): There is no device!\n");
7700			return B_ERROR;
7701		}
7702		diskDevice = ddm->WriteLockDevice(partition->Device()->ID());
7703		if (!diskDevice) {
7704			TRACE(("fs_unmount(): Failed to lock disk device!\n"));
7705			return B_ERROR;
7706		}
7707	}
7708	DeviceWriteLocker writeLocker(diskDevice, true);
7709
7710	// make sure, that the partition is not busy
7711	if (partition != NULL) {
7712		if ((flags & B_UNMOUNT_BUSY_PARTITION) == 0 && partition->IsBusy()) {
7713			dprintf("fs_unmount(): Partition is busy.\n");
7714			return B_BUSY;
7715		}
7716	}
7717
7718	// grab the vnode master mutex to keep someone from creating
7719	// a vnode while we're figuring out if we can continue
7720	WriteLocker vnodesWriteLocker(&sVnodeLock);
7721
7722	bool disconnectedDescriptors = false;
7723
7724	while (true) {
7725		bool busy = false;
7726
7727		// cycle through the list of vnodes associated with this mount and
7728		// make sure all of them are not busy or have refs on them
7729		VnodeList::Iterator iterator = mount->vnodes.GetIterator();
7730		while (struct vnode* vnode = iterator.Next()) {
7731			if (vnode->IsBusy()) {
7732				dprintf("fs_unmount(): inode %" B_PRIdINO " is busy\n", vnode->id);
7733				busy = true;
7734				break;
7735			}
7736
7737			// check the vnode's ref count -- subtract additional references for
7738			// covering
7739			int32 refCount = vnode->ref_count;
7740			if (vnode->covers != NULL)
7741				refCount--;
7742			if (vnode->covered_by != NULL)
7743				refCount--;
7744
7745			if (refCount != 0) {
7746				dprintf("fs_unmount(): inode %" B_PRIdINO " is still referenced\n", vnode->id);
7747				// there are still vnodes in use on this mount, so we cannot
7748				// unmount yet
7749				busy = true;
7750				break;
7751			}
7752		}
7753
7754		if (!busy)
7755			break;
7756
7757		if ((flags & B_FORCE_UNMOUNT) == 0)
7758			return B_BUSY;
7759
7760		if (disconnectedDescriptors) {
7761			// wait a bit until the last access is finished, and then try again
7762			vnodesWriteLocker.Unlock();
7763			snooze(100000);
7764			// TODO: if there is some kind of bug that prevents the ref counts
7765			// from getting back to zero, this will fall into an endless loop...
7766			vnodesWriteLocker.Lock();
7767			continue;
7768		}
7769
7770		// the file system is still busy - but we're forced to unmount it,
7771		// so let's disconnect all open file descriptors
7772
7773		mount->unmounting = true;
7774			// prevent new vnodes from being created
7775
7776		vnodesWriteLocker.Unlock();
7777
7778		disconnect_mount_or_vnode_fds(mount, NULL);
7779		disconnectedDescriptors = true;
7780
7781		vnodesWriteLocker.Lock();
7782	}
7783
7784	// We can safely continue. Mark all of the vnodes busy and this mount
7785	// structure in unmounting state. Also undo the vnode covers/covered_by
7786	// links.
7787	mount->unmounting = true;
7788
7789	VnodeList::Iterator iterator = mount->vnodes.GetIterator();
7790	while (struct vnode* vnode = iterator.Next()) {
7791		// Remove all covers/covered_by links from other mounts' nodes to this
7792		// vnode and adjust the node ref count accordingly. We will release the
7793		// references to the external vnodes below.
7794		if (Vnode* coveredNode = vnode->covers) {
7795			if (Vnode* coveringNode = vnode->covered_by) {
7796				// We have both covered and covering vnodes, so just remove us
7797				// from the chain.
7798				coveredNode->covered_by = coveringNode;
7799				coveringNode->covers = coveredNode;
7800				vnode->ref_count -= 2;
7801
7802				vnode->covered_by = NULL;
7803				vnode->covers = NULL;
7804				vnode->SetCovering(false);
7805				vnode->SetCovered(false);
7806			} else {
7807				// We only have a covered vnode. Remove its link to us.
7808				coveredNode->covered_by = NULL;
7809				coveredNode->SetCovered(false);
7810				vnode->ref_count--;
7811
7812				// If the other node is an external vnode, we keep its link
7813				// link around so we can put the reference later on. Otherwise
7814				// we get rid of it right now.
7815				if (coveredNode->mount == mount) {
7816					vnode->covers = NULL;
7817					coveredNode->ref_count--;
7818				}
7819			}
7820		} else if (Vnode* coveringNode = vnode->covered_by) {
7821			// We only have a covering vnode. Remove its link to us.
7822			coveringNode->covers = NULL;
7823			coveringNode->SetCovering(false);
7824			vnode->ref_count--;
7825
7826			// If the other node is an external vnode, we keep its link
7827			// link around so we can put the reference later on. Otherwise
7828			// we get rid of it right now.
7829			if (coveringNode->mount == mount) {
7830				vnode->covered_by = NULL;
7831				coveringNode->ref_count--;
7832			}
7833		}
7834
7835		vnode->SetBusy(true);
7836		vnode_to_be_freed(vnode);
7837	}
7838
7839	vnodesWriteLocker.Unlock();
7840
7841	// Free all vnodes associated with this mount.
7842	// They will be removed from the mount list by free_vnode(), so
7843	// we don't have to do this.
7844	while (struct vnode* vnode = mount->vnodes.Head()) {
7845		// Put the references to external covered/covering vnodes we kept above.
7846		if (Vnode* coveredNode = vnode->covers)
7847			put_vnode(coveredNode);
7848		if (Vnode* coveringNode = vnode->covered_by)
7849			put_vnode(coveringNode);
7850
7851		free_vnode(vnode, false);
7852	}
7853
7854	// remove the mount structure from the hash table
7855	rw_lock_write_lock(&sMountLock);
7856	sMountsTable->Remove(mount);
7857	rw_lock_write_unlock(&sMountLock);
7858
7859	mountOpLocker.Unlock();
7860
7861	FS_MOUNT_CALL_NO_PARAMS(mount, unmount);
7862	notify_unmount(mount->id);
7863
7864	// dereference the partition and mark it unmounted
7865	if (partition) {
7866		partition->SetVolumeID(-1);
7867		partition->SetMountCookie(NULL);
7868
7869		if (mount->owns_file_device)
7870			KDiskDeviceManager::Default()->DeleteFileDevice(partition->ID());
7871		partition->Unregister();
7872	}
7873
7874	delete mount;
7875	return B_OK;
7876}
7877
7878
7879static status_t
7880fs_sync(dev_t device)
7881{
7882	struct fs_mount* mount;
7883	status_t status = get_mount(device, &mount);
7884	if (status != B_OK)
7885		return status;
7886
7887	struct vnode marker;
7888	memset(&marker, 0, sizeof(marker));
7889	marker.SetBusy(true);
7890	marker.SetRemoved(true);
7891
7892	// First, synchronize all file caches
7893
7894	while (true) {
7895		WriteLocker locker(sVnodeLock);
7896			// Note: That's the easy way. Which is probably OK for sync(),
7897			// since it's a relatively rare call and doesn't need to allow for
7898			// a lot of concurrency. Using a read lock would be possible, but
7899			// also more involved, since we had to lock the individual nodes
7900			// and take care of the locking order, which we might not want to
7901			// do while holding fs_mount::lock.
7902
7903		// synchronize access to vnode list
7904		mutex_lock(&mount->lock);
7905
7906		struct vnode* vnode;
7907		if (!marker.IsRemoved()) {
7908			vnode = mount->vnodes.GetNext(&marker);
7909			mount->vnodes.Remove(&marker);
7910			marker.SetRemoved(true);
7911		} else
7912			vnode = mount->vnodes.First();
7913
7914		while (vnode != NULL && (vnode->cache == NULL
7915			|| vnode->IsRemoved() || vnode->IsBusy())) {
7916			// TODO: we could track writes (and writable mapped vnodes)
7917			//	and have a simple flag that we could test for here
7918			vnode = mount->vnodes.GetNext(vnode);
7919		}
7920
7921		if (vnode != NULL) {
7922			// insert marker vnode again
7923			mount->vnodes.InsertBefore(mount->vnodes.GetNext(vnode), &marker);
7924			marker.SetRemoved(false);
7925		}
7926
7927		mutex_unlock(&mount->lock);
7928
7929		if (vnode == NULL)
7930			break;
7931
7932		vnode = lookup_vnode(mount->id, vnode->id);
7933		if (vnode == NULL || vnode->IsBusy())
7934			continue;
7935
7936		if (vnode->ref_count == 0) {
7937			// this vnode has been unused before
7938			vnode_used(vnode);
7939		}
7940		inc_vnode_ref_count(vnode);
7941
7942		locker.Unlock();
7943
7944		if (vnode->cache != NULL && !vnode->IsRemoved())
7945			vnode->cache->WriteModified();
7946
7947		put_vnode(vnode);
7948	}
7949
7950	// Let the file systems do their synchronizing work
7951	if (HAS_FS_MOUNT_CALL(mount, sync))
7952		status = FS_MOUNT_CALL_NO_PARAMS(mount, sync);
7953
7954	// Finally, flush the underlying device's write cache (if possible.)
7955	if (mount->partition != NULL && mount->partition->Device() != NULL)
7956		ioctl(mount->partition->Device()->FD(), B_FLUSH_DRIVE_CACHE);
7957
7958	put_mount(mount);
7959	return status;
7960}
7961
7962
7963static status_t
7964fs_read_info(dev_t device, struct fs_info* info)
7965{
7966	struct fs_mount* mount;
7967	status_t status = get_mount(device, &mount);
7968	if (status != B_OK)
7969		return status;
7970
7971	memset(info, 0, sizeof(struct fs_info));
7972
7973	if (HAS_FS_MOUNT_CALL(mount, read_fs_info))
7974		status = FS_MOUNT_CALL(mount, read_fs_info, info);
7975
7976	// fill in info the file system doesn't (have to) know about
7977	if (status == B_OK) {
7978		info->dev = mount->id;
7979		info->root = mount->root_vnode->id;
7980
7981		fs_volume* volume = mount->volume;
7982		while (volume->super_volume != NULL)
7983			volume = volume->super_volume;
7984
7985		strlcpy(info->fsh_name, volume->file_system_name,
7986			sizeof(info->fsh_name));
7987		if (mount->device_name != NULL) {
7988			strlcpy(info->device_name, mount->device_name,
7989				sizeof(info->device_name));
7990		}
7991	}
7992
7993	// if the call is not supported by the file system, there are still
7994	// the parts that we filled out ourselves
7995
7996	put_mount(mount);
7997	return status;
7998}
7999
8000
8001static status_t
8002fs_write_info(dev_t device, const struct fs_info* info, int mask)
8003{
8004	struct fs_mount* mount;
8005	status_t status = get_mount(device, &mount);
8006	if (status != B_OK)
8007		return status;
8008
8009	if (HAS_FS_MOUNT_CALL(mount, write_fs_info))
8010		status = FS_MOUNT_CALL(mount, write_fs_info, info, mask);
8011	else
8012		status = B_READ_ONLY_DEVICE;
8013
8014	put_mount(mount);
8015	return status;
8016}
8017
8018
8019static dev_t
8020fs_next_device(int32* _cookie)
8021{
8022	struct fs_mount* mount = NULL;
8023	dev_t device = *_cookie;
8024
8025	rw_lock_read_lock(&sMountLock);
8026
8027	// Since device IDs are assigned sequentially, this algorithm
8028	// does work good enough. It makes sure that the device list
8029	// returned is sorted, and that no device is skipped when an
8030	// already visited device got unmounted.
8031
8032	while (device < sNextMountID) {
8033		mount = find_mount(device++);
8034		if (mount != NULL && mount->volume->private_volume != NULL)
8035			break;
8036	}
8037
8038	*_cookie = device;
8039
8040	if (mount != NULL)
8041		device = mount->id;
8042	else
8043		device = B_BAD_VALUE;
8044
8045	rw_lock_read_unlock(&sMountLock);
8046
8047	return device;
8048}
8049
8050
8051ssize_t
8052fs_read_attr(int fd, const char *attribute, uint32 type, off_t pos,
8053	void *buffer, size_t readBytes)
8054{
8055	int attrFD = attr_open(fd, NULL, attribute, O_RDONLY, true);
8056	if (attrFD < 0)
8057		return attrFD;
8058
8059	ssize_t bytesRead = _kern_read(attrFD, pos, buffer, readBytes);
8060
8061	_kern_close(attrFD);
8062
8063	return bytesRead;
8064}
8065
8066
8067static status_t
8068get_cwd(char* buffer, size_t size, bool kernel)
8069{
8070	// Get current working directory from io context
8071	struct io_context* context = get_current_io_context(kernel);
8072	status_t status;
8073
8074	FUNCTION(("vfs_get_cwd: buf %p, size %ld\n", buffer, size));
8075
8076	mutex_lock(&context->io_mutex);
8077
8078	struct vnode* vnode = context->cwd;
8079	if (vnode)
8080		inc_vnode_ref_count(vnode);
8081
8082	mutex_unlock(&context->io_mutex);
8083
8084	if (vnode) {
8085		status = dir_vnode_to_path(vnode, buffer, size, kernel);
8086		put_vnode(vnode);
8087	} else
8088		status = B_ERROR;
8089
8090	return status;
8091}
8092
8093
8094static status_t
8095set_cwd(int fd, char* path, bool kernel)
8096{
8097	struct io_context* context;
8098	struct vnode* oldDirectory;
8099
8100	FUNCTION(("set_cwd: path = \'%s\'\n", path));
8101
8102	// Get vnode for passed path, and bail if it failed
8103	VnodePutter vnode;
8104	status_t status = fd_and_path_to_vnode(fd, path, true, vnode, NULL, kernel);
8105	if (status < 0)
8106		return status;
8107
8108	if (!S_ISDIR(vnode->Type())) {
8109		// nope, can't cwd to here
8110		return B_NOT_A_DIRECTORY;
8111	}
8112
8113	// We need to have the permission to enter the directory, too
8114	if (HAS_FS_CALL(vnode, access)) {
8115		status = FS_CALL(vnode.Get(), access, X_OK);
8116		if (status != B_OK)
8117			return status;
8118	}
8119
8120	// Get current io context and lock
8121	context = get_current_io_context(kernel);
8122	mutex_lock(&context->io_mutex);
8123
8124	// save the old current working directory first
8125	oldDirectory = context->cwd;
8126	context->cwd = vnode.Detach();
8127
8128	mutex_unlock(&context->io_mutex);
8129
8130	if (oldDirectory)
8131		put_vnode(oldDirectory);
8132
8133	return B_NO_ERROR;
8134}
8135
8136
8137static status_t
8138user_copy_name(char* to, const char* from, size_t length)
8139{
8140	ssize_t len = user_strlcpy(to, from, length);
8141	if (len < 0)
8142		return len;
8143	if (len >= (ssize_t)length)
8144		return B_NAME_TOO_LONG;
8145	return B_OK;
8146}
8147
8148
8149//	#pragma mark - kernel mirrored syscalls
8150
8151
8152dev_t
8153_kern_mount(const char* path, const char* device, const char* fsName,
8154	uint32 flags, const char* args, size_t argsLength)
8155{
8156	KPath pathBuffer(path);
8157	if (pathBuffer.InitCheck() != B_OK)
8158		return B_NO_MEMORY;
8159
8160	return fs_mount(pathBuffer.LockBuffer(), device, fsName, flags, args, true);
8161}
8162
8163
8164status_t
8165_kern_unmount(const char* path, uint32 flags)
8166{
8167	KPath pathBuffer(path);
8168	if (pathBuffer.InitCheck() != B_OK)
8169		return B_NO_MEMORY;
8170
8171	return fs_unmount(pathBuffer.LockBuffer(), -1, flags, true);
8172}
8173
8174
8175status_t
8176_kern_read_fs_info(dev_t device, struct fs_info* info)
8177{
8178	if (info == NULL)
8179		return B_BAD_VALUE;
8180
8181	return fs_read_info(device, info);
8182}
8183
8184
8185status_t
8186_kern_write_fs_info(dev_t device, const struct fs_info* info, int mask)
8187{
8188	if (info == NULL)
8189		return B_BAD_VALUE;
8190
8191	return fs_write_info(device, info, mask);
8192}
8193
8194
8195status_t
8196_kern_sync(void)
8197{
8198	// Note: _kern_sync() is also called from _user_sync()
8199	int32 cookie = 0;
8200	dev_t device;
8201	while ((device = next_dev(&cookie)) >= 0) {
8202		status_t status = fs_sync(device);
8203		if (status != B_OK && status != B_BAD_VALUE) {
8204			dprintf("sync: device %" B_PRIdDEV " couldn't sync: %s\n", device,
8205				strerror(status));
8206		}
8207	}
8208
8209	return B_OK;
8210}
8211
8212
8213dev_t
8214_kern_next_device(int32* _cookie)
8215{
8216	return fs_next_device(_cookie);
8217}
8218
8219
8220status_t
8221_kern_get_next_fd_info(team_id teamID, uint32* _cookie, fd_info* info,
8222	size_t infoSize)
8223{
8224	if (infoSize != sizeof(fd_info))
8225		return B_BAD_VALUE;
8226
8227	// get the team
8228	Team* team = Team::Get(teamID);
8229	if (team == NULL)
8230		return B_BAD_TEAM_ID;
8231	BReference<Team> teamReference(team, true);
8232
8233	// now that we have a team reference, its I/O context won't go away
8234	io_context* context = team->io_context;
8235	MutexLocker contextLocker(context->io_mutex);
8236
8237	uint32 slot = *_cookie;
8238
8239	struct file_descriptor* descriptor;
8240	while (slot < context->table_size
8241		&& (descriptor = context->fds[slot]) == NULL) {
8242		slot++;
8243	}
8244
8245	if (slot >= context->table_size)
8246		return B_ENTRY_NOT_FOUND;
8247
8248	info->number = slot;
8249	info->open_mode = descriptor->open_mode;
8250
8251	struct vnode* vnode = fd_vnode(descriptor);
8252	if (vnode != NULL) {
8253		info->device = vnode->device;
8254		info->node = vnode->id;
8255	} else if (descriptor->u.mount != NULL) {
8256		info->device = descriptor->u.mount->id;
8257		info->node = -1;
8258	}
8259
8260	*_cookie = slot + 1;
8261	return B_OK;
8262}
8263
8264
8265int
8266_kern_open_entry_ref(dev_t device, ino_t inode, const char* name, int openMode,
8267	int perms)
8268{
8269	if ((openMode & O_CREAT) != 0) {
8270		return file_create_entry_ref(device, inode, name, openMode, perms,
8271			true);
8272	}
8273
8274	return file_open_entry_ref(device, inode, name, openMode, true);
8275}
8276
8277
8278/*!	\brief Opens a node specified by a FD + path pair.
8279
8280	At least one of \a fd and \a path must be specified.
8281	If only \a fd is given, the function opens the node identified by this
8282	FD. If only a path is given, this path is opened. If both are given and
8283	the path is absolute, \a fd is ignored; a relative path is reckoned off
8284	of the directory (!) identified by \a fd.
8285
8286	\param fd The FD. May be < 0.
8287	\param path The absolute or relative path. May be \c NULL.
8288	\param openMode The open mode.
8289	\return A FD referring to the newly opened node, or an error code,
8290			if an error occurs.
8291*/
8292int
8293_kern_open(int fd, const char* path, int openMode, int perms)
8294{
8295	KPath pathBuffer(path, KPath::LAZY_ALLOC);
8296	if (pathBuffer.InitCheck() != B_OK)
8297		return B_NO_MEMORY;
8298
8299	if ((openMode & O_CREAT) != 0)
8300		return file_create(fd, pathBuffer.LockBuffer(), openMode, perms, true);
8301
8302	return file_open(fd, pathBuffer.LockBuffer(), openMode, true);
8303}
8304
8305
8306/*!	\brief Opens a directory specified by entry_ref or node_ref.
8307
8308	The supplied name may be \c NULL, in which case directory identified
8309	by \a device and \a inode will be opened. Otherwise \a device and
8310	\a inode identify the parent directory of the directory to be opened
8311	and \a name its entry name.
8312
8313	\param device If \a name is specified the ID of the device the parent
8314		   directory of the directory to be opened resides on, otherwise
8315		   the device of the directory itself.
8316	\param inode If \a name is specified the node ID of the parent
8317		   directory of the directory to be opened, otherwise node ID of the
8318		   directory itself.
8319	\param name The entry name of the directory to be opened. If \c NULL,
8320		   the \a device + \a inode pair identify the node to be opened.
8321	\return The FD of the newly opened directory or an error code, if
8322			something went wrong.
8323*/
8324int
8325_kern_open_dir_entry_ref(dev_t device, ino_t inode, const char* name)
8326{
8327	return dir_open_entry_ref(device, inode, name, true);
8328}
8329
8330
8331/*!	\brief Opens a directory specified by a FD + path pair.
8332
8333	At least one of \a fd and \a path must be specified.
8334	If only \a fd is given, the function opens the directory identified by this
8335	FD. If only a path is given, this path is opened. If both are given and
8336	the path is absolute, \a fd is ignored; a relative path is reckoned off
8337	of the directory (!) identified by \a fd.
8338
8339	\param fd The FD. May be < 0.
8340	\param path The absolute or relative path. May be \c NULL.
8341	\return A FD referring to the newly opened directory, or an error code,
8342			if an error occurs.
8343*/
8344int
8345_kern_open_dir(int fd, const char* path)
8346{
8347	KPath pathBuffer(path, KPath::LAZY_ALLOC);
8348	if (pathBuffer.InitCheck() != B_OK)
8349		return B_NO_MEMORY;
8350
8351	return dir_open(fd, pathBuffer.LockBuffer(), true);
8352}
8353
8354
8355status_t
8356_kern_fcntl(int fd, int op, size_t argument)
8357{
8358	return common_fcntl(fd, op, argument, true);
8359}
8360
8361
8362status_t
8363_kern_fsync(int fd)
8364{
8365	return common_sync(fd, true);
8366}
8367
8368
8369status_t
8370_kern_lock_node(int fd)
8371{
8372	return common_lock_node(fd, true);
8373}
8374
8375
8376status_t
8377_kern_unlock_node(int fd)
8378{
8379	return common_unlock_node(fd, true);
8380}
8381
8382
8383status_t
8384_kern_preallocate(int fd, off_t offset, off_t length)
8385{
8386	return common_preallocate(fd, offset, length, true);
8387}
8388
8389
8390status_t
8391_kern_create_dir_entry_ref(dev_t device, ino_t inode, const char* name,
8392	int perms)
8393{
8394	return dir_create_entry_ref(device, inode, name, perms, true);
8395}
8396
8397
8398/*!	\brief Creates a directory specified by a FD + path pair.
8399
8400	\a path must always be specified (it contains the name of the new directory
8401	at least). If only a path is given, this path identifies the location at
8402	which the directory shall be created. If both \a fd and \a path are given
8403	and the path is absolute, \a fd is ignored; a relative path is reckoned off
8404	of the directory (!) identified by \a fd.
8405
8406	\param fd The FD. May be < 0.
8407	\param path The absolute or relative path. Must not be \c NULL.
8408	\param perms The access permissions the new directory shall have.
8409	\return \c B_OK, if the directory has been created successfully, another
8410			error code otherwise.
8411*/
8412status_t
8413_kern_create_dir(int fd, const char* path, int perms)
8414{
8415	KPath pathBuffer(path, KPath::DEFAULT);
8416	if (pathBuffer.InitCheck() != B_OK)
8417		return B_NO_MEMORY;
8418
8419	return dir_create(fd, pathBuffer.LockBuffer(), perms, true);
8420}
8421
8422
8423status_t
8424_kern_remove_dir(int fd, const char* path)
8425{
8426	KPath pathBuffer(path, KPath::LAZY_ALLOC);
8427	if (pathBuffer.InitCheck() != B_OK)
8428		return B_NO_MEMORY;
8429
8430	return dir_remove(fd, pathBuffer.LockBuffer(), true);
8431}
8432
8433
8434/*!	\brief Reads the contents of a symlink referred to by a FD + path pair.
8435
8436	At least one of \a fd and \a path must be specified.
8437	If only \a fd is given, the function the symlink to be read is the node
8438	identified by this FD. If only a path is given, this path identifies the
8439	symlink to be read. If both are given and the path is absolute, \a fd is
8440	ignored; a relative path is reckoned off of the directory (!) identified
8441	by \a fd.
8442	If this function fails with B_BUFFER_OVERFLOW, the \a _bufferSize pointer
8443	will still be updated to reflect the required buffer size.
8444
8445	\param fd The FD. May be < 0.
8446	\param path The absolute or relative path. May be \c NULL.
8447	\param buffer The buffer into which the contents of the symlink shall be
8448		   written.
8449	\param _bufferSize A pointer to the size of the supplied buffer.
8450	\return The length of the link on success or an appropriate error code
8451*/
8452status_t
8453_kern_read_link(int fd, const char* path, char* buffer, size_t* _bufferSize)
8454{
8455	KPath pathBuffer(path, KPath::LAZY_ALLOC);
8456	if (pathBuffer.InitCheck() != B_OK)
8457		return B_NO_MEMORY;
8458
8459	return common_read_link(fd, pathBuffer.LockBuffer(),
8460		buffer, _bufferSize, true);
8461}
8462
8463
8464/*!	\brief Creates a symlink specified by a FD + path pair.
8465
8466	\a path must always be specified (it contains the name of the new symlink
8467	at least). If only a path is given, this path identifies the location at
8468	which the symlink shall be created. If both \a fd and \a path are given and
8469	the path is absolute, \a fd is ignored; a relative path is reckoned off
8470	of the directory (!) identified by \a fd.
8471
8472	\param fd The FD. May be < 0.
8473	\param toPath The absolute or relative path. Must not be \c NULL.
8474	\param mode The access permissions the new symlink shall have.
8475	\return \c B_OK, if the symlink has been created successfully, another
8476			error code otherwise.
8477*/
8478status_t
8479_kern_create_symlink(int fd, const char* path, const char* toPath, int mode)
8480{
8481	KPath pathBuffer(path);
8482	if (pathBuffer.InitCheck() != B_OK)
8483		return B_NO_MEMORY;
8484
8485	return common_create_symlink(fd, pathBuffer.LockBuffer(),
8486		toPath, mode, true);
8487}
8488
8489
8490status_t
8491_kern_create_link(int pathFD, const char* path, int toFD, const char* toPath,
8492	bool traverseLeafLink)
8493{
8494	KPath pathBuffer(path);
8495	KPath toPathBuffer(toPath);
8496	if (pathBuffer.InitCheck() != B_OK || toPathBuffer.InitCheck() != B_OK)
8497		return B_NO_MEMORY;
8498
8499	return common_create_link(pathFD, pathBuffer.LockBuffer(), toFD,
8500		toPathBuffer.LockBuffer(), traverseLeafLink, true);
8501}
8502
8503
8504/*!	\brief Removes an entry specified by a FD + path pair from its directory.
8505
8506	\a path must always be specified (it contains at least the name of the entry
8507	to be deleted). If only a path is given, this path identifies the entry
8508	directly. If both \a fd and \a path are given and the path is absolute,
8509	\a fd is ignored; a relative path is reckoned off of the directory (!)
8510	identified by \a fd.
8511
8512	\param fd The FD. May be < 0.
8513	\param path The absolute or relative path. Must not be \c NULL.
8514	\return \c B_OK, if the entry has been removed successfully, another
8515			error code otherwise.
8516*/
8517status_t
8518_kern_unlink(int fd, const char* path)
8519{
8520	KPath pathBuffer(path);
8521	if (pathBuffer.InitCheck() != B_OK)
8522		return B_NO_MEMORY;
8523
8524	return common_unlink(fd, pathBuffer.LockBuffer(), true);
8525}
8526
8527
8528/*!	\brief Moves an entry specified by a FD + path pair to a an entry specified
8529		   by another FD + path pair.
8530
8531	\a oldPath and \a newPath must always be specified (they contain at least
8532	the name of the entry). If only a path is given, this path identifies the
8533	entry directly. If both a FD and a path are given and the path is absolute,
8534	the FD is ignored; a relative path is reckoned off of the directory (!)
8535	identified by the respective FD.
8536
8537	\param oldFD The FD of the old location. May be < 0.
8538	\param oldPath The absolute or relative path of the old location. Must not
8539		   be \c NULL.
8540	\param newFD The FD of the new location. May be < 0.
8541	\param newPath The absolute or relative path of the new location. Must not
8542		   be \c NULL.
8543	\return \c B_OK, if the entry has been moved successfully, another
8544			error code otherwise.
8545*/
8546status_t
8547_kern_rename(int oldFD, const char* oldPath, int newFD, const char* newPath)
8548{
8549	KPath oldPathBuffer(oldPath);
8550	KPath newPathBuffer(newPath);
8551	if (oldPathBuffer.InitCheck() != B_OK || newPathBuffer.InitCheck() != B_OK)
8552		return B_NO_MEMORY;
8553
8554	return common_rename(oldFD, oldPathBuffer.LockBuffer(),
8555		newFD, newPathBuffer.LockBuffer(), true);
8556}
8557
8558
8559status_t
8560_kern_access(int fd, const char* path, int mode, bool effectiveUserGroup)
8561{
8562	KPath pathBuffer(path, KPath::LAZY_ALLOC);
8563	if (pathBuffer.InitCheck() != B_OK)
8564		return B_NO_MEMORY;
8565
8566	return common_access(fd, pathBuffer.LockBuffer(), mode, effectiveUserGroup,
8567		true);
8568}
8569
8570
8571/*!	\brief Reads stat data of an entity specified by a FD + path pair.
8572
8573	If only \a fd is given, the stat operation associated with the type
8574	of the FD (node, attr, attr dir etc.) is performed. If only \a path is
8575	given, this path identifies the entry for whose node to retrieve the
8576	stat data. If both \a fd and \a path are given and the path is absolute,
8577	\a fd is ignored; a relative path is reckoned off of the directory (!)
8578	identified by \a fd and specifies the entry whose stat data shall be
8579	retrieved.
8580
8581	\param fd The FD. May be < 0.
8582	\param path The absolute or relative path. Must not be \c NULL.
8583	\param traverseLeafLink If \a path is given, \c true specifies that the
8584		   function shall not stick to symlinks, but traverse them.
8585	\param stat The buffer the stat data shall be written into.
8586	\param statSize The size of the supplied stat buffer.
8587	\return \c B_OK, if the the stat data have been read successfully, another
8588			error code otherwise.
8589*/
8590status_t
8591_kern_read_stat(int fd, const char* path, bool traverseLeafLink,
8592	struct stat* stat, size_t statSize)
8593{
8594	struct stat completeStat;
8595	struct stat* originalStat = NULL;
8596	status_t status;
8597
8598	if (statSize > sizeof(struct stat))
8599		return B_BAD_VALUE;
8600
8601	// this supports different stat extensions
8602	if (statSize < sizeof(struct stat)) {
8603		originalStat = stat;
8604		stat = &completeStat;
8605	}
8606
8607	status = vfs_read_stat(fd, path, traverseLeafLink, stat, true);
8608
8609	if (status == B_OK && originalStat != NULL)
8610		memcpy(originalStat, stat, statSize);
8611
8612	return status;
8613}
8614
8615
8616/*!	\brief Writes stat data of an entity specified by a FD + path pair.
8617
8618	If only \a fd is given, the stat operation associated with the type
8619	of the FD (node, attr, attr dir etc.) is performed. If only \a path is
8620	given, this path identifies the entry for whose node to write the
8621	stat data. If both \a fd and \a path are given and the path is absolute,
8622	\a fd is ignored; a relative path is reckoned off of the directory (!)
8623	identified by \a fd and specifies the entry whose stat data shall be
8624	written.
8625
8626	\param fd The FD. May be < 0.
8627	\param path The absolute or relative path. May be \c NULL.
8628	\param traverseLeafLink If \a path is given, \c true specifies that the
8629		   function shall not stick to symlinks, but traverse them.
8630	\param stat The buffer containing the stat data to be written.
8631	\param statSize The size of the supplied stat buffer.
8632	\param statMask A mask specifying which parts of the stat data shall be
8633		   written.
8634	\return \c B_OK, if the the stat data have been written successfully,
8635			another error code otherwise.
8636*/
8637status_t
8638_kern_write_stat(int fd, const char* path, bool traverseLeafLink,
8639	const struct stat* stat, size_t statSize, int statMask)
8640{
8641	struct stat completeStat;
8642
8643	if (statSize > sizeof(struct stat))
8644		return B_BAD_VALUE;
8645
8646	// this supports different stat extensions
8647	if (statSize < sizeof(struct stat)) {
8648		memset((uint8*)&completeStat + statSize, 0,
8649			sizeof(struct stat) - statSize);
8650		memcpy(&completeStat, stat, statSize);
8651		stat = &completeStat;
8652	}
8653
8654	status_t status;
8655
8656	if (path != NULL) {
8657		// path given: write the stat of the node referred to by (fd, path)
8658		KPath pathBuffer(path);
8659		if (pathBuffer.InitCheck() != B_OK)
8660			return B_NO_MEMORY;
8661
8662		status = common_path_write_stat(fd, pathBuffer.LockBuffer(),
8663			traverseLeafLink, stat, statMask, true);
8664	} else {
8665		// no path given: get the FD and use the FD operation
8666		FileDescriptorPutter descriptor
8667			(get_fd(get_current_io_context(true), fd));
8668		if (!descriptor.IsSet())
8669			return B_FILE_ERROR;
8670
8671		if (descriptor->ops->fd_write_stat)
8672			status = descriptor->ops->fd_write_stat(descriptor.Get(), stat, statMask);
8673		else
8674			status = B_UNSUPPORTED;
8675	}
8676
8677	return status;
8678}
8679
8680
8681int
8682_kern_open_attr_dir(int fd, const char* path, bool traverseLeafLink)
8683{
8684	KPath pathBuffer(path, KPath::LAZY_ALLOC);
8685	if (pathBuffer.InitCheck() != B_OK)
8686		return B_NO_MEMORY;
8687
8688	return attr_dir_open(fd, pathBuffer.LockBuffer(), traverseLeafLink, true);
8689}
8690
8691
8692int
8693_kern_open_attr(int fd, const char* path, const char* name, uint32 type,
8694	int openMode)
8695{
8696	KPath pathBuffer(path, KPath::LAZY_ALLOC);
8697	if (pathBuffer.InitCheck() != B_OK)
8698		return B_NO_MEMORY;
8699
8700	if ((openMode & O_CREAT) != 0) {
8701		return attr_create(fd, pathBuffer.LockBuffer(), name, type, openMode,
8702			true);
8703	}
8704
8705	return attr_open(fd, pathBuffer.LockBuffer(), name, openMode, true);
8706}
8707
8708
8709status_t
8710_kern_remove_attr(int fd, const char* name)
8711{
8712	return attr_remove(fd, name, true);
8713}
8714
8715
8716status_t
8717_kern_rename_attr(int fromFile, const char* fromName, int toFile,
8718	const char* toName)
8719{
8720	return attr_rename(fromFile, fromName, toFile, toName, true);
8721}
8722
8723
8724int
8725_kern_open_index_dir(dev_t device)
8726{
8727	return index_dir_open(device, true);
8728}
8729
8730
8731status_t
8732_kern_create_index(dev_t device, const char* name, uint32 type, uint32 flags)
8733{
8734	return index_create(device, name, type, flags, true);
8735}
8736
8737
8738status_t
8739_kern_read_index_stat(dev_t device, const char* name, struct stat* stat)
8740{
8741	return index_name_read_stat(device, name, stat, true);
8742}
8743
8744
8745status_t
8746_kern_remove_index(dev_t device, const char* name)
8747{
8748	return index_remove(device, name, true);
8749}
8750
8751
8752status_t
8753_kern_getcwd(char* buffer, size_t size)
8754{
8755	TRACE(("_kern_getcwd: buf %p, %ld\n", buffer, size));
8756
8757	// Call vfs to get current working directory
8758	return get_cwd(buffer, size, true);
8759}
8760
8761
8762status_t
8763_kern_setcwd(int fd, const char* path)
8764{
8765	KPath pathBuffer(path, KPath::LAZY_ALLOC);
8766	if (pathBuffer.InitCheck() != B_OK)
8767		return B_NO_MEMORY;
8768
8769	return set_cwd(fd, pathBuffer.LockBuffer(), true);
8770}
8771
8772
8773//	#pragma mark - userland syscalls
8774
8775
8776dev_t
8777_user_mount(const char* userPath, const char* userDevice,
8778	const char* userFileSystem, uint32 flags, const char* userArgs,
8779	size_t argsLength)
8780{
8781	char fileSystem[B_FILE_NAME_LENGTH];
8782	KPath path, device;
8783	char* args = NULL;
8784	status_t status;
8785
8786	if (!IS_USER_ADDRESS(userPath))
8787		return B_BAD_ADDRESS;
8788
8789	if (path.InitCheck() != B_OK || device.InitCheck() != B_OK)
8790		return B_NO_MEMORY;
8791
8792	status = user_copy_name(path.LockBuffer(), userPath,
8793		B_PATH_NAME_LENGTH);
8794	if (status != B_OK)
8795		return status;
8796	path.UnlockBuffer();
8797
8798	if (userFileSystem != NULL) {
8799		if (!IS_USER_ADDRESS(userFileSystem))
8800			return B_BAD_ADDRESS;
8801
8802		status = user_copy_name(fileSystem, userFileSystem, sizeof(fileSystem));
8803		if (status != B_OK)
8804			return status;
8805	}
8806
8807	if (userDevice != NULL) {
8808		if (!IS_USER_ADDRESS(userDevice))
8809			return B_BAD_ADDRESS;
8810
8811		status = user_copy_name(device.LockBuffer(), userDevice,
8812			B_PATH_NAME_LENGTH);
8813		if (status != B_OK)
8814			return status;
8815		device.UnlockBuffer();
8816	}
8817
8818	if (userArgs != NULL && argsLength > 0) {
8819		if (!IS_USER_ADDRESS(userArgs))
8820			return B_BAD_ADDRESS;
8821
8822		// this is a safety restriction
8823		if (argsLength >= 65536)
8824			return B_NAME_TOO_LONG;
8825
8826		args = (char*)malloc(argsLength + 1);
8827		if (args == NULL)
8828			return B_NO_MEMORY;
8829
8830		status = user_copy_name(args, userArgs, argsLength + 1);
8831		if (status != B_OK) {
8832			free(args);
8833			return status;
8834		}
8835	}
8836
8837	status = fs_mount(path.LockBuffer(),
8838		userDevice != NULL ? device.Path() : NULL,
8839		userFileSystem ? fileSystem : NULL, flags, args, false);
8840
8841	free(args);
8842	return status;
8843}
8844
8845
8846status_t
8847_user_unmount(const char* userPath, uint32 flags)
8848{
8849	if (!IS_USER_ADDRESS(userPath))
8850		return B_BAD_ADDRESS;
8851
8852	KPath pathBuffer;
8853	if (pathBuffer.InitCheck() != B_OK)
8854		return B_NO_MEMORY;
8855
8856	char* path = pathBuffer.LockBuffer();
8857
8858	status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
8859	if (status != B_OK)
8860		return status;
8861
8862	return fs_unmount(path, -1, flags & ~B_UNMOUNT_BUSY_PARTITION, false);
8863}
8864
8865
8866status_t
8867_user_read_fs_info(dev_t device, struct fs_info* userInfo)
8868{
8869	struct fs_info info;
8870	status_t status;
8871
8872	if (userInfo == NULL)
8873		return B_BAD_VALUE;
8874
8875	if (!IS_USER_ADDRESS(userInfo))
8876		return B_BAD_ADDRESS;
8877
8878	status = fs_read_info(device, &info);
8879	if (status != B_OK)
8880		return status;
8881
8882	if (user_memcpy(userInfo, &info, sizeof(struct fs_info)) != B_OK)
8883		return B_BAD_ADDRESS;
8884
8885	return B_OK;
8886}
8887
8888
8889status_t
8890_user_write_fs_info(dev_t device, const struct fs_info* userInfo, int mask)
8891{
8892	struct fs_info info;
8893
8894	if (userInfo == NULL)
8895		return B_BAD_VALUE;
8896
8897	if (!IS_USER_ADDRESS(userInfo)
8898		|| user_memcpy(&info, userInfo, sizeof(struct fs_info)) != B_OK)
8899		return B_BAD_ADDRESS;
8900
8901	return fs_write_info(device, &info, mask);
8902}
8903
8904
8905dev_t
8906_user_next_device(int32* _userCookie)
8907{
8908	int32 cookie;
8909	dev_t device;
8910
8911	if (!IS_USER_ADDRESS(_userCookie)
8912		|| user_memcpy(&cookie, _userCookie, sizeof(int32)) != B_OK)
8913		return B_BAD_ADDRESS;
8914
8915	device = fs_next_device(&cookie);
8916
8917	if (device >= B_OK) {
8918		// update user cookie
8919		if (user_memcpy(_userCookie, &cookie, sizeof(int32)) != B_OK)
8920			return B_BAD_ADDRESS;
8921	}
8922
8923	return device;
8924}
8925
8926
8927status_t
8928_user_sync(void)
8929{
8930	return _kern_sync();
8931}
8932
8933
8934status_t
8935_user_get_next_fd_info(team_id team, uint32* userCookie, fd_info* userInfo,
8936	size_t infoSize)
8937{
8938	struct fd_info info;
8939	uint32 cookie;
8940
8941	// only root can do this
8942	if (geteuid() != 0)
8943		return B_NOT_ALLOWED;
8944
8945	if (infoSize != sizeof(fd_info))
8946		return B_BAD_VALUE;
8947
8948	if (!IS_USER_ADDRESS(userCookie) || !IS_USER_ADDRESS(userInfo)
8949		|| user_memcpy(&cookie, userCookie, sizeof(uint32)) != B_OK)
8950		return B_BAD_ADDRESS;
8951
8952	status_t status = _kern_get_next_fd_info(team, &cookie, &info, infoSize);
8953	if (status != B_OK)
8954		return status;
8955
8956	if (user_memcpy(userCookie, &cookie, sizeof(uint32)) != B_OK
8957		|| user_memcpy(userInfo, &info, infoSize) != B_OK)
8958		return B_BAD_ADDRESS;
8959
8960	return status;
8961}
8962
8963
8964status_t
8965_user_entry_ref_to_path(dev_t device, ino_t inode, const char* leaf,
8966	char* userPath, size_t pathLength)
8967{
8968	if (!IS_USER_ADDRESS(userPath))
8969		return B_BAD_ADDRESS;
8970
8971	KPath path;
8972	if (path.InitCheck() != B_OK)
8973		return B_NO_MEMORY;
8974
8975	// copy the leaf name onto the stack
8976	char stackLeaf[B_FILE_NAME_LENGTH];
8977	if (leaf != NULL) {
8978		if (!IS_USER_ADDRESS(leaf))
8979			return B_BAD_ADDRESS;
8980
8981		int status = user_copy_name(stackLeaf, leaf, B_FILE_NAME_LENGTH);
8982		if (status != B_OK)
8983			return status;
8984
8985		leaf = stackLeaf;
8986	}
8987
8988	status_t status = vfs_entry_ref_to_path(device, inode, leaf,
8989		false, path.LockBuffer(), path.BufferSize());
8990	if (status != B_OK)
8991		return status;
8992
8993	path.UnlockBuffer();
8994
8995	int length = user_strlcpy(userPath, path.Path(), pathLength);
8996	if (length < 0)
8997		return length;
8998	if (length >= (int)pathLength)
8999		return B_BUFFER_OVERFLOW;
9000
9001	return B_OK;
9002}
9003
9004
9005status_t
9006_user_normalize_path(const char* userPath, bool traverseLink, char* buffer)
9007{
9008	if (userPath == NULL || buffer == NULL)
9009		return B_BAD_VALUE;
9010	if (!IS_USER_ADDRESS(userPath) || !IS_USER_ADDRESS(buffer))
9011		return B_BAD_ADDRESS;
9012
9013	// copy path from userland
9014	KPath pathBuffer;
9015	if (pathBuffer.InitCheck() != B_OK)
9016		return B_NO_MEMORY;
9017	char* path = pathBuffer.LockBuffer();
9018
9019	status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
9020	if (status != B_OK)
9021		return status;
9022
9023	status_t error = normalize_path(path, pathBuffer.BufferSize(), traverseLink,
9024		false);
9025	if (error != B_OK)
9026		return error;
9027
9028	// copy back to userland
9029	int len = user_strlcpy(buffer, path, B_PATH_NAME_LENGTH);
9030	if (len < 0)
9031		return len;
9032	if (len >= B_PATH_NAME_LENGTH)
9033		return B_BUFFER_OVERFLOW;
9034
9035	return B_OK;
9036}
9037
9038
9039int
9040_user_open_entry_ref(dev_t device, ino_t inode, const char* userName,
9041	int openMode, int perms)
9042{
9043	char name[B_FILE_NAME_LENGTH];
9044
9045	if (userName == NULL || device < 0 || inode < 0)
9046		return B_BAD_VALUE;
9047	if (!IS_USER_ADDRESS(userName))
9048		return B_BAD_ADDRESS;
9049	status_t status = user_copy_name(name, userName, sizeof(name));
9050	if (status != B_OK)
9051		return status;
9052
9053	if ((openMode & O_CREAT) != 0) {
9054		return file_create_entry_ref(device, inode, name, openMode, perms,
9055			false);
9056	}
9057
9058	return file_open_entry_ref(device, inode, name, openMode, false);
9059}
9060
9061
9062int
9063_user_open(int fd, const char* userPath, int openMode, int perms)
9064{
9065	KPath path;
9066	if (path.InitCheck() != B_OK)
9067		return B_NO_MEMORY;
9068
9069	char* buffer = path.LockBuffer();
9070
9071	if (!IS_USER_ADDRESS(userPath))
9072		return B_BAD_ADDRESS;
9073	status_t status = user_copy_name(buffer, userPath, B_PATH_NAME_LENGTH);
9074	if (status != B_OK)
9075		return status;
9076
9077	if ((openMode & O_CREAT) != 0)
9078		return file_create(fd, buffer, openMode, perms, false);
9079
9080	return file_open(fd, buffer, openMode, false);
9081}
9082
9083
9084int
9085_user_open_dir_entry_ref(dev_t device, ino_t inode, const char* userName)
9086{
9087	if (userName != NULL) {
9088		char name[B_FILE_NAME_LENGTH];
9089
9090		if (!IS_USER_ADDRESS(userName))
9091			return B_BAD_ADDRESS;
9092		status_t status = user_copy_name(name, userName, sizeof(name));
9093		if (status != B_OK)
9094			return status;
9095
9096		return dir_open_entry_ref(device, inode, name, false);
9097	}
9098	return dir_open_entry_ref(device, inode, NULL, false);
9099}
9100
9101
9102int
9103_user_open_dir(int fd, const char* userPath)
9104{
9105	if (userPath == NULL)
9106		return dir_open(fd, NULL, false);
9107
9108	KPath path;
9109	if (path.InitCheck() != B_OK)
9110		return B_NO_MEMORY;
9111
9112	char* buffer = path.LockBuffer();
9113
9114	if (!IS_USER_ADDRESS(userPath))
9115		return B_BAD_ADDRESS;
9116	status_t status = user_copy_name(buffer, userPath, B_PATH_NAME_LENGTH);
9117	if (status != B_OK)
9118		return status;
9119
9120	return dir_open(fd, buffer, false);
9121}
9122
9123
9124/*!	\brief Opens a directory's parent directory and returns the entry name
9125		   of the former.
9126
9127	Aside from that it returns the directory's entry name, this method is
9128	equivalent to \code _user_open_dir(fd, "..") \endcode. It really is
9129	equivalent, if \a userName is \c NULL.
9130
9131	If a name buffer is supplied and the name does not fit the buffer, the
9132	function fails. A buffer of size \c B_FILE_NAME_LENGTH should be safe.
9133
9134	\param fd A FD referring to a directory.
9135	\param userName Buffer the directory's entry name shall be written into.
9136		   May be \c NULL.
9137	\param nameLength Size of the name buffer.
9138	\return The file descriptor of the opened parent directory, if everything
9139			went fine, an error code otherwise.
9140*/
9141int
9142_user_open_parent_dir(int fd, char* userName, size_t nameLength)
9143{
9144	bool kernel = false;
9145
9146	if (userName && !IS_USER_ADDRESS(userName))
9147		return B_BAD_ADDRESS;
9148
9149	// open the parent dir
9150	int parentFD = dir_open(fd, (char*)"..", kernel);
9151	if (parentFD < 0)
9152		return parentFD;
9153	FDCloser fdCloser(parentFD, kernel);
9154
9155	if (userName) {
9156		// get the vnodes
9157		struct vnode* parentVNode = get_vnode_from_fd(parentFD, kernel);
9158		struct vnode* dirVNode = get_vnode_from_fd(fd, kernel);
9159		VnodePutter parentVNodePutter(parentVNode);
9160		VnodePutter dirVNodePutter(dirVNode);
9161		if (!parentVNode || !dirVNode)
9162			return B_FILE_ERROR;
9163
9164		// get the vnode name
9165		char _buffer[offsetof(struct dirent, d_name) + B_FILE_NAME_LENGTH + 1];
9166		struct dirent* buffer = (struct dirent*)_buffer;
9167		status_t status = get_vnode_name(dirVNode, parentVNode, buffer,
9168			sizeof(_buffer), get_current_io_context(false));
9169		if (status != B_OK)
9170			return status;
9171
9172		// copy the name to the userland buffer
9173		int len = user_strlcpy(userName, buffer->d_name, nameLength);
9174		if (len < 0)
9175			return len;
9176		if (len >= (int)nameLength)
9177			return B_BUFFER_OVERFLOW;
9178	}
9179
9180	return fdCloser.Detach();
9181}
9182
9183
9184status_t
9185_user_fcntl(int fd, int op, size_t argument)
9186{
9187	status_t status = common_fcntl(fd, op, argument, false);
9188	if (op == F_SETLKW)
9189		syscall_restart_handle_post(status);
9190
9191	return status;
9192}
9193
9194
9195status_t
9196_user_fsync(int fd)
9197{
9198	return common_sync(fd, false);
9199}
9200
9201
9202status_t
9203_user_flock(int fd, int operation)
9204{
9205	FUNCTION(("_user_fcntl(fd = %d, op = %d)\n", fd, operation));
9206
9207	// Check if the operation is valid
9208	switch (operation & ~LOCK_NB) {
9209		case LOCK_UN:
9210		case LOCK_SH:
9211		case LOCK_EX:
9212			break;
9213
9214		default:
9215			return B_BAD_VALUE;
9216	}
9217
9218	struct vnode* vnode;
9219	FileDescriptorPutter descriptor(get_fd_and_vnode(fd, &vnode, false));
9220	if (!descriptor.IsSet())
9221		return B_FILE_ERROR;
9222
9223	if (descriptor->type != FDTYPE_FILE)
9224		return B_BAD_VALUE;
9225
9226	struct flock flock;
9227	flock.l_start = 0;
9228	flock.l_len = OFF_MAX;
9229	flock.l_whence = 0;
9230	flock.l_type = (operation & LOCK_SH) != 0 ? F_RDLCK : F_WRLCK;
9231
9232	status_t status;
9233	if ((operation & LOCK_UN) != 0) {
9234		if (HAS_FS_CALL(vnode, release_lock))
9235			status = FS_CALL(vnode, release_lock, descriptor->cookie, &flock);
9236		else
9237			status = release_advisory_lock(vnode, NULL, descriptor.Get(), &flock);
9238	} else {
9239		if (HAS_FS_CALL(vnode, acquire_lock)) {
9240			status = FS_CALL(vnode, acquire_lock, descriptor->cookie, &flock,
9241				(operation & LOCK_NB) == 0);
9242		} else {
9243			status = acquire_advisory_lock(vnode, NULL, descriptor.Get(), &flock,
9244				(operation & LOCK_NB) == 0);
9245		}
9246	}
9247
9248	syscall_restart_handle_post(status);
9249
9250	return status;
9251}
9252
9253
9254status_t
9255_user_lock_node(int fd)
9256{
9257	return common_lock_node(fd, false);
9258}
9259
9260
9261status_t
9262_user_unlock_node(int fd)
9263{
9264	return common_unlock_node(fd, false);
9265}
9266
9267
9268status_t
9269_user_preallocate(int fd, off_t offset, off_t length)
9270{
9271	return common_preallocate(fd, offset, length, false);
9272}
9273
9274
9275status_t
9276_user_create_dir_entry_ref(dev_t device, ino_t inode, const char* userName,
9277	int perms)
9278{
9279	char name[B_FILE_NAME_LENGTH];
9280	status_t status;
9281
9282	if (!IS_USER_ADDRESS(userName))
9283		return B_BAD_ADDRESS;
9284
9285	status = user_copy_name(name, userName, sizeof(name));
9286	if (status != B_OK)
9287		return status;
9288
9289	return dir_create_entry_ref(device, inode, name, perms, false);
9290}
9291
9292
9293status_t
9294_user_create_dir(int fd, const char* userPath, int perms)
9295{
9296	KPath pathBuffer;
9297	if (pathBuffer.InitCheck() != B_OK)
9298		return B_NO_MEMORY;
9299
9300	char* path = pathBuffer.LockBuffer();
9301
9302	if (!IS_USER_ADDRESS(userPath))
9303		return B_BAD_ADDRESS;
9304	status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
9305	if (status != B_OK)
9306		return status;
9307
9308	return dir_create(fd, path, perms, false);
9309}
9310
9311
9312status_t
9313_user_remove_dir(int fd, const char* userPath)
9314{
9315	KPath pathBuffer;
9316	if (pathBuffer.InitCheck() != B_OK)
9317		return B_NO_MEMORY;
9318
9319	char* path = pathBuffer.LockBuffer();
9320
9321	if (userPath != NULL) {
9322		if (!IS_USER_ADDRESS(userPath))
9323			return B_BAD_ADDRESS;
9324		status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
9325		if (status != B_OK)
9326			return status;
9327	}
9328
9329	return dir_remove(fd, userPath ? path : NULL, false);
9330}
9331
9332
9333status_t
9334_user_read_link(int fd, const char* userPath, char* userBuffer,
9335	size_t* userBufferSize)
9336{
9337	KPath pathBuffer, linkBuffer;
9338	if (pathBuffer.InitCheck() != B_OK || linkBuffer.InitCheck() != B_OK)
9339		return B_NO_MEMORY;
9340
9341	size_t bufferSize;
9342
9343	if (!IS_USER_ADDRESS(userBuffer) || !IS_USER_ADDRESS(userBufferSize)
9344		|| user_memcpy(&bufferSize, userBufferSize, sizeof(size_t)) != B_OK)
9345		return B_BAD_ADDRESS;
9346
9347	char* path = pathBuffer.LockBuffer();
9348	char* buffer = linkBuffer.LockBuffer();
9349
9350	if (userPath) {
9351		if (!IS_USER_ADDRESS(userPath))
9352			return B_BAD_ADDRESS;
9353		status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
9354		if (status != B_OK)
9355			return status;
9356
9357		if (bufferSize > B_PATH_NAME_LENGTH)
9358			bufferSize = B_PATH_NAME_LENGTH;
9359	}
9360
9361	size_t newBufferSize = bufferSize;
9362	status_t status = common_read_link(fd, userPath ? path : NULL, buffer,
9363		&newBufferSize, false);
9364
9365	// we also update the bufferSize in case of errors
9366	// (the real length will be returned in case of B_BUFFER_OVERFLOW)
9367	if (user_memcpy(userBufferSize, &newBufferSize, sizeof(size_t)) != B_OK)
9368		return B_BAD_ADDRESS;
9369
9370	if (status != B_OK)
9371		return status;
9372
9373	bufferSize = min_c(newBufferSize, bufferSize);
9374	if (user_memcpy(userBuffer, buffer, bufferSize) != B_OK)
9375		return B_BAD_ADDRESS;
9376
9377	return B_OK;
9378}
9379
9380
9381status_t
9382_user_create_symlink(int fd, const char* userPath, const char* userToPath,
9383	int mode)
9384{
9385	KPath pathBuffer;
9386	KPath toPathBuffer;
9387	if (pathBuffer.InitCheck() != B_OK || toPathBuffer.InitCheck() != B_OK)
9388		return B_NO_MEMORY;
9389
9390	char* path = pathBuffer.LockBuffer();
9391	char* toPath = toPathBuffer.LockBuffer();
9392
9393	if (!IS_USER_ADDRESS(userPath) || !IS_USER_ADDRESS(userToPath))
9394		return B_BAD_ADDRESS;
9395	status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
9396	if (status != B_OK)
9397		return status;
9398	status = user_copy_name(toPath, userToPath, B_PATH_NAME_LENGTH);
9399	if (status != B_OK)
9400		return status;
9401
9402	return common_create_symlink(fd, path, toPath, mode, false);
9403}
9404
9405
9406status_t
9407_user_create_link(int pathFD, const char* userPath, int toFD,
9408	const char* userToPath, bool traverseLeafLink)
9409{
9410	KPath pathBuffer;
9411	KPath toPathBuffer;
9412	if (pathBuffer.InitCheck() != B_OK || toPathBuffer.InitCheck() != B_OK)
9413		return B_NO_MEMORY;
9414
9415	char* path = pathBuffer.LockBuffer();
9416	char* toPath = toPathBuffer.LockBuffer();
9417
9418	if (!IS_USER_ADDRESS(userPath) || !IS_USER_ADDRESS(userToPath))
9419		return B_BAD_ADDRESS;
9420	status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
9421	if (status != B_OK)
9422		return status;
9423	status = user_copy_name(toPath, userToPath, B_PATH_NAME_LENGTH);
9424	if (status != B_OK)
9425		return status;
9426
9427	status = check_path(toPath);
9428	if (status != B_OK)
9429		return status;
9430
9431	return common_create_link(pathFD, path, toFD, toPath, traverseLeafLink,
9432		false);
9433}
9434
9435
9436status_t
9437_user_unlink(int fd, const char* userPath)
9438{
9439	KPath pathBuffer;
9440	if (pathBuffer.InitCheck() != B_OK)
9441		return B_NO_MEMORY;
9442
9443	char* path = pathBuffer.LockBuffer();
9444
9445	if (!IS_USER_ADDRESS(userPath))
9446		return B_BAD_ADDRESS;
9447	status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
9448	if (status != B_OK)
9449		return status;
9450
9451	return common_unlink(fd, path, false);
9452}
9453
9454
9455status_t
9456_user_rename(int oldFD, const char* userOldPath, int newFD,
9457	const char* userNewPath)
9458{
9459	KPath oldPathBuffer;
9460	KPath newPathBuffer;
9461	if (oldPathBuffer.InitCheck() != B_OK || newPathBuffer.InitCheck() != B_OK)
9462		return B_NO_MEMORY;
9463
9464	char* oldPath = oldPathBuffer.LockBuffer();
9465	char* newPath = newPathBuffer.LockBuffer();
9466
9467	if (!IS_USER_ADDRESS(userOldPath) || !IS_USER_ADDRESS(userNewPath))
9468		return B_BAD_ADDRESS;
9469	status_t status = user_copy_name(oldPath, userOldPath, B_PATH_NAME_LENGTH);
9470	if (status != B_OK)
9471		return status;
9472	status = user_copy_name(newPath, userNewPath, B_PATH_NAME_LENGTH);
9473	if (status != B_OK)
9474		return status;
9475
9476	return common_rename(oldFD, oldPath, newFD, newPath, false);
9477}
9478
9479
9480status_t
9481_user_create_fifo(int fd, const char* userPath, mode_t perms)
9482{
9483	KPath pathBuffer;
9484	if (pathBuffer.InitCheck() != B_OK)
9485		return B_NO_MEMORY;
9486
9487	char* path = pathBuffer.LockBuffer();
9488
9489	if (!IS_USER_ADDRESS(userPath))
9490		return B_BAD_ADDRESS;
9491	status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
9492	if (status != B_OK)
9493		return status;
9494
9495	// split into directory vnode and filename path
9496	char filename[B_FILE_NAME_LENGTH];
9497	VnodePutter dir;
9498	status = fd_and_path_to_dir_vnode(fd, path, dir, filename, false);
9499	if (status != B_OK)
9500		return status;
9501
9502	// the underlying FS needs to support creating FIFOs
9503	if (!HAS_FS_CALL(dir, create_special_node))
9504		return B_UNSUPPORTED;
9505
9506	// create the entry	-- the FIFO sub node is set up automatically
9507	fs_vnode superVnode;
9508	ino_t nodeID;
9509	status = FS_CALL(dir.Get(), create_special_node, filename, NULL,
9510		S_IFIFO | (perms & S_IUMSK), 0, &superVnode, &nodeID);
9511
9512	// create_special_node() acquired a reference for us that we don't need.
9513	if (status == B_OK)
9514		put_vnode(dir->mount->volume, nodeID);
9515
9516	return status;
9517}
9518
9519
9520status_t
9521_user_create_pipe(int* userFDs)
9522{
9523	// rootfs should support creating FIFOs, but let's be sure
9524	if (!HAS_FS_CALL(sRoot, create_special_node))
9525		return B_UNSUPPORTED;
9526
9527	// create the node	-- the FIFO sub node is set up automatically
9528	fs_vnode superVnode;
9529	ino_t nodeID;
9530	status_t status = FS_CALL(sRoot, create_special_node, NULL, NULL,
9531		S_IFIFO | S_IRUSR | S_IWUSR, 0, &superVnode, &nodeID);
9532	if (status != B_OK)
9533		return status;
9534
9535	// We've got one reference to the node and need another one.
9536	struct vnode* vnode;
9537	status = get_vnode(sRoot->mount->id, nodeID, &vnode, true, false);
9538	if (status != B_OK) {
9539		// that should not happen
9540		dprintf("_user_create_pipe(): Failed to lookup vnode (%" B_PRIdDEV ", "
9541			"%" B_PRIdINO ")\n", sRoot->mount->id, sRoot->id);
9542		return status;
9543	}
9544
9545	// Everything looks good so far. Open two FDs for reading respectively
9546	// writing.
9547	int fds[2];
9548	fds[0] = open_vnode(vnode, O_RDONLY, false);
9549	fds[1] = open_vnode(vnode, O_WRONLY, false);
9550
9551	FDCloser closer0(fds[0], false);
9552	FDCloser closer1(fds[1], false);
9553
9554	status = (fds[0] >= 0 ? (fds[1] >= 0 ? B_OK : fds[1]) : fds[0]);
9555
9556	// copy FDs to userland
9557	if (status == B_OK) {
9558		if (!IS_USER_ADDRESS(userFDs)
9559			|| user_memcpy(userFDs, fds, sizeof(fds)) != B_OK) {
9560			status = B_BAD_ADDRESS;
9561		}
9562	}
9563
9564	// keep FDs, if everything went fine
9565	if (status == B_OK) {
9566		closer0.Detach();
9567		closer1.Detach();
9568	}
9569
9570	return status;
9571}
9572
9573
9574status_t
9575_user_access(int fd, const char* userPath, int mode, bool effectiveUserGroup)
9576{
9577	KPath pathBuffer;
9578	if (pathBuffer.InitCheck() != B_OK)
9579		return B_NO_MEMORY;
9580
9581	char* path = pathBuffer.LockBuffer();
9582
9583	if (!IS_USER_ADDRESS(userPath))
9584		return B_BAD_ADDRESS;
9585	status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
9586	if (status != B_OK)
9587		return status;
9588
9589	return common_access(fd, path, mode, effectiveUserGroup, false);
9590}
9591
9592
9593status_t
9594_user_read_stat(int fd, const char* userPath, bool traverseLink,
9595	struct stat* userStat, size_t statSize)
9596{
9597	struct stat stat = {0};
9598	status_t status;
9599
9600	if (statSize > sizeof(struct stat))
9601		return B_BAD_VALUE;
9602
9603	if (!IS_USER_ADDRESS(userStat))
9604		return B_BAD_ADDRESS;
9605
9606	if (userPath != NULL) {
9607		// path given: get the stat of the node referred to by (fd, path)
9608		if (!IS_USER_ADDRESS(userPath))
9609			return B_BAD_ADDRESS;
9610
9611		KPath pathBuffer;
9612		if (pathBuffer.InitCheck() != B_OK)
9613			return B_NO_MEMORY;
9614
9615		char* path = pathBuffer.LockBuffer();
9616
9617		status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
9618		if (status != B_OK)
9619			return status;
9620
9621		status = common_path_read_stat(fd, path, traverseLink, &stat, false);
9622	} else {
9623		// no path given: get the FD and use the FD operation
9624		FileDescriptorPutter descriptor
9625			(get_fd(get_current_io_context(false), fd));
9626		if (!descriptor.IsSet())
9627			return B_FILE_ERROR;
9628
9629		if (descriptor->ops->fd_read_stat)
9630			status = descriptor->ops->fd_read_stat(descriptor.Get(), &stat);
9631		else
9632			status = B_UNSUPPORTED;
9633	}
9634
9635	if (status != B_OK)
9636		return status;
9637
9638	return user_memcpy(userStat, &stat, statSize);
9639}
9640
9641
9642status_t
9643_user_write_stat(int fd, const char* userPath, bool traverseLeafLink,
9644	const struct stat* userStat, size_t statSize, int statMask)
9645{
9646	if (statSize > sizeof(struct stat))
9647		return B_BAD_VALUE;
9648
9649	struct stat stat;
9650
9651	if (!IS_USER_ADDRESS(userStat)
9652		|| user_memcpy(&stat, userStat, statSize) < B_OK)
9653		return B_BAD_ADDRESS;
9654
9655	// clear additional stat fields
9656	if (statSize < sizeof(struct stat))
9657		memset((uint8*)&stat + statSize, 0, sizeof(struct stat) - statSize);
9658
9659	status_t status;
9660
9661	if (userPath != NULL) {
9662		// path given: write the stat of the node referred to by (fd, path)
9663		if (!IS_USER_ADDRESS(userPath))
9664			return B_BAD_ADDRESS;
9665
9666		KPath pathBuffer;
9667		if (pathBuffer.InitCheck() != B_OK)
9668			return B_NO_MEMORY;
9669
9670		char* path = pathBuffer.LockBuffer();
9671
9672		status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
9673		if (status != B_OK)
9674			return status;
9675
9676		status = common_path_write_stat(fd, path, traverseLeafLink, &stat,
9677			statMask, false);
9678	} else {
9679		// no path given: get the FD and use the FD operation
9680		FileDescriptorPutter descriptor
9681			(get_fd(get_current_io_context(false), fd));
9682		if (!descriptor.IsSet())
9683			return B_FILE_ERROR;
9684
9685		if (descriptor->ops->fd_write_stat) {
9686			status = descriptor->ops->fd_write_stat(descriptor.Get(), &stat,
9687				statMask);
9688		} else
9689			status = B_UNSUPPORTED;
9690	}
9691
9692	return status;
9693}
9694
9695
9696int
9697_user_open_attr_dir(int fd, const char* userPath, bool traverseLeafLink)
9698{
9699	KPath pathBuffer;
9700	if (pathBuffer.InitCheck() != B_OK)
9701		return B_NO_MEMORY;
9702
9703	char* path = pathBuffer.LockBuffer();
9704
9705	if (userPath != NULL) {
9706		if (!IS_USER_ADDRESS(userPath))
9707			return B_BAD_ADDRESS;
9708		status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
9709		if (status != B_OK)
9710			return status;
9711	}
9712
9713	return attr_dir_open(fd, userPath ? path : NULL, traverseLeafLink, false);
9714}
9715
9716
9717ssize_t
9718_user_read_attr(int fd, const char* userAttribute, off_t pos, void* userBuffer,
9719	size_t readBytes)
9720{
9721	char attribute[B_FILE_NAME_LENGTH];
9722
9723	if (userAttribute == NULL)
9724		return B_BAD_VALUE;
9725	if (!IS_USER_ADDRESS(userAttribute))
9726		return B_BAD_ADDRESS;
9727	status_t status = user_copy_name(attribute, userAttribute, sizeof(attribute));
9728	if (status != B_OK)
9729		return status;
9730
9731	int attr = attr_open(fd, NULL, attribute, O_RDONLY, false);
9732	if (attr < 0)
9733		return attr;
9734
9735	ssize_t bytes = _user_read(attr, pos, userBuffer, readBytes);
9736	_user_close(attr);
9737
9738	return bytes;
9739}
9740
9741
9742ssize_t
9743_user_write_attr(int fd, const char* userAttribute, uint32 type, off_t pos,
9744	const void* buffer, size_t writeBytes)
9745{
9746	char attribute[B_FILE_NAME_LENGTH];
9747
9748	if (userAttribute == NULL)
9749		return B_BAD_VALUE;
9750	if (!IS_USER_ADDRESS(userAttribute))
9751		return B_BAD_ADDRESS;
9752	status_t status = user_copy_name(attribute, userAttribute, sizeof(attribute));
9753	if (status != B_OK)
9754		return status;
9755
9756	// Try to support the BeOS typical truncation as well as the position
9757	// argument
9758	int attr = attr_create(fd, NULL, attribute, type,
9759		O_CREAT | O_WRONLY | (pos != 0 ? 0 : O_TRUNC), false);
9760	if (attr < 0)
9761		return attr;
9762
9763	ssize_t bytes = _user_write(attr, pos, buffer, writeBytes);
9764	_user_close(attr);
9765
9766	return bytes;
9767}
9768
9769
9770status_t
9771_user_stat_attr(int fd, const char* userAttribute,
9772	struct attr_info* userAttrInfo)
9773{
9774	char attribute[B_FILE_NAME_LENGTH];
9775
9776	if (userAttribute == NULL || userAttrInfo == NULL)
9777		return B_BAD_VALUE;
9778	if (!IS_USER_ADDRESS(userAttribute) || !IS_USER_ADDRESS(userAttrInfo))
9779		return B_BAD_ADDRESS;
9780	status_t status = user_copy_name(attribute, userAttribute,
9781		sizeof(attribute));
9782	if (status != B_OK)
9783		return status;
9784
9785	int attr = attr_open(fd, NULL, attribute, O_RDONLY, false);
9786	if (attr < 0)
9787		return attr;
9788
9789	struct file_descriptor* descriptor
9790		= get_fd(get_current_io_context(false), attr);
9791	if (descriptor == NULL) {
9792		_user_close(attr);
9793		return B_FILE_ERROR;
9794	}
9795
9796	struct stat stat;
9797	if (descriptor->ops->fd_read_stat)
9798		status = descriptor->ops->fd_read_stat(descriptor, &stat);
9799	else
9800		status = B_UNSUPPORTED;
9801
9802	put_fd(descriptor);
9803	_user_close(attr);
9804
9805	if (status == B_OK) {
9806		attr_info info;
9807		info.type = stat.st_type;
9808		info.size = stat.st_size;
9809
9810		if (user_memcpy(userAttrInfo, &info, sizeof(struct attr_info)) != B_OK)
9811			return B_BAD_ADDRESS;
9812	}
9813
9814	return status;
9815}
9816
9817
9818int
9819_user_open_attr(int fd, const char* userPath, const char* userName,
9820	uint32 type, int openMode)
9821{
9822	char name[B_FILE_NAME_LENGTH];
9823
9824	if (!IS_USER_ADDRESS(userName))
9825		return B_BAD_ADDRESS;
9826	status_t status = user_copy_name(name, userName, B_FILE_NAME_LENGTH);
9827	if (status != B_OK)
9828		return status;
9829
9830	KPath pathBuffer;
9831	if (pathBuffer.InitCheck() != B_OK)
9832		return B_NO_MEMORY;
9833
9834	char* path = pathBuffer.LockBuffer();
9835
9836	if (userPath != NULL) {
9837		if (!IS_USER_ADDRESS(userPath))
9838			return B_BAD_ADDRESS;
9839		status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
9840		if (status != B_OK)
9841			return status;
9842	}
9843
9844	if ((openMode & O_CREAT) != 0) {
9845		return attr_create(fd, userPath ? path : NULL, name, type, openMode,
9846			false);
9847	}
9848
9849	return attr_open(fd, userPath ? path : NULL, name, openMode, false);
9850}
9851
9852
9853status_t
9854_user_remove_attr(int fd, const char* userName)
9855{
9856	char name[B_FILE_NAME_LENGTH];
9857
9858	if (!IS_USER_ADDRESS(userName))
9859		return B_BAD_ADDRESS;
9860	status_t status = user_copy_name(name, userName, B_FILE_NAME_LENGTH);
9861	if (status != B_OK)
9862		return status;
9863
9864	return attr_remove(fd, name, false);
9865}
9866
9867
9868status_t
9869_user_rename_attr(int fromFile, const char* userFromName, int toFile,
9870	const char* userToName)
9871{
9872	if (!IS_USER_ADDRESS(userFromName)
9873		|| !IS_USER_ADDRESS(userToName))
9874		return B_BAD_ADDRESS;
9875
9876	KPath fromNameBuffer(B_FILE_NAME_LENGTH);
9877	KPath toNameBuffer(B_FILE_NAME_LENGTH);
9878	if (fromNameBuffer.InitCheck() != B_OK || toNameBuffer.InitCheck() != B_OK)
9879		return B_NO_MEMORY;
9880
9881	char* fromName = fromNameBuffer.LockBuffer();
9882	char* toName = toNameBuffer.LockBuffer();
9883
9884	status_t status = user_copy_name(fromName, userFromName, B_FILE_NAME_LENGTH);
9885	if (status != B_OK)
9886		return status;
9887	status = user_copy_name(toName, userToName, B_FILE_NAME_LENGTH);
9888	if (status != B_OK)
9889		return status;
9890
9891	return attr_rename(fromFile, fromName, toFile, toName, false);
9892}
9893
9894
9895int
9896_user_open_index_dir(dev_t device)
9897{
9898	return index_dir_open(device, false);
9899}
9900
9901
9902status_t
9903_user_create_index(dev_t device, const char* userName, uint32 type,
9904	uint32 flags)
9905{
9906	char name[B_FILE_NAME_LENGTH];
9907
9908	if (!IS_USER_ADDRESS(userName))
9909		return B_BAD_ADDRESS;
9910	status_t status = user_copy_name(name, userName, B_FILE_NAME_LENGTH);
9911	if (status != B_OK)
9912		return status;
9913
9914	return index_create(device, name, type, flags, false);
9915}
9916
9917
9918status_t
9919_user_read_index_stat(dev_t device, const char* userName, struct stat* userStat)
9920{
9921	char name[B_FILE_NAME_LENGTH];
9922	struct stat stat = {0};
9923	status_t status;
9924
9925	if (!IS_USER_ADDRESS(userName) || !IS_USER_ADDRESS(userStat))
9926		return B_BAD_ADDRESS;
9927	status = user_copy_name(name, userName, B_FILE_NAME_LENGTH);
9928	if (status != B_OK)
9929		return status;
9930
9931	status = index_name_read_stat(device, name, &stat, false);
9932	if (status == B_OK) {
9933		if (user_memcpy(userStat, &stat, sizeof(stat)) != B_OK)
9934			return B_BAD_ADDRESS;
9935	}
9936
9937	return status;
9938}
9939
9940
9941status_t
9942_user_remove_index(dev_t device, const char* userName)
9943{
9944	char name[B_FILE_NAME_LENGTH];
9945
9946	if (!IS_USER_ADDRESS(userName))
9947		return B_BAD_ADDRESS;
9948	status_t status = user_copy_name(name, userName, B_FILE_NAME_LENGTH);
9949	if (status != B_OK)
9950		return status;
9951
9952	return index_remove(device, name, false);
9953}
9954
9955
9956status_t
9957_user_getcwd(char* userBuffer, size_t size)
9958{
9959	if (size == 0)
9960		return B_BAD_VALUE;
9961	if (!IS_USER_ADDRESS(userBuffer))
9962		return B_BAD_ADDRESS;
9963
9964	if (size > kMaxPathLength)
9965		size = kMaxPathLength;
9966
9967	KPath pathBuffer(size);
9968	if (pathBuffer.InitCheck() != B_OK)
9969		return B_NO_MEMORY;
9970
9971	TRACE(("user_getcwd: buf %p, %ld\n", userBuffer, size));
9972
9973	char* path = pathBuffer.LockBuffer();
9974
9975	status_t status = get_cwd(path, size, false);
9976	if (status != B_OK)
9977		return status;
9978
9979	// Copy back the result
9980	if (user_strlcpy(userBuffer, path, size) < B_OK)
9981		return B_BAD_ADDRESS;
9982
9983	return status;
9984}
9985
9986
9987status_t
9988_user_setcwd(int fd, const char* userPath)
9989{
9990	TRACE(("user_setcwd: path = %p\n", userPath));
9991
9992	KPath pathBuffer;
9993	if (pathBuffer.InitCheck() != B_OK)
9994		return B_NO_MEMORY;
9995
9996	char* path = pathBuffer.LockBuffer();
9997
9998	if (userPath != NULL) {
9999		if (!IS_USER_ADDRESS(userPath))
10000			return B_BAD_ADDRESS;
10001		status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
10002		if (status != B_OK)
10003			return status;
10004	}
10005
10006	return set_cwd(fd, userPath != NULL ? path : NULL, false);
10007}
10008
10009
10010status_t
10011_user_change_root(const char* userPath)
10012{
10013	// only root is allowed to chroot()
10014	if (geteuid() != 0)
10015		return B_NOT_ALLOWED;
10016
10017	// alloc path buffer
10018	KPath pathBuffer;
10019	if (pathBuffer.InitCheck() != B_OK)
10020		return B_NO_MEMORY;
10021
10022	// copy userland path to kernel
10023	char* path = pathBuffer.LockBuffer();
10024	if (userPath != NULL) {
10025		if (!IS_USER_ADDRESS(userPath))
10026			return B_BAD_ADDRESS;
10027		status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
10028		if (status != B_OK)
10029			return status;
10030	}
10031
10032	// get the vnode
10033	VnodePutter vnode;
10034	status_t status = path_to_vnode(path, true, vnode, NULL, false);
10035	if (status != B_OK)
10036		return status;
10037
10038	// set the new root
10039	struct io_context* context = get_current_io_context(false);
10040	mutex_lock(&sIOContextRootLock);
10041	struct vnode* oldRoot = context->root;
10042	context->root = vnode.Detach();
10043	mutex_unlock(&sIOContextRootLock);
10044
10045	put_vnode(oldRoot);
10046
10047	return B_OK;
10048}
10049
10050
10051int
10052_user_open_query(dev_t device, const char* userQuery, size_t queryLength,
10053	uint32 flags, port_id port, int32 token)
10054{
10055	if (device < 0 || userQuery == NULL || queryLength == 0)
10056		return B_BAD_VALUE;
10057
10058	if (!IS_USER_ADDRESS(userQuery))
10059		return B_BAD_ADDRESS;
10060
10061	// this is a safety restriction
10062	if (queryLength >= 65536)
10063		return B_NAME_TOO_LONG;
10064
10065	BStackOrHeapArray<char, 128> query(queryLength + 1);
10066	if (!query.IsValid())
10067		return B_NO_MEMORY;
10068
10069	if (user_strlcpy(query, userQuery, queryLength + 1) < B_OK)
10070		return B_BAD_ADDRESS;
10071
10072	return query_open(device, query, flags, port, token, false);
10073}
10074
10075
10076#include "vfs_request_io.cpp"
10077