/*
 * Copyright 2005-2011, Ingo Weinhold, ingo_weinhold@gmx.de.
 * Copyright 2002-2011, Axel Dörfler, axeld@pinc-software.de.
 * Distributed under the terms of the MIT License.
 *
 * Copyright 2001-2002, Travis Geiselbrecht. All rights reserved.
 * Distributed under the terms of the NewOS License.
 */


/*! Virtual File System and File System Interface Layer */


#include <ctype.h>
#include <fcntl.h>
#include <limits.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>
#include <sys/file.h>
#include <sys/resource.h>
#include <sys/stat.h>
#include <unistd.h>

#include <fs_attr.h>
#include <fs_info.h>
#include <fs_interface.h>
#include <fs_volume.h>
#include <OS.h>
#include <StorageDefs.h>

#include <AutoDeleter.h>
#include <block_cache.h>
#include <boot/kernel_args.h>
#include <debug_heap.h>
#include <disk_device_manager/KDiskDevice.h>
#include <disk_device_manager/KDiskDeviceManager.h>
#include <disk_device_manager/KDiskDeviceUtils.h>
#include <disk_device_manager/KDiskSystem.h>
#include <fd.h>
#include <file_cache.h>
#include <fs/node_monitor.h>
#include <khash.h>
#include <KPath.h>
#include <lock.h>
#include <low_resource_manager.h>
#include <syscalls.h>
#include <syscall_restart.h>
#include <tracing.h>
#include <util/atomic.h>
#include <util/AutoLock.h>
#include <util/DoublyLinkedList.h>
#include <vfs.h>
#include <vm/vm.h>
#include <vm/VMCache.h>

#include "EntryCache.h"
#include "fifo.h"
#include "IORequest.h"
#include "unused_vnodes.h"
#include "vfs_tracing.h"
#include "Vnode.h"
#include "../cache/vnode_store.h"


//#define TRACE_VFS
#ifdef TRACE_VFS
#	define TRACE(x) dprintf x
#	define FUNCTION(x) dprintf x
#else
#	define TRACE(x) ;
#	define FUNCTION(x) ;
#endif

#define ADD_DEBUGGER_COMMANDS


#define HAS_FS_CALL(vnode, op)			(vnode->ops->op != NULL)
#define HAS_FS_MOUNT_CALL(mount, op)	(mount->volume->ops->op != NULL)

#if KDEBUG
#	define FS_CALL(vnode, op, params...) \
		( HAS_FS_CALL(vnode, op) ? \
			vnode->ops->op(vnode->mount->volume, vnode, params) \
			: (panic("FS_CALL op " #op " is NULL"), 0))
#	define FS_CALL_NO_PARAMS(vnode, op) \
		( HAS_FS_CALL(vnode, op) ? \
			vnode->ops->op(vnode->mount->volume, vnode) \
			: (panic("FS_CALL_NO_PARAMS op " #op " is NULL"), 0))
#	define FS_MOUNT_CALL(mount, op, params...) \
		( HAS_FS_MOUNT_CALL(mount, op) ? \
			mount->volume->ops->op(mount->volume, params) \
			: (panic("FS_MOUNT_CALL op " #op " is NULL"), 0))
#	define FS_MOUNT_CALL_NO_PARAMS(mount, op) \
		( HAS_FS_MOUNT_CALL(mount, op) ? \
			mount->volume->ops->op(mount->volume) \
			: (panic("FS_MOUNT_CALL_NO_PARAMS op " #op " is NULL"), 0))
#else
#	define FS_CALL(vnode, op, params...) \
			vnode->ops->op(vnode->mount->volume, vnode, params)
#	define FS_CALL_NO_PARAMS(vnode, op) \
			vnode->ops->op(vnode->mount->volume, vnode)
#	define FS_MOUNT_CALL(mount, op, params...) \
			mount->volume->ops->op(mount->volume, params)
#	define FS_MOUNT_CALL_NO_PARAMS(mount, op) \
			mount->volume->ops->op(mount->volume)
#endif
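

// Illustrative note (not part of the build): a call such as
//	status_t status = FS_CALL(vnode, read_stat, &stat);
// resolves to the file system hook invocation
//	vnode->ops->read_stat(vnode->mount->volume, vnode, &stat);
// On KDEBUG builds a NULL hook triggers a panic() instead of being
// dereferenced.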


const static size_t kMaxPathLength = 65536;
	// The absolute maximum path length (for getcwd() -- this is not dependent
	// on PATH_MAX).


struct vnode_hash_key {
	dev_t	device;
	ino_t	vnode;
};

typedef DoublyLinkedList<vnode> VnodeList;

/*!	\brief Structure to manage a mounted file system

	Note: The root_vnode and root_vnode->covers fields (what others?) are
	initialized in fs_mount() and not changed afterwards. That is, as soon
	as the mount is mounted and it is made sure it won't be unmounted
	(e.g. by holding a reference to a vnode of that mount), (read) access
	to those fields is always safe, even without additional locking. Moreover,
	while mounted the mount holds a reference to the root_vnode->covers vnode,
	thus making the access path vnode->mount->root_vnode->covers->mount->...
	safe if a reference to vnode is held (note that for the root mount
	root_vnode->covers is NULL, though).
*/
struct fs_mount {
	fs_mount()
		:
		volume(NULL),
		device_name(NULL)
	{
		recursive_lock_init(&rlock, "mount rlock");
	}

	~fs_mount()
	{
		recursive_lock_destroy(&rlock);
		free(device_name);

		while (volume) {
			fs_volume* superVolume = volume->super_volume;

			if (volume->file_system != NULL)
				put_module(volume->file_system->info.name);

			free(volume->file_system_name);
			free(volume);
			volume = superVolume;
		}
	}

	struct fs_mount* next;
	dev_t			id;
	fs_volume*		volume;
	char*			device_name;
	recursive_lock	rlock;	// guards the vnodes list
		// TODO: Make this a mutex! It is never used recursively.
	struct vnode*	root_vnode;
	struct vnode*	covers_vnode;	// immutable
	KPartition*		partition;
	VnodeList		vnodes;
	EntryCache		entry_cache;
	bool			unmounting;
	bool			owns_file_device;
};

struct advisory_lock : public DoublyLinkedListLinkImpl<advisory_lock> {
	list_link		link;
	team_id			team;
	pid_t			session;
	off_t			start;
	off_t			end;
	bool			shared;
};

typedef DoublyLinkedList<advisory_lock> LockList;

struct advisory_locking {
	sem_id			lock;
	sem_id			wait_sem;
	LockList		locks;

	advisory_locking()
		:
		lock(-1),
		wait_sem(-1)
	{
	}

	~advisory_locking()
	{
		if (lock >= 0)
			delete_sem(lock);
		if (wait_sem >= 0)
			delete_sem(wait_sem);
	}
};

/*!	\brief Guards sMountsTable.

	The holder is allowed read/write access to sMountsTable.
	Manipulation of the fs_mount structures themselves
	(and their destruction) requires different locks though.
*/
static mutex sMountMutex = MUTEX_INITIALIZER("vfs_mount_lock");

/*!	\brief Guards mount/unmount operations.

	fs_mount() and fs_unmount() hold the lock during their whole operation.
	That is, locking the lock ensures that no FS is mounted/unmounted. In
	particular this means that
	- sMountsTable will not be modified,
	- the fields immutable after initialization of the fs_mount structures in
	  sMountsTable will not be modified.

	The thread trying to lock the lock must not hold sVnodeLock or
	sMountMutex.
*/
static recursive_lock sMountOpLock;

/*!	\brief Guards sVnodeTable.

	The holder is allowed read/write access to sVnodeTable and to
	any unbusy vnode in that table, save for the immutable fields (device, id,
	private_node, mount) to which only read-only access is allowed.
	The mutable fields advisory_locking, mandatory_locked_by, and ref_count, as
	well as the busy, removed, unused flags, and the vnode's type can also be
	write accessed when holding a read lock to sVnodeLock *and* having the vnode
	locked. Write access to covered_by and covers requires write locking
	sVnodeLock.

	The thread trying to acquire the lock must not hold sMountMutex.
	You must not hold this lock when calling create_sem(), as this might call
	vfs_free_unused_vnodes() and thus cause a deadlock.
*/
static rw_lock sVnodeLock = RW_LOCK_INITIALIZER("vfs_vnode_lock");

/*!	\brief Guards io_context::root.

	Must be held when setting or getting the io_context::root field.
	The only operation allowed while holding this lock besides getting or
	setting the field is inc_vnode_ref_count() on io_context::root.
*/
static mutex sIOContextRootLock = MUTEX_INITIALIZER("io_context::root lock");


#define VNODE_HASH_TABLE_SIZE 1024
static hash_table* sVnodeTable;
static struct vnode* sRoot;

#define MOUNTS_HASH_TABLE_SIZE 16
static hash_table* sMountsTable;
static dev_t sNextMountID = 1;

#define MAX_TEMP_IO_VECS 8

mode_t __gUmask = 022;

/* function declarations */

static void free_unused_vnodes();

// file descriptor operation prototypes
static status_t file_read(struct file_descriptor* descriptor, off_t pos,
	void* buffer, size_t* _bytes);
static status_t file_write(struct file_descriptor* descriptor, off_t pos,
	const void* buffer, size_t* _bytes);
static off_t file_seek(struct file_descriptor* descriptor, off_t pos,
	int seekType);
static void file_free_fd(struct file_descriptor* descriptor);
static status_t file_close(struct file_descriptor* descriptor);
static status_t file_select(struct file_descriptor* descriptor, uint8 event,
	struct selectsync* sync);
static status_t file_deselect(struct file_descriptor* descriptor, uint8 event,
	struct selectsync* sync);
static status_t dir_read(struct io_context* context,
	struct file_descriptor* descriptor, struct dirent* buffer,
	size_t bufferSize, uint32* _count);
static status_t dir_read(struct io_context* ioContext, struct vnode* vnode,
	void* cookie, struct dirent* buffer, size_t bufferSize, uint32* _count);
static status_t dir_rewind(struct file_descriptor* descriptor);
static void dir_free_fd(struct file_descriptor* descriptor);
static status_t dir_close(struct file_descriptor* descriptor);
static status_t attr_dir_read(struct io_context* context,
	struct file_descriptor* descriptor, struct dirent* buffer,
	size_t bufferSize, uint32* _count);
static status_t attr_dir_rewind(struct file_descriptor* descriptor);
static void attr_dir_free_fd(struct file_descriptor* descriptor);
static status_t attr_dir_close(struct file_descriptor* descriptor);
static status_t attr_read(struct file_descriptor* descriptor, off_t pos,
	void* buffer, size_t* _bytes);
static status_t attr_write(struct file_descriptor* descriptor, off_t pos,
	const void* buffer, size_t* _bytes);
static off_t attr_seek(struct file_descriptor* descriptor, off_t pos,
	int seekType);
static void attr_free_fd(struct file_descriptor* descriptor);
static status_t attr_close(struct file_descriptor* descriptor);
static status_t attr_read_stat(struct file_descriptor* descriptor,
	struct stat* statData);
static status_t attr_write_stat(struct file_descriptor* descriptor,
	const struct stat* stat, int statMask);
static status_t index_dir_read(struct io_context* context,
	struct file_descriptor* descriptor, struct dirent* buffer,
	size_t bufferSize, uint32* _count);
static status_t index_dir_rewind(struct file_descriptor* descriptor);
static void index_dir_free_fd(struct file_descriptor* descriptor);
static status_t index_dir_close(struct file_descriptor* descriptor);
static status_t query_read(struct io_context* context,
	struct file_descriptor* descriptor, struct dirent* buffer,
	size_t bufferSize, uint32* _count);
static status_t query_rewind(struct file_descriptor* descriptor);
static void query_free_fd(struct file_descriptor* descriptor);
static status_t query_close(struct file_descriptor* descriptor);

static status_t common_ioctl(struct file_descriptor* descriptor, ulong op,
	void* buffer, size_t length);
static status_t common_read_stat(struct file_descriptor* descriptor,
	struct stat* statData);
static status_t common_write_stat(struct file_descriptor* descriptor,
	const struct stat* statData, int statMask);
static status_t common_path_read_stat(int fd, char* path, bool traverseLeafLink,
	struct stat* stat, bool kernel);

static status_t vnode_path_to_vnode(struct vnode* vnode, char* path,
	bool traverseLeafLink, int count, bool kernel,
	struct vnode** _vnode, ino_t* _parentID);
static status_t dir_vnode_to_path(struct vnode* vnode, char* buffer,
	size_t bufferSize, bool kernel);
static status_t fd_and_path_to_vnode(int fd, char* path, bool traverseLeafLink,
	struct vnode** _vnode, ino_t* _parentID, bool kernel);
static void inc_vnode_ref_count(struct vnode* vnode);
static status_t dec_vnode_ref_count(struct vnode* vnode, bool alwaysFree,
	bool reenter);
static inline void put_vnode(struct vnode* vnode);
static status_t fs_unmount(char* path, dev_t mountID, uint32 flags,
	bool kernel);
static int open_vnode(struct vnode* vnode, int openMode, bool kernel);


static struct fd_ops sFileOps = {
	file_read,
	file_write,
	file_seek,
	common_ioctl,
	NULL,		// set_flags
	file_select,
	file_deselect,
	NULL,		// read_dir()
	NULL,		// rewind_dir()
	common_read_stat,
	common_write_stat,
	file_close,
	file_free_fd
};

static struct fd_ops sDirectoryOps = {
	NULL,		// read()
	NULL,		// write()
	NULL,		// seek()
	common_ioctl,
	NULL,		// set_flags
	NULL,		// select()
	NULL,		// deselect()
	dir_read,
	dir_rewind,
	common_read_stat,
	common_write_stat,
	dir_close,
	dir_free_fd
};

static struct fd_ops sAttributeDirectoryOps = {
	NULL,		// read()
	NULL,		// write()
	NULL,		// seek()
	common_ioctl,
	NULL,		// set_flags
	NULL,		// select()
	NULL,		// deselect()
	attr_dir_read,
	attr_dir_rewind,
	common_read_stat,
	common_write_stat,
	attr_dir_close,
	attr_dir_free_fd
};

static struct fd_ops sAttributeOps = {
	attr_read,
	attr_write,
	attr_seek,
	common_ioctl,
	NULL,		// set_flags
	NULL,		// select()
	NULL,		// deselect()
	NULL,		// read_dir()
	NULL,		// rewind_dir()
	attr_read_stat,
	attr_write_stat,
	attr_close,
	attr_free_fd
};

static struct fd_ops sIndexDirectoryOps = {
	NULL,		// read()
	NULL,		// write()
	NULL,		// seek()
	NULL,		// ioctl()
	NULL,		// set_flags
	NULL,		// select()
	NULL,		// deselect()
	index_dir_read,
	index_dir_rewind,
	NULL,		// read_stat()
	NULL,		// write_stat()
	index_dir_close,
	index_dir_free_fd
};

#if 0
static struct fd_ops sIndexOps = {
	NULL,		// read()
	NULL,		// write()
	NULL,		// seek()
	NULL,		// ioctl()
	NULL,		// set_flags
	NULL,		// select()
	NULL,		// deselect()
	NULL,		// dir_read()
	NULL,		// dir_rewind()
	index_read_stat,	// read_stat()
	NULL,		// write_stat()
	NULL,		// dir_close()
	NULL		// free_fd()
};
#endif

static struct fd_ops sQueryOps = {
	NULL,		// read()
	NULL,		// write()
	NULL,		// seek()
	NULL,		// ioctl()
	NULL,		// set_flags
	NULL,		// select()
	NULL,		// deselect()
	query_read,
	query_rewind,
	NULL,		// read_stat()
	NULL,		// write_stat()
	query_close,
	query_free_fd
};


// VNodePutter
class VNodePutter {
public:
	VNodePutter(struct vnode* vnode = NULL) : fVNode(vnode) {}

	~VNodePutter()
	{
		Put();
	}

	void SetTo(struct vnode* vnode)
	{
		Put();
		fVNode = vnode;
	}

	void Put()
	{
		if (fVNode) {
			put_vnode(fVNode);
			fVNode = NULL;
		}
	}

	struct vnode* Detach()
	{
		struct vnode* vnode = fVNode;
		fVNode = NULL;
		return vnode;
	}

private:
	struct vnode* fVNode;
};
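

// Typical use of VNodePutter (illustrative sketch): adopt a reference
// returned by get_vnode() so that it is released on every exit path:
//
//	struct vnode* vnode;
//	status_t status = get_vnode(mountID, vnodeID, &vnode, true, false);
//	if (status != B_OK)
//		return status;
//	VNodePutter vnodePutter(vnode);
//	// ... early returns are safe now; call vnodePutter.Detach() to hand
//	// the reference on to the caller instead.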


class FDCloser {
public:
	FDCloser() : fFD(-1), fKernel(true) {}

	FDCloser(int fd, bool kernel) : fFD(fd), fKernel(kernel) {}

	~FDCloser()
	{
		Close();
	}

	void SetTo(int fd, bool kernel)
	{
		Close();
		fFD = fd;
		fKernel = kernel;
	}

	void Close()
	{
		if (fFD >= 0) {
			if (fKernel)
				_kern_close(fFD);
			else
				_user_close(fFD);
			fFD = -1;
		}
	}

	int Detach()
	{
		int fd = fFD;
		fFD = -1;
		return fd;
	}

private:
	int		fFD;
	bool	fKernel;
};
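

// Illustrative FDCloser sketch: keep a freshly acquired descriptor from
// leaking on error paths (the open call here is hypothetical):
//
//	int fd = open_some_fd(...);	// hypothetical helper returning an FD
//	if (fd < 0)
//		return fd;
//	FDCloser fdCloser(fd, kernel);
//	// ... operations that may fail and return early ...
//	return fdCloser.Detach();	// success: the FD stays open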


#if VFS_PAGES_IO_TRACING

namespace VFSPagesIOTracing {

class PagesIOTraceEntry : public AbstractTraceEntry {
protected:
	PagesIOTraceEntry(struct vnode* vnode, void* cookie, off_t pos,
		const generic_io_vec* vecs, uint32 count, uint32 flags,
		generic_size_t bytesRequested, status_t status,
		generic_size_t bytesTransferred)
		:
		fVnode(vnode),
		fMountID(vnode->mount->id),
		fNodeID(vnode->id),
		fCookie(cookie),
		fPos(pos),
		fCount(count),
		fFlags(flags),
		fBytesRequested(bytesRequested),
		fStatus(status),
		fBytesTransferred(bytesTransferred)
	{
		fVecs = (generic_io_vec*)alloc_tracing_buffer_memcpy(vecs,
			sizeof(generic_io_vec) * count, false);
	}

	void AddDump(TraceOutput& out, const char* mode)
	{
		out.Print("vfs pages io %5s: vnode: %p (%ld, %lld), cookie: %p, "
			"pos: %lld, size: %llu, vecs: {", mode, fVnode, fMountID, fNodeID,
			fCookie, fPos, (uint64)fBytesRequested);

		if (fVecs != NULL) {
			for (uint32 i = 0; i < fCount; i++) {
				if (i > 0)
					out.Print(", ");
				out.Print("(%llx, %llu)", (uint64)fVecs[i].base,
					(uint64)fVecs[i].length);
			}
		}

		out.Print("}, flags: %#lx -> status: %#lx, transferred: %llu",
			fFlags, fStatus, (uint64)fBytesTransferred);
	}

protected:
	struct vnode*	fVnode;
	dev_t			fMountID;
	ino_t			fNodeID;
	void*			fCookie;
	off_t			fPos;
	generic_io_vec*	fVecs;
	uint32			fCount;
	uint32			fFlags;
	generic_size_t	fBytesRequested;
	status_t		fStatus;
	generic_size_t	fBytesTransferred;
};


class ReadPages : public PagesIOTraceEntry {
public:
	ReadPages(struct vnode* vnode, void* cookie, off_t pos,
		const generic_io_vec* vecs, uint32 count, uint32 flags,
		generic_size_t bytesRequested, status_t status,
		generic_size_t bytesTransferred)
		:
		PagesIOTraceEntry(vnode, cookie, pos, vecs, count, flags,
			bytesRequested, status, bytesTransferred)
	{
		Initialized();
	}

	virtual void AddDump(TraceOutput& out)
	{
		PagesIOTraceEntry::AddDump(out, "read");
	}
};


class WritePages : public PagesIOTraceEntry {
public:
	WritePages(struct vnode* vnode, void* cookie, off_t pos,
		const generic_io_vec* vecs, uint32 count, uint32 flags,
		generic_size_t bytesRequested, status_t status,
		generic_size_t bytesTransferred)
		:
		PagesIOTraceEntry(vnode, cookie, pos, vecs, count, flags,
			bytesRequested, status, bytesTransferred)
	{
		Initialized();
	}

	virtual void AddDump(TraceOutput& out)
	{
		PagesIOTraceEntry::AddDump(out, "write");
	}
};

}	// namespace VFSPagesIOTracing

#	define TPIO(x) new(std::nothrow) VFSPagesIOTracing::x;
#else
#	define TPIO(x) ;
#endif	// VFS_PAGES_IO_TRACING


static int
mount_compare(void* _m, const void* _key)
{
	struct fs_mount* mount = (fs_mount*)_m;
	const dev_t* id = (dev_t*)_key;

	if (mount->id == *id)
		return 0;

	return -1;
}


static uint32
mount_hash(void* _m, const void* _key, uint32 range)
{
	struct fs_mount* mount = (fs_mount*)_m;
	const dev_t* id = (dev_t*)_key;

	if (mount)
		return mount->id % range;

	return (uint32)*id % range;
}


/*! Finds the mounted device (the fs_mount structure) with the given ID.
	Note, you must hold the sMountMutex lock when you call this function.
*/
static struct fs_mount*
find_mount(dev_t id)
{
	ASSERT_LOCKED_MUTEX(&sMountMutex);

	return (fs_mount*)hash_lookup(sMountsTable, (void*)&id);
}


static status_t
get_mount(dev_t id, struct fs_mount** _mount)
{
	struct fs_mount* mount;

	ReadLocker nodeLocker(sVnodeLock);
	MutexLocker mountLocker(sMountMutex);

	mount = find_mount(id);
	if (mount == NULL)
		return B_BAD_VALUE;

	struct vnode* rootNode = mount->root_vnode;
	if (rootNode == NULL || rootNode->IsBusy() || rootNode->ref_count == 0) {
		// might have been called during a mount/unmount operation
		return B_BUSY;
	}

	inc_vnode_ref_count(mount->root_vnode);
	*_mount = mount;
	return B_OK;
}


static void
put_mount(struct fs_mount* mount)
{
	if (mount)
		put_vnode(mount->root_vnode);
}


/*!	Tries to open the specified file system module.
	Accepts a file system name of the form "bfs" or "file_systems/bfs/v1".
	Returns a pointer to the file system module interface, or NULL if it
	could not open the module.
*/
static file_system_module_info*
get_file_system(const char* fsName)
{
	char name[B_FILE_NAME_LENGTH];
	if (strncmp(fsName, "file_systems/", strlen("file_systems/"))) {
		// construct the module name if we didn't get one
		// (we currently support only one API)
		snprintf(name, sizeof(name), "file_systems/%s/v1", fsName);
		fsName = NULL;
	}

	file_system_module_info* info;
	if (get_module(fsName ? fsName : name, (module_info**)&info) != B_OK)
		return NULL;

	return info;
}
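

// For example: get_file_system("bfs") constructs the module name
// "file_systems/bfs/v1" and loads that module, while a full name like
// get_file_system("file_systems/bfs/v1") is passed through unchanged.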


/*!	Accepts a file system name of the form "bfs" or "file_systems/bfs/v1"
	and returns a compatible fs_info.fsh_name name ("bfs" in both cases).
	The name is allocated for you, and you have to free() it when you're
	done with it.
	Returns NULL if the required memory is not available.
*/
static char*
get_file_system_name(const char* fsName)
{
	const size_t length = strlen("file_systems/");

	if (strncmp(fsName, "file_systems/", length)) {
		// the name already seems to be the module's file name
		return strdup(fsName);
	}

	fsName += length;
	const char* end = strchr(fsName, '/');
	if (end == NULL) {
		// this doesn't seem to be a valid name, but well...
		return strdup(fsName);
	}

	// cut off the trailing /v1

	char* name = (char*)malloc(end + 1 - fsName);
	if (name == NULL)
		return NULL;

	strlcpy(name, fsName, end + 1 - fsName);
	return name;
}
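

// For example: get_file_system_name("file_systems/bfs/v1") returns a newly
// allocated "bfs", while get_file_system_name("bfs") simply returns a copy
// of "bfs".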


/*!	Accepts a list of file system names separated by a colon, one for each
	layer, and returns the file system name for the specified layer.
	The name is allocated for you, and you have to free() it when you're
	done with it.
	Returns NULL if the required memory is not available or if there is no
	name for the specified layer.
*/
static char*
get_file_system_name_for_layer(const char* fsNames, int32 layer)
{
	while (layer >= 0) {
		const char* end = strchr(fsNames, ':');
		if (end == NULL) {
			if (layer == 0)
				return strdup(fsNames);
			return NULL;
		}

		if (layer == 0) {
			size_t length = end - fsNames + 1;
			char* result = (char*)malloc(length);
			if (result == NULL)
				return NULL;
			strlcpy(result, fsNames, length);
			return result;
		}

		fsNames = end + 1;
		layer--;
	}

	return NULL;
}
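

// For example: given fsNames = "first:second",
// get_file_system_name_for_layer(fsNames, 0) returns "first",
// layer 1 returns "second", and layer 2 returns NULL.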


static int
vnode_compare(void* _vnode, const void* _key)
{
	struct vnode* vnode = (struct vnode*)_vnode;
	const struct vnode_hash_key* key = (vnode_hash_key*)_key;

	if (vnode->device == key->device && vnode->id == key->vnode)
		return 0;

	return -1;
}


static uint32
vnode_hash(void* _vnode, const void* _key, uint32 range)
{
	struct vnode* vnode = (struct vnode*)_vnode;
	const struct vnode_hash_key* key = (vnode_hash_key*)_key;

#define VHASH(mountid, vnodeid) \
	(((uint32)((vnodeid) >> 32) + (uint32)(vnodeid)) ^ (uint32)(mountid))

	if (vnode != NULL)
		return VHASH(vnode->device, vnode->id) % range;

	return VHASH(key->device, key->vnode) % range;

#undef VHASH
}
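

// Worked example of the hash: both 32-bit halves of the vnode ID are summed
// and XOR'ed with the mount ID. For device 3 and vnode ID 0x100000002:
//	((uint32)(0x100000002LL >> 32) + (uint32)0x100000002LL) ^ (uint32)3
//		== (1 + 2) ^ 3 == 0
// which is then taken modulo the table size.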


static void
add_vnode_to_mount_list(struct vnode* vnode, struct fs_mount* mount)
{
	RecursiveLocker _(mount->rlock);
	mount->vnodes.Add(vnode);
}


static void
remove_vnode_from_mount_list(struct vnode* vnode, struct fs_mount* mount)
{
	RecursiveLocker _(mount->rlock);
	mount->vnodes.Remove(vnode);
}


/*!	\brief Looks up a vnode by mount and node ID in the sVnodeTable.

	The caller must hold the sVnodeLock (read lock at least).

	\param mountID the mount ID.
	\param vnodeID the node ID.

	\return The vnode structure, if it was found in the hash table, \c NULL
			otherwise.
*/
static struct vnode*
lookup_vnode(dev_t mountID, ino_t vnodeID)
{
	struct vnode_hash_key key;

	key.device = mountID;
	key.vnode = vnodeID;

	return (vnode*)hash_lookup(sVnodeTable, &key);
}


/*!	Creates a new vnode with the given mount and node ID.
	If the node already exists, it is returned instead and no new node is
	created. In either case -- but not if an error occurs -- the function write
	locks \c sVnodeLock and keeps it locked for the caller when returning. On
	error the lock is not held on return.

	\param mountID The mount ID.
	\param vnodeID The vnode ID.
	\param _vnode Will be set to the new vnode on success.
	\param _nodeCreated Will be set to \c true when the returned vnode has
		been newly created, \c false when it already existed. Will not be
		changed on error.
	\return \c B_OK, when the vnode was successfully created and inserted or
		a node with the given ID was found, \c B_NO_MEMORY or
		\c B_ENTRY_NOT_FOUND on error.
*/
static status_t
create_new_vnode_and_lock(dev_t mountID, ino_t vnodeID, struct vnode*& _vnode,
	bool& _nodeCreated)
{
	FUNCTION(("create_new_vnode_and_lock()\n"));

	struct vnode* vnode = (struct vnode*)malloc(sizeof(struct vnode));
	if (vnode == NULL)
		return B_NO_MEMORY;

	// initialize basic values
	memset(vnode, 0, sizeof(struct vnode));
	vnode->device = mountID;
	vnode->id = vnodeID;
	vnode->ref_count = 1;
	vnode->SetBusy(true);

	// look up the node -- it might have been added by someone else in the
	// meantime
	rw_lock_write_lock(&sVnodeLock);
	struct vnode* existingVnode = lookup_vnode(mountID, vnodeID);
	if (existingVnode != NULL) {
		free(vnode);
		_vnode = existingVnode;
		_nodeCreated = false;
		return B_OK;
	}

	// get the mount structure
	mutex_lock(&sMountMutex);
	vnode->mount = find_mount(mountID);
	if (!vnode->mount || vnode->mount->unmounting) {
		mutex_unlock(&sMountMutex);
		rw_lock_write_unlock(&sVnodeLock);
		free(vnode);
		return B_ENTRY_NOT_FOUND;
	}

	// add the vnode to the mount's node list and the hash table
	hash_insert(sVnodeTable, vnode);
	add_vnode_to_mount_list(vnode, vnode->mount);

	mutex_unlock(&sMountMutex);

	_vnode = vnode;
	_nodeCreated = true;

	// keep the vnode lock locked
	return B_OK;
}


/*!	Frees the vnode and all resources it has acquired, and removes
	it from the vnode hash as well as from its mount structure.
	Will also make sure that any cache modifications are written back.
*/
static void
free_vnode(struct vnode* vnode, bool reenter)
{
	ASSERT_PRINT(vnode->ref_count == 0 && vnode->IsBusy(), "vnode: %p\n",
		vnode);

	// write back any changes in this vnode's cache -- but only
	// if the vnode won't be deleted, in which case the changes
	// will be discarded

	if (!vnode->IsRemoved() && HAS_FS_CALL(vnode, fsync))
		FS_CALL_NO_PARAMS(vnode, fsync);

	// Note: If this vnode has a cache attached, there will still be two
	// references to that cache at this point. The last one belongs to the vnode
	// itself (cf. vfs_get_vnode_cache()) and one belongs to the node's file
	// cache. Each but the last reference to a cache also includes a reference
	// to the vnode. The file cache, however, released its reference (cf.
	// file_cache_create()), so that this vnode's ref count has the chance to
	// ever drop to 0. Deleting the file cache now will cause the next to last
	// cache reference to be released, which will also release a (no longer
	// existing) vnode reference. To avoid problems, we set the vnode's ref
	// count, so that it will neither become negative nor 0.
	vnode->ref_count = 2;

	if (!vnode->IsUnpublished()) {
		if (vnode->IsRemoved())
			FS_CALL(vnode, remove_vnode, reenter);
		else
			FS_CALL(vnode, put_vnode, reenter);
	}

	// If the vnode has a VMCache attached, make sure that it won't try to get
	// another reference via VMVnodeCache::AcquireUnreferencedStoreRef(). As
	// long as the vnode is busy and in the hash, that won't happen, but as
	// soon as we've removed it from the hash, it could reload the vnode -- with
	// a new cache attached!
	if (vnode->cache != NULL)
		((VMVnodeCache*)vnode->cache)->VnodeDeleted();

	// The file system has removed the resources of the vnode now, so we can
	// make it available again (by removing the busy vnode from the hash).
	rw_lock_write_lock(&sVnodeLock);
	hash_remove(sVnodeTable, vnode);
	rw_lock_write_unlock(&sVnodeLock);

	// if we have a VMCache attached, remove it
	if (vnode->cache)
		vnode->cache->ReleaseRef();

	vnode->cache = NULL;

	remove_vnode_from_mount_list(vnode, vnode->mount);

	free(vnode);
}


/*!	\brief Decrements the reference counter of the given vnode and deletes it,
	if the counter dropped to 0.

	The caller must, of course, own a reference to the vnode to call this
	function.
	The caller must not hold the sVnodeLock or the sMountMutex.

	\param vnode the vnode.
	\param alwaysFree don't move this vnode into the unused list, but really
		   delete it if possible.
	\param reenter \c true, if this function is called (indirectly) from within
		   a file system. This will be passed to file system hooks only.
	\return \c B_OK, if everything went fine, an error code otherwise.
*/
static status_t
dec_vnode_ref_count(struct vnode* vnode, bool alwaysFree, bool reenter)
{
	ReadLocker locker(sVnodeLock);
	AutoLocker<Vnode> nodeLocker(vnode);

	int32 oldRefCount = atomic_add(&vnode->ref_count, -1);

	ASSERT_PRINT(oldRefCount > 0, "vnode %p\n", vnode);

	TRACE(("dec_vnode_ref_count: vnode %p, ref now %ld\n", vnode,
		vnode->ref_count));

	if (oldRefCount != 1)
		return B_OK;

	if (vnode->IsBusy())
		panic("dec_vnode_ref_count: called on busy vnode %p\n", vnode);

	bool freeNode = false;
	bool freeUnusedNodes = false;

	// Just insert the vnode into an unused list if we don't need
	// to delete it
	if (vnode->IsRemoved() || alwaysFree) {
		vnode_to_be_freed(vnode);
		vnode->SetBusy(true);
		freeNode = true;
	} else
		freeUnusedNodes = vnode_unused(vnode);

	nodeLocker.Unlock();
	locker.Unlock();

	if (freeNode)
		free_vnode(vnode, reenter);
	else if (freeUnusedNodes)
		free_unused_vnodes();

	return B_OK;
}


/*!	\brief Increments the reference counter of the given vnode.

	The caller must make sure that the node isn't deleted while this function
	is called. This can be done either:
	- by ensuring that a reference to the node exists and remains in existence,
	  or
	- by holding the vnode's lock (which also requires read locking sVnodeLock)
	  or by holding sVnodeLock write locked.

	In the second case the caller is responsible for dealing with the ref count
	0 -> 1 transition. That is, 1. this function must not be invoked when the
	node is busy in the first place and 2. vnode_used() must be called for the
	node.

	\param vnode the vnode.
*/
static void
inc_vnode_ref_count(struct vnode* vnode)
{
	atomic_add(&vnode->ref_count, 1);
	TRACE(("inc_vnode_ref_count: vnode %p, ref now %ld\n", vnode,
		vnode->ref_count));
}


static bool
is_special_node_type(int type)
{
	// at the moment only FIFOs are supported
	return S_ISFIFO(type);
}


static status_t
create_special_sub_node(struct vnode* vnode, uint32 flags)
{
	if (S_ISFIFO(vnode->Type()))
		return create_fifo_vnode(vnode->mount->volume, vnode);

	return B_BAD_VALUE;
}


/*!	\brief Retrieves a vnode for a given mount ID, node ID pair.

	If the node is not yet in memory, it will be loaded.

	The caller must not hold the sVnodeLock or the sMountMutex.

	\param mountID the mount ID.
	\param vnodeID the node ID.
	\param _vnode Pointer to a vnode* variable into which the pointer to the
		   retrieved vnode structure shall be written.
	\param reenter \c true, if this function is called (indirectly) from within
		   a file system.
	\return \c B_OK, if everything went fine, an error code otherwise.
*/
static status_t
get_vnode(dev_t mountID, ino_t vnodeID, struct vnode** _vnode, bool canWait,
	int reenter)
{
	FUNCTION(("get_vnode: mountid %ld vnid 0x%Lx %p\n", mountID, vnodeID,
		_vnode));

	rw_lock_read_lock(&sVnodeLock);

	int32 tries = 2000;
		// try for 10 secs
restart:
	struct vnode* vnode = lookup_vnode(mountID, vnodeID);
	AutoLocker<Vnode> nodeLocker(vnode);

	if (vnode && vnode->IsBusy()) {
		nodeLocker.Unlock();
		rw_lock_read_unlock(&sVnodeLock);
		if (!canWait || --tries < 0) {
			// vnode doesn't seem to become unbusy
			dprintf("vnode %" B_PRIdDEV ":%" B_PRIdINO " is not becoming "
				"unbusy!\n", mountID, vnodeID);
			return B_BUSY;
		}
		snooze(5000); // 5 ms
		rw_lock_read_lock(&sVnodeLock);
		goto restart;
	}

	TRACE(("get_vnode: tried to lookup vnode, got %p\n", vnode));

	status_t status;

	if (vnode) {
		if (vnode->ref_count == 0) {
			// this vnode has been unused before
			vnode_used(vnode);
		}
		inc_vnode_ref_count(vnode);

		nodeLocker.Unlock();
		rw_lock_read_unlock(&sVnodeLock);
	} else {
		// we need to create a new vnode and read it in
		rw_lock_read_unlock(&sVnodeLock);
			// unlock -- create_new_vnode_and_lock() write-locks on success
		bool nodeCreated;
		status = create_new_vnode_and_lock(mountID, vnodeID, vnode,
			nodeCreated);
		if (status != B_OK)
			return status;

		if (!nodeCreated) {
			rw_lock_read_lock(&sVnodeLock);
			rw_lock_write_unlock(&sVnodeLock);
			goto restart;
		}

		rw_lock_write_unlock(&sVnodeLock);

		int type;
		uint32 flags;
		status = FS_MOUNT_CALL(vnode->mount, get_vnode, vnodeID, vnode, &type,
			&flags, reenter);
		if (status == B_OK && vnode->private_node == NULL)
			status = B_BAD_VALUE;

		bool gotNode = status == B_OK;
		bool publishSpecialSubNode = false;
		if (gotNode) {
			vnode->SetType(type);
			publishSpecialSubNode = is_special_node_type(type)
				&& (flags & B_VNODE_DONT_CREATE_SPECIAL_SUB_NODE) == 0;
		}

		if (gotNode && publishSpecialSubNode)
			status = create_special_sub_node(vnode, flags);

		if (status != B_OK) {
			if (gotNode)
				FS_CALL(vnode, put_vnode, reenter);

			rw_lock_write_lock(&sVnodeLock);
			hash_remove(sVnodeTable, vnode);
			remove_vnode_from_mount_list(vnode, vnode->mount);
			rw_lock_write_unlock(&sVnodeLock);

			free(vnode);
			return status;
		}

		rw_lock_read_lock(&sVnodeLock);
		vnode->Lock();

		vnode->SetRemoved((flags & B_VNODE_PUBLISH_REMOVED) != 0);
		vnode->SetBusy(false);

		vnode->Unlock();
		rw_lock_read_unlock(&sVnodeLock);
	}

	TRACE(("get_vnode: returning %p\n", vnode));

	*_vnode = vnode;
	return B_OK;
}
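

// Illustrative caller-side contract for get_vnode()/put_vnode(): the
// returned reference keeps the vnode alive until it is put again:
//
//	struct vnode* vnode;
//	status_t status = get_vnode(mountID, vnodeID, &vnode, true, false);
//	if (status != B_OK)
//		return status;
//	// ... use vnode ...
//	put_vnode(vnode);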


/*!	\brief Decrements the reference counter of the given vnode and deletes it,
	if the counter dropped to 0.

	The caller must, of course, own a reference to the vnode to call this
	function.
	The caller must not hold the sVnodeLock or the sMountMutex.

	\param vnode the vnode.
*/
static inline void
put_vnode(struct vnode* vnode)
{
	dec_vnode_ref_count(vnode, false, false);
}


static void
free_unused_vnodes(int32 level)
{
	unused_vnodes_check_started();

	if (level == B_NO_LOW_RESOURCE) {
		unused_vnodes_check_done();
		return;
	}

	flush_hot_vnodes();

	// determine how many nodes to free
	uint32 count = 1;
	{
		MutexLocker unusedVnodesLocker(sUnusedVnodesLock);

		switch (level) {
			case B_LOW_RESOURCE_NOTE:
				count = sUnusedVnodes / 100;
				break;
			case B_LOW_RESOURCE_WARNING:
				count = sUnusedVnodes / 10;
				break;
			case B_LOW_RESOURCE_CRITICAL:
				count = sUnusedVnodes;
				break;
		}

		if (count > sUnusedVnodes)
			count = sUnusedVnodes;
	}

	// Write back the modified pages of some unused vnodes and free them.

	for (uint32 i = 0; i < count; i++) {
		ReadLocker vnodesReadLocker(sVnodeLock);

		// get the first node
		MutexLocker unusedVnodesLocker(sUnusedVnodesLock);
		struct vnode* vnode = (struct vnode*)list_get_first_item(
			&sUnusedVnodeList);
		unusedVnodesLocker.Unlock();

		if (vnode == NULL)
			break;

		// lock the node
		AutoLocker<Vnode> nodeLocker(vnode);

		// Check whether the node is still unused -- since we only append to
		// the tail of the unused queue, the vnode should still be at its head.
		// Alternatively we could check its ref count for 0 and its busy flag,
		// but if the node is no longer at the head of the queue, it means it
		// has been touched in the meantime, i.e. it is no longer the least
		// recently used unused vnode and we rather don't free it.
		unusedVnodesLocker.Lock();
		if (vnode != list_get_first_item(&sUnusedVnodeList))
			continue;
		unusedVnodesLocker.Unlock();

		ASSERT(!vnode->IsBusy());

		// grab a reference
		inc_vnode_ref_count(vnode);
		vnode_used(vnode);

		// write back changes and free the node
		nodeLocker.Unlock();
		vnodesReadLocker.Unlock();

		if (vnode->cache != NULL)
			vnode->cache->WriteModified();

		dec_vnode_ref_count(vnode, true, false);
			// this should free the vnode when it's still unused
	}

	unused_vnodes_check_done();
}


/*!	Gets the vnode the given vnode is covering.

	The caller must have \c sVnodeLock read-locked at least.

	The function returns a reference to the retrieved vnode (if any); the
	caller is responsible for releasing it.

	\param vnode The vnode whose covered node shall be returned.
	\return The covered vnode, or \c NULL if the given vnode doesn't cover any
		vnode.
*/
static inline Vnode*
get_covered_vnode_locked(Vnode* vnode)
{
	if (Vnode* coveredNode = vnode->covers) {
		while (coveredNode->covers != NULL)
			coveredNode = coveredNode->covers;

		inc_vnode_ref_count(coveredNode);
		return coveredNode;
	}

	return NULL;
}


/*!	Gets the vnode the given vnode is covering.

	The caller must not hold \c sVnodeLock. Note that this implies a race
	condition, since the situation can change at any time.

	The function returns a reference to the retrieved vnode (if any); the
	caller is responsible for releasing it.

	\param vnode The vnode whose covered node shall be returned.
	\return The covered vnode, or \c NULL if the given vnode doesn't cover any
		vnode.
*/
static inline Vnode*
get_covered_vnode(Vnode* vnode)
{
	if (!vnode->IsCovering())
		return NULL;

	ReadLocker vnodeReadLocker(sVnodeLock);
	return get_covered_vnode_locked(vnode);
}


/*!	Gets the vnode the given vnode is covered by.

	The caller must have \c sVnodeLock read-locked at least.

	The function returns a reference to the retrieved vnode (if any); the
	caller is responsible for releasing it.

	\param vnode The vnode whose covering node shall be returned.
	\return The covering vnode, or \c NULL if the given vnode isn't covered by
		any vnode.
*/
static Vnode*
get_covering_vnode_locked(Vnode* vnode)
{
	if (Vnode* coveringNode = vnode->covered_by) {
		while (coveringNode->covered_by != NULL)
			coveringNode = coveringNode->covered_by;

		inc_vnode_ref_count(coveringNode);
		return coveringNode;
	}

	return NULL;
}


/*!	Gets the vnode the given vnode is covered by.

	The caller must not hold \c sVnodeLock. Note that this implies a race
	condition, since the situation can change at any time.

	The function returns a reference to the retrieved vnode (if any); the
	caller is responsible for releasing it.

	\param vnode The vnode whose covering node shall be returned.
	\return The covering vnode, or \c NULL if the given vnode isn't covered by
		any vnode.
*/
static inline Vnode*
get_covering_vnode(Vnode* vnode)
{
	if (!vnode->IsCovered())
		return NULL;

	ReadLocker vnodeReadLocker(sVnodeLock);
	return get_covering_vnode_locked(vnode);
}


static void
free_unused_vnodes()
{
	free_unused_vnodes(
		low_resource_state(B_KERNEL_RESOURCE_PAGES | B_KERNEL_RESOURCE_MEMORY
			| B_KERNEL_RESOURCE_ADDRESS_SPACE));
}


static void
vnode_low_resource_handler(void* /*data*/, uint32 resources, int32 level)
{
	TRACE(("vnode_low_resource_handler(level = %ld)\n", level));

	free_unused_vnodes(level);
}


static inline void
put_advisory_locking(struct advisory_locking* locking)
{
	release_sem(locking->lock);
}


/*!	Returns the advisory_locking object of the \a vnode in case it
	has one, and locks it.
	You have to call put_advisory_locking() when you're done with
	it.
	Note, you must not have the vnode mutex locked when calling
	this function.
*/
static struct advisory_locking*
get_advisory_locking(struct vnode* vnode)
{
	rw_lock_read_lock(&sVnodeLock);
	vnode->Lock();

	struct advisory_locking* locking = vnode->advisory_locking;
	sem_id lock = locking != NULL ? locking->lock : B_ERROR;

	vnode->Unlock();
	rw_lock_read_unlock(&sVnodeLock);

	if (lock >= 0)
		lock = acquire_sem(lock);
	if (lock < 0) {
		// This means the locking has been deleted in the meantime
		// or had never existed in the first place - otherwise, we
		// would get the lock at some point.
		return NULL;
	}

	return locking;
}


/*!	Creates a locked advisory_locking object, and attaches it to the
	given \a vnode.
	Returns B_OK in case of success - also if the vnode got such an
	object from someone else in the meantime, you'll still get this
	one locked then.
*/
static status_t
create_advisory_locking(struct vnode* vnode)
{
	if (vnode == NULL)
		return B_FILE_ERROR;

	ObjectDeleter<advisory_locking> lockingDeleter;
	struct advisory_locking* locking = NULL;

	while (get_advisory_locking(vnode) == NULL) {
		// no locking object set on the vnode yet, create one
		if (locking == NULL) {
			locking = new(std::nothrow) advisory_locking;
			if (locking == NULL)
				return B_NO_MEMORY;
			lockingDeleter.SetTo(locking);

			locking->wait_sem = create_sem(0, "advisory lock");
			if (locking->wait_sem < 0)
				return locking->wait_sem;

			locking->lock = create_sem(0, "advisory locking");
			if (locking->lock < 0)
				return locking->lock;
		}

		// set our newly created locking object
		ReadLocker _(sVnodeLock);
		AutoLocker<Vnode> nodeLocker(vnode);
		if (vnode->advisory_locking == NULL) {
			vnode->advisory_locking = locking;
			lockingDeleter.Detach();
			return B_OK;
		}
	}

	// The vnode already had a locking object. That's just as well.

	return B_OK;
}


/*!	Retrieves the first lock that has been set by the current team.
*/
static status_t
get_advisory_lock(struct vnode* vnode, struct flock* flock)
{
	struct advisory_locking* locking = get_advisory_locking(vnode);
	if (locking == NULL)
		return B_BAD_VALUE;

	// TODO: this should probably get the flock by its file descriptor!
	team_id team = team_get_current_team_id();
	status_t status = B_BAD_VALUE;

	LockList::Iterator iterator = locking->locks.GetIterator();
	while (iterator.HasNext()) {
		struct advisory_lock* lock = iterator.Next();

		if (lock->team == team) {
			flock->l_start = lock->start;
			flock->l_len = lock->end - lock->start + 1;
			status = B_OK;
			break;
		}
	}

	put_advisory_locking(locking);
	return status;
}


/*! Returns \c true when either \a flock is \c NULL or the \a flock intersects
	with the advisory_lock \a lock.
*/
static bool
advisory_lock_intersects(struct advisory_lock* lock, struct flock* flock)
{
	if (flock == NULL)
		return true;

	return lock->start <= flock->l_start - 1 + flock->l_len
		&& lock->end >= flock->l_start;
}
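

// Worked example: an advisory_lock covering bytes [100, 199] intersects a
// flock with l_start = 150 and l_len = 100 (i.e. [150, 249]), since
// 100 <= 150 - 1 + 100 and 199 >= 150 both hold. A flock of [200, 249]
// does not intersect, because 199 >= 200 fails.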


/*!	Removes the specified lock, or all locks of the calling team
	if \a flock is NULL.
*/
static status_t
release_advisory_lock(struct vnode* vnode, struct flock* flock)
{
	FUNCTION(("release_advisory_lock(vnode = %p, flock = %p)\n", vnode, flock));

	struct advisory_locking* locking = get_advisory_locking(vnode);
	if (locking == NULL)
		return B_OK;

	// TODO: use the thread ID instead??
	team_id team = team_get_current_team_id();
	pid_t session = thread_get_current_thread()->team->session_id;

	// find matching lock entries

	LockList::Iterator iterator = locking->locks.GetIterator();
	while (iterator.HasNext()) {
		struct advisory_lock* lock = iterator.Next();
		bool removeLock = false;

		if (lock->session == session)
			removeLock = true;
		else if (lock->team == team && advisory_lock_intersects(lock, flock)) {
			bool endsBeyond = false;
			bool startsBefore = false;
			if (flock != NULL) {
				startsBefore = lock->start < flock->l_start;
				endsBeyond = lock->end > flock->l_start - 1 + flock->l_len;
			}

			if (!startsBefore && !endsBeyond) {
				// lock is completely contained in flock
				removeLock = true;
			} else if (startsBefore && !endsBeyond) {
				// cut the end of the lock
				lock->end = flock->l_start - 1;
			} else if (!startsBefore && endsBeyond) {
				// cut the start of the lock
				lock->start = flock->l_start + flock->l_len;
			} else {
				// divide the lock into two locks
				struct advisory_lock* secondLock = (struct advisory_lock*)
					malloc(sizeof(struct advisory_lock));
				if (secondLock == NULL) {
					// TODO: we should probably revert the locks we already
					// changed... (ie. allocate upfront)
					put_advisory_locking(locking);
					return B_NO_MEMORY;
				}

				secondLock->team = lock->team;
				secondLock->session = lock->session;
				// values must already be normalized when getting here
				secondLock->start = flock->l_start + flock->l_len;
				secondLock->end = lock->end;
					// take the end from the original lock *before* cutting it
				secondLock->shared = lock->shared;

				lock->end = flock->l_start - 1;

				locking->locks.Add(secondLock);
			}
		}

		if (removeLock) {
			// this lock is no longer used
			iterator.Remove();
			free(lock);
		}
	}

	bool removeLocking = locking->locks.IsEmpty();
	release_sem_etc(locking->wait_sem, 1, B_RELEASE_ALL);

	put_advisory_locking(locking);

	if (removeLocking) {
		// We can remove the whole advisory locking structure; it's no
		// longer used
		locking = get_advisory_locking(vnode);
		if (locking != NULL) {
			ReadLocker locker(sVnodeLock);
			AutoLocker<Vnode> nodeLocker(vnode);

			// the locking could have been changed in the meantime
			if (locking->locks.IsEmpty()) {
				vnode->advisory_locking = NULL;
				nodeLocker.Unlock();
				locker.Unlock();

				// we've detached the locking from the vnode, so we can
				// safely delete it
				delete locking;
			} else {
				// the locking is in use again
				nodeLocker.Unlock();
				locker.Unlock();
				release_sem_etc(locking->lock, 1, B_DO_NOT_RESCHEDULE);
			}
		}
	}

	return B_OK;
}


/*!	Acquires an advisory lock for the \a vnode. If \a wait is \c true, it
	will wait for the lock to become available, if there are any collisions
	(it will return B_PERMISSION_DENIED in this case if \a wait is \c false).

	If \a session is -1, POSIX semantics are used for this lock. Otherwise,
	BSD flock() semantics are used, that is, all children can unlock the file
	in question (we even allow parents to remove the lock, though, but that
	seems to be in line with what the BSDs are doing).
*/
static status_t
acquire_advisory_lock(struct vnode* vnode, pid_t session, struct flock* flock,
	bool wait)
{
	FUNCTION(("acquire_advisory_lock(vnode = %p, flock = %p, wait = %s)\n",
		vnode, flock, wait ? "yes" : "no"));

	bool shared = flock->l_type == F_RDLCK;
	status_t status = B_OK;

	// TODO: do deadlock detection!

	struct advisory_locking* locking;

	while (true) {
		// if this vnode has an advisory_locking structure attached,
		// lock that one and search for any colliding file lock
		status = create_advisory_locking(vnode);
		if (status != B_OK)
			return status;

		locking = vnode->advisory_locking;
		team_id team = team_get_current_team_id();
		sem_id waitForLock = -1;

		// test for collisions
		LockList::Iterator iterator = locking->locks.GetIterator();
		while (iterator.HasNext()) {
			struct advisory_lock* lock = iterator.Next();

			// TODO: locks from the same team might be joinable!
			if (lock->team != team && advisory_lock_intersects(lock, flock)) {
				// locks do overlap
				if (!shared || !lock->shared) {
					// we need to wait
					waitForLock = locking->wait_sem;
					break;
				}
			}
		}

		if (waitForLock < 0)
			break;

		// We need to wait. Do that or fail now, if we've been asked not to.

		if (!wait) {
			put_advisory_locking(locking);
			return session != -1 ? B_WOULD_BLOCK : B_PERMISSION_DENIED;
		}

		status = switch_sem_etc(locking->lock, waitForLock, 1,
			B_CAN_INTERRUPT, 0);
		if (status != B_OK && status != B_BAD_SEM_ID)
			return status;

		// We have been notified, but we need to re-lock the locking object. So
		// go another round...
	}

	// install new lock

	struct advisory_lock* lock = (struct advisory_lock*)malloc(
		sizeof(struct advisory_lock));
	if (lock == NULL) {
		put_advisory_locking(locking);
		return B_NO_MEMORY;
	}

	lock->team = team_get_current_team_id();
	lock->session = session;
	// values must already be normalized when getting here
	lock->start = flock->l_start;
	lock->end = flock->l_start - 1 + flock->l_len;
	lock->shared = shared;

	locking->locks.Add(lock);
	put_advisory_locking(locking);

	return status;
}


/*!	Normalizes the \a flock structure to make it easier to compare the
	structure with others. The l_start and l_len fields are set to absolute
	values according to the l_whence field.
*/
static status_t
normalize_flock(struct file_descriptor* descriptor, struct flock* flock)
{
	switch (flock->l_whence) {
		case SEEK_SET:
			break;
		case SEEK_CUR:
			flock->l_start += descriptor->pos;
			break;
		case SEEK_END:
		{
			struct vnode* vnode = descriptor->u.vnode;
			struct stat stat;
			status_t status;

			if (!HAS_FS_CALL(vnode, read_stat))
				return B_UNSUPPORTED;

			status = FS_CALL(vnode, read_stat, &stat);
			if (status != B_OK)
				return status;

			flock->l_start += stat.st_size;
			break;
		}
		default:
			return B_BAD_VALUE;
	}

	if (flock->l_start < 0)
		flock->l_start = 0;
	if (flock->l_len == 0)
		flock->l_len = OFF_MAX;

	// don't let the offset and length overflow
	if (flock->l_start > 0 && OFF_MAX - flock->l_start < flock->l_len)
		flock->l_len = OFF_MAX - flock->l_start;

	if (flock->l_len < 0) {
		// a negative length reverses the region
		flock->l_start += flock->l_len;
		flock->l_len = -flock->l_len;
	}

	return B_OK;
}
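

// Worked example: with a descriptor positioned at offset 100, a flock of
// { l_whence = SEEK_CUR, l_start = -50, l_len = 0 } normalizes to
// l_start = 50 with l_len clamped to OFF_MAX - 50, i.e. "from byte 50 to
// the end of the file". A negative length such as { SEEK_SET, 100, -50 }
// becomes the region starting at 50 with length 50.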


static void
replace_vnode_if_disconnected(struct fs_mount* mount,
	struct vnode* vnodeToDisconnect, struct vnode*& vnode,
	struct vnode* fallBack, bool lockRootLock)
{
	struct vnode* givenVnode = vnode;
	bool vnodeReplaced = false;

	ReadLocker vnodeReadLocker(sVnodeLock);

	if (lockRootLock)
		mutex_lock(&sIOContextRootLock);

	while (vnode != NULL && vnode->mount == mount
		&& (vnodeToDisconnect == NULL || vnodeToDisconnect == vnode)) {
		if (vnode->covers != NULL) {
			// redirect the vnode to the covered vnode
			vnode = vnode->covers;
		} else
			vnode = fallBack;

		vnodeReplaced = true;
	}

	// If we've replaced the node, grab a reference for the new one.
	if (vnodeReplaced && vnode != NULL)
		inc_vnode_ref_count(vnode);

	if (lockRootLock)
		mutex_unlock(&sIOContextRootLock);

	vnodeReadLocker.Unlock();

	if (vnodeReplaced)
		put_vnode(givenVnode);
}


/*!	Disconnects all file descriptors that are associated with the
	\a vnodeToDisconnect, or if this is NULL, all vnodes of the specified
	\a mount object.

	Note that after you've called this function, there might still be ongoing
	accesses -- they won't be interrupted if they were already in progress.
	However, any subsequent access will fail.

	This is not a cheap function and should be used with care and rarely.
	TODO: there is currently no means to stop a blocking read/write!
*/
static void
disconnect_mount_or_vnode_fds(struct fs_mount* mount,
	struct vnode* vnodeToDisconnect)
{
	// iterate over all teams and peek into their file descriptors
	TeamListIterator teamIterator;
	while (Team* team = teamIterator.Next()) {
		BReference<Team> teamReference(team, true);

		// lock the I/O context
		io_context* context = team->io_context;
		MutexLocker contextLocker(context->io_mutex);

		replace_vnode_if_disconnected(mount, vnodeToDisconnect, context->root,
			sRoot, true);
		replace_vnode_if_disconnected(mount, vnodeToDisconnect, context->cwd,
			sRoot, false);

		for (uint32 i = 0; i < context->table_size; i++) {
			if (struct file_descriptor* descriptor = context->fds[i]) {
				inc_fd_ref_count(descriptor);

				// if this descriptor points at this mount, we
				// need to disconnect it to be able to unmount
				struct vnode* vnode = fd_vnode(descriptor);
				if (vnodeToDisconnect != NULL) {
					if (vnode == vnodeToDisconnect)
						disconnect_fd(descriptor);
				} else if ((vnode != NULL && vnode->mount == mount)
					|| (vnode == NULL && descriptor->u.mount == mount))
					disconnect_fd(descriptor);

				put_fd(descriptor);
			}
		}
	}
}


/*!	\brief Gets the root node of the current IO context.
	If \a kernel is \c true, the kernel IO context will be used.
	The caller obtains a reference to the returned node.
*/
struct vnode*
get_root_vnode(bool kernel)
{
	if (!kernel) {
		// get the root vnode from the team's io context
		struct io_context* context = get_current_io_context(kernel);

		mutex_lock(&sIOContextRootLock);

		struct vnode* root = context->root;
		if (root != NULL)
			inc_vnode_ref_count(root);

		mutex_unlock(&sIOContextRootLock);

		if (root != NULL)
			return root;

		// That should never happen.
		dprintf("get_root_vnode(): IO context for team %" B_PRId32 " doesn't "
			"have a root\n", team_get_current_team_id());
	}

	inc_vnode_ref_count(sRoot);
	return sRoot;
}
1940
1941
1942/*!	\brief Resolves a vnode to the vnode it is covered by, if any.
1943
1944	Given an arbitrary vnode (identified by mount and node ID), the function
	checks whether the vnode is covered by another vnode. If it is, the
1946	function returns the mount and node ID of the covering vnode. Otherwise
1947	it simply returns the supplied mount and node ID.
1948
1949	In case of error (e.g. the supplied node could not be found) the variables
1950	for storing the resolved mount and node ID remain untouched and an error
1951	code is returned.
1952
1953	\param mountID The mount ID of the vnode in question.
1954	\param nodeID The node ID of the vnode in question.
1955	\param resolvedMountID Pointer to storage for the resolved mount ID.
1956	\param resolvedNodeID Pointer to storage for the resolved node ID.
1957	\return
1958	- \c B_OK, if everything went fine,
1959	- another error code, if something went wrong.
1960*/
1961status_t
1962vfs_resolve_vnode_to_covering_vnode(dev_t mountID, ino_t nodeID,
1963	dev_t* resolvedMountID, ino_t* resolvedNodeID)
1964{
1965	// get the node
1966	struct vnode* node;
1967	status_t error = get_vnode(mountID, nodeID, &node, true, false);
1968	if (error != B_OK)
1969		return error;
1970
1971	// resolve the node
1972	if (Vnode* coveringNode = get_covering_vnode(node)) {
1973		put_vnode(node);
1974		node = coveringNode;
1975	}
1976
1977	// set the return values
1978	*resolvedMountID = node->device;
1979	*resolvedNodeID = node->id;
1980
1981	put_vnode(node);
1982
1983	return B_OK;
1984}
1985
1986
1987/*!	\brief Gets the directory path and leaf name for a given path.
1988
1989	The supplied \a path is transformed to refer to the directory part of
1990	the entry identified by the original path, and into the buffer \a filename
1991	the leaf name of the original entry is written.
1992	Neither the returned path nor the leaf name can be expected to be
1993	canonical.
1994
1995	\param path The path to be analyzed. Must be able to store at least one
1996		   additional character.
1997	\param filename The buffer into which the leaf name will be written.
1998		   Must be of size B_FILE_NAME_LENGTH at least.
1999	\return \c B_OK, if everything went fine, \c B_NAME_TOO_LONG, if the leaf
2000		   name is longer than \c B_FILE_NAME_LENGTH, or \c B_ENTRY_NOT_FOUND,
2001		   if the given path name is empty.
2002*/
2003static status_t
2004get_dir_path_and_leaf(char* path, char* filename)
2005{
2006	if (*path == '\0')
2007		return B_ENTRY_NOT_FOUND;
2008
2009	char* last = strrchr(path, '/');
2010		// '/' are not allowed in file names!
2011
2012	FUNCTION(("get_dir_path_and_leaf(path = %s)\n", path));
2013
2014	if (last == NULL) {
		// the path is a single segment with no '/' in it, e.g. "foo"
2017		if (strlcpy(filename, path, B_FILE_NAME_LENGTH) >= B_FILE_NAME_LENGTH)
2018			return B_NAME_TOO_LONG;
2019
2020		strcpy(path, ".");
2021	} else {
2022		last++;
2023		if (last[0] == '\0') {
2024			// special case: the path ends in one or more '/' - remove them
2025			while (*--last == '/' && last != path);
2026			last[1] = '\0';
2027
2028			if (last == path && last[0] == '/') {
2029				// This path points to the root of the file system
2030				strcpy(filename, ".");
2031				return B_OK;
2032			}
2033			for (; last != path && *(last - 1) != '/'; last--);
2034				// rewind to the start of the leaf before the '/'
2035		}
2036
2037		// normal leaf: replace the leaf portion of the path with a '.'
2038		if (strlcpy(filename, last, B_FILE_NAME_LENGTH) >= B_FILE_NAME_LENGTH)
2039			return B_NAME_TOO_LONG;
2040
2041		last[0] = '.';
2042		last[1] = '\0';
2043	}
2044	return B_OK;
2045}
2046
2047
2048static status_t
2049entry_ref_to_vnode(dev_t mountID, ino_t directoryID, const char* name,
2050	bool traverse, bool kernel, struct vnode** _vnode)
2051{
2052	char clonedName[B_FILE_NAME_LENGTH + 1];
2053	if (strlcpy(clonedName, name, B_FILE_NAME_LENGTH) >= B_FILE_NAME_LENGTH)
2054		return B_NAME_TOO_LONG;
2055
2056	// get the directory vnode and let vnode_path_to_vnode() do the rest
2057	struct vnode* directory;
2058
2059	status_t status = get_vnode(mountID, directoryID, &directory, true, false);
2060	if (status < 0)
2061		return status;
2062
2063	return vnode_path_to_vnode(directory, clonedName, traverse, 0, kernel,
2064		_vnode, NULL);
2065}
2066
2067
2068/*!	Looks up the entry with name \a name in the directory represented by \a dir
2069	and returns the respective vnode.
2070	On success a reference to the vnode is acquired for the caller.
2071*/
2072static status_t
2073lookup_dir_entry(struct vnode* dir, const char* name, struct vnode** _vnode)
2074{
2075	ino_t id;
2076
2077	if (dir->mount->entry_cache.Lookup(dir->id, name, id))
2078		return get_vnode(dir->device, id, _vnode, true, false);
2079
2080	status_t status = FS_CALL(dir, lookup, name, &id);
2081	if (status != B_OK)
2082		return status;
2083
	// The lookup() hook calls get_vnode() or publish_vnode(), so we already
	// have a reference and just need to look the node up.
2086	rw_lock_read_lock(&sVnodeLock);
2087	*_vnode = lookup_vnode(dir->device, id);
2088	rw_lock_read_unlock(&sVnodeLock);
2089
2090	if (*_vnode == NULL) {
2091		panic("lookup_dir_entry(): could not lookup vnode (mountid 0x%" B_PRIx32
2092			" vnid 0x%" B_PRIx64 ")\n", dir->device, id);
2093		return B_ENTRY_NOT_FOUND;
2094	}
2095
2096//	ktrace_printf("lookup_dir_entry(): dir: %p (%ld, %lld), name: \"%s\" -> "
2097//		"%p (%ld, %lld)", dir, dir->mount->id, dir->id, name, *_vnode,
2098//		(*_vnode)->mount->id, (*_vnode)->id);
2099
2100	return B_OK;
2101}
2102
2103
2104/*!	Returns the vnode for the relative path starting at the specified \a vnode.
2105	\a path must not be NULL.
2106	If it returns successfully, \a path contains the name of the last path
2107	component. This function clobbers the buffer pointed to by \a path only
	if it contains more than one component.
	Note that this function decrements the ref count of the starting
	\a vnode, whether it succeeds or not!
2111*/
2112static status_t
2113vnode_path_to_vnode(struct vnode* vnode, char* path, bool traverseLeafLink,
2114	int count, struct io_context* ioContext, struct vnode** _vnode,
2115	ino_t* _parentID)
2116{
2117	status_t status = B_OK;
2118	ino_t lastParentID = vnode->id;
2119
2120	FUNCTION(("vnode_path_to_vnode(vnode = %p, path = %s)\n", vnode, path));
2121
2122	if (path == NULL) {
2123		put_vnode(vnode);
2124		return B_BAD_VALUE;
2125	}
2126
2127	if (*path == '\0') {
2128		put_vnode(vnode);
2129		return B_ENTRY_NOT_FOUND;
2130	}
2131
2132	while (true) {
2133		struct vnode* nextVnode;
2134		char* nextPath;
2135
2136		TRACE(("vnode_path_to_vnode: top of loop. p = %p, p = '%s'\n", path,
2137			path));
2138
2139		// done?
2140		if (path[0] == '\0')
2141			break;
2142
2143		// walk to find the next path component ("path" will point to a single
2144		// path component), and filter out multiple slashes
2145		for (nextPath = path + 1; *nextPath != '\0' && *nextPath != '/';
2146				nextPath++);
2147
2148		if (*nextPath == '/') {
2149			*nextPath = '\0';
2150			do
2151				nextPath++;
2152			while (*nextPath == '/');
2153		}
2154
		// If we encounter ".." at a covering vnode, move to the covered
		// vnode, so we pass the ".." to the underlying file system.
		// Also prevent breaking out of the root of the IO context.
2158		if (strcmp("..", path) == 0) {
2159			if (vnode == ioContext->root) {
2160				// Attempted prison break! Keep it contained.
2161				path = nextPath;
2162				continue;
2163			}
2164
2165			if (Vnode* coveredVnode = get_covered_vnode(vnode)) {
2166				nextVnode = coveredVnode;
2167				put_vnode(vnode);
2168				vnode = nextVnode;
2169			}
2170		}
2171
2172		// check if vnode is really a directory
2173		if (status == B_OK && !S_ISDIR(vnode->Type()))
2174			status = B_NOT_A_DIRECTORY;
2175
2176		// Check if we have the right to search the current directory vnode.
2177		// If a file system doesn't have the access() function, we assume that
2178		// searching a directory is always allowed
2179		if (status == B_OK && HAS_FS_CALL(vnode, access))
2180			status = FS_CALL(vnode, access, X_OK);
2181
2182		// Tell the filesystem to get the vnode of this path component (if we
2183		// got the permission from the call above)
2184		if (status == B_OK)
2185			status = lookup_dir_entry(vnode, path, &nextVnode);
2186
2187		if (status != B_OK) {
2188			put_vnode(vnode);
2189			return status;
2190		}
2191
2192		// If the new node is a symbolic link, resolve it (if we've been told
2193		// to do it)
2194		if (S_ISLNK(nextVnode->Type())
2195			&& (traverseLeafLink || nextPath[0] != '\0')) {
2196			size_t bufferSize;
2197			char* buffer;
2198
2199			TRACE(("traverse link\n"));
2200
2201			// it's not exactly nice style using goto in this way, but hey,
2202			// it works :-/
2203			if (count + 1 > B_MAX_SYMLINKS) {
2204				status = B_LINK_LIMIT;
2205				goto resolve_link_error;
2206			}
2207
2208			buffer = (char*)malloc(bufferSize = B_PATH_NAME_LENGTH);
2209			if (buffer == NULL) {
2210				status = B_NO_MEMORY;
2211				goto resolve_link_error;
2212			}
2213
2214			if (HAS_FS_CALL(nextVnode, read_symlink)) {
2215				bufferSize--;
2216				status = FS_CALL(nextVnode, read_symlink, buffer, &bufferSize);
2217				// null-terminate
2218				if (status >= 0)
2219					buffer[bufferSize] = '\0';
2220			} else
2221				status = B_BAD_VALUE;
2222
2223			if (status != B_OK) {
2224				free(buffer);
2225
2226		resolve_link_error:
2227				put_vnode(vnode);
2228				put_vnode(nextVnode);
2229
2230				return status;
2231			}
2232			put_vnode(nextVnode);
2233
2234			// Check if we start from the root directory or the current
2235			// directory ("vnode" still points to that one).
2236			// Cut off all leading slashes if it's the root directory
2237			path = buffer;
2238			bool absoluteSymlink = false;
2239			if (path[0] == '/') {
2240				// we don't need the old directory anymore
2241				put_vnode(vnode);
2242
2243				while (*++path == '/')
2244					;
2245
2246				mutex_lock(&sIOContextRootLock);
2247				vnode = ioContext->root;
2248				inc_vnode_ref_count(vnode);
2249				mutex_unlock(&sIOContextRootLock);
2250
2251				absoluteSymlink = true;
2252			}
2253
2254			inc_vnode_ref_count(vnode);
2255				// balance the next recursion - we will decrement the
2256				// ref_count of the vnode, no matter if we succeeded or not
2257
2258			if (absoluteSymlink && *path == '\0') {
2259				// symlink was just "/"
2260				nextVnode = vnode;
2261			} else {
2262				status = vnode_path_to_vnode(vnode, path, true, count + 1,
2263					ioContext, &nextVnode, &lastParentID);
2264			}
2265
2266			free(buffer);
2267
2268			if (status != B_OK) {
2269				put_vnode(vnode);
2270				return status;
2271			}
2272		} else
2273			lastParentID = vnode->id;
2274
2275		// decrease the ref count on the old dir we just looked up into
2276		put_vnode(vnode);
2277
2278		path = nextPath;
2279		vnode = nextVnode;
2280
2281		// see if we hit a covered node
2282		if (Vnode* coveringNode = get_covering_vnode(vnode)) {
2283			put_vnode(vnode);
2284			vnode = coveringNode;
2285		}
2286	}
2287
2288	*_vnode = vnode;
2289	if (_parentID)
2290		*_parentID = lastParentID;
2291
2292	return B_OK;
2293}
2294
2295
2296static status_t
2297vnode_path_to_vnode(struct vnode* vnode, char* path, bool traverseLeafLink,
2298	int count, bool kernel, struct vnode** _vnode, ino_t* _parentID)
2299{
2300	return vnode_path_to_vnode(vnode, path, traverseLeafLink, count,
2301		get_current_io_context(kernel), _vnode, _parentID);
2302}
2303
2304
2305static status_t
2306path_to_vnode(char* path, bool traverseLink, struct vnode** _vnode,
2307	ino_t* _parentID, bool kernel)
2308{
2309	struct vnode* start = NULL;
2310
2311	FUNCTION(("path_to_vnode(path = \"%s\")\n", path));
2312
2313	if (!path)
2314		return B_BAD_VALUE;
2315
2316	if (*path == '\0')
2317		return B_ENTRY_NOT_FOUND;
2318
2319	// figure out if we need to start at root or at cwd
2320	if (*path == '/') {
2321		if (sRoot == NULL) {
2322			// we're a bit early, aren't we?
2323			return B_ERROR;
2324		}
2325
2326		while (*++path == '/')
2327			;
2328		start = get_root_vnode(kernel);
2329
2330		if (*path == '\0') {
2331			*_vnode = start;
2332			return B_OK;
2333		}
2334
2335	} else {
2336		struct io_context* context = get_current_io_context(kernel);
2337
2338		mutex_lock(&context->io_mutex);
2339		start = context->cwd;
2340		if (start != NULL)
2341			inc_vnode_ref_count(start);
2342		mutex_unlock(&context->io_mutex);
2343
2344		if (start == NULL)
2345			return B_ERROR;
2346	}
2347
2348	return vnode_path_to_vnode(start, path, traverseLink, 0, kernel, _vnode,
2349		_parentID);
2350}
2351
2352
/*! Returns the vnode for the next-to-last segment of the path, and returns
	the last portion in \a filename.
2355	The path buffer must be able to store at least one additional character.
2356*/
2357static status_t
2358path_to_dir_vnode(char* path, struct vnode** _vnode, char* filename,
2359	bool kernel)
2360{
2361	status_t status = get_dir_path_and_leaf(path, filename);
2362	if (status != B_OK)
2363		return status;
2364
2365	return path_to_vnode(path, true, _vnode, NULL, kernel);
2366}
2367
2368
2369/*!	\brief Retrieves the directory vnode and the leaf name of an entry referred
2370		   to by a FD + path pair.
2371
	\a path must be given in either case. \a fd might be omitted, in which
	case \a path is either an absolute path or one relative to the current
	directory. If both are supplied and \a path is relative, it is reckoned
	off of the directory referred to by \a fd. If \a path is absolute, \a fd
	is ignored.
2377
2378	The caller has the responsibility to call put_vnode() on the returned
2379	directory vnode.
2380
2381	\param fd The FD. May be < 0.
2382	\param path The absolute or relative path. Must not be \c NULL. The buffer
2383	       is modified by this function. It must have at least room for a
2384	       string one character longer than the path it contains.
2385	\param _vnode A pointer to a variable the directory vnode shall be written
2386		   into.
2387	\param filename A buffer of size B_FILE_NAME_LENGTH or larger into which
2388		   the leaf name of the specified entry will be written.
2389	\param kernel \c true, if invoked from inside the kernel, \c false if
2390		   invoked from userland.
2391	\return \c B_OK, if everything went fine, another error code otherwise.
2392*/
2393static status_t
2394fd_and_path_to_dir_vnode(int fd, char* path, struct vnode** _vnode,
2395	char* filename, bool kernel)
2396{
2397	if (!path)
2398		return B_BAD_VALUE;
2399	if (*path == '\0')
2400		return B_ENTRY_NOT_FOUND;
2401	if (fd < 0)
2402		return path_to_dir_vnode(path, _vnode, filename, kernel);
2403
2404	status_t status = get_dir_path_and_leaf(path, filename);
2405	if (status != B_OK)
2406		return status;
2407
2408	return fd_and_path_to_vnode(fd, path, true, _vnode, NULL, kernel);
2409}
2410
2411
2412/*!	\brief Retrieves the directory vnode and the leaf name of an entry referred
2413		   to by a vnode + path pair.
2414
	\a path must be given in either case. \a vnode might be omitted, in which
	case \a path is either an absolute path or one relative to the current
	directory. If both are supplied and \a path is relative, it is reckoned
	off of the directory referred to by \a vnode. If \a path is absolute,
	\a vnode is ignored.
2420
2421	The caller has the responsibility to call put_vnode() on the returned
2422	directory vnode.
2423
2424	\param vnode The vnode. May be \c NULL.
2425	\param path The absolute or relative path. Must not be \c NULL. The buffer
2426	       is modified by this function. It must have at least room for a
2427	       string one character longer than the path it contains.
2428	\param _vnode A pointer to a variable the directory vnode shall be written
2429		   into.
2430	\param filename A buffer of size B_FILE_NAME_LENGTH or larger into which
2431		   the leaf name of the specified entry will be written.
2432	\param kernel \c true, if invoked from inside the kernel, \c false if
2433		   invoked from userland.
2434	\return \c B_OK, if everything went fine, another error code otherwise.
2435*/
2436static status_t
2437vnode_and_path_to_dir_vnode(struct vnode* vnode, char* path,
2438	struct vnode** _vnode, char* filename, bool kernel)
2439{
2440	if (!path)
2441		return B_BAD_VALUE;
2442	if (*path == '\0')
2443		return B_ENTRY_NOT_FOUND;
2444	if (vnode == NULL || path[0] == '/')
2445		return path_to_dir_vnode(path, _vnode, filename, kernel);
2446
2447	status_t status = get_dir_path_and_leaf(path, filename);
2448	if (status != B_OK)
2449		return status;
2450
2451	inc_vnode_ref_count(vnode);
2452		// vnode_path_to_vnode() always decrements the ref count
2453
2454	return vnode_path_to_vnode(vnode, path, true, 0, kernel, _vnode, NULL);
2455}
2456
2457
2458/*! Returns a vnode's name in the d_name field of a supplied dirent buffer.
2459*/
2460static status_t
2461get_vnode_name(struct vnode* vnode, struct vnode* parent, struct dirent* buffer,
2462	size_t bufferSize, struct io_context* ioContext)
2463{
2464	if (bufferSize < sizeof(struct dirent))
2465		return B_BAD_VALUE;
2466
	// See if the vnode is covering another vnode and move to the covered
	// vnode so we get the underlying file system
2469	VNodePutter vnodePutter;
2470	if (Vnode* coveredVnode = get_covered_vnode(vnode)) {
2471		vnode = coveredVnode;
2472		vnodePutter.SetTo(vnode);
2473	}
2474
2475	if (HAS_FS_CALL(vnode, get_vnode_name)) {
2476		// The FS supports getting the name of a vnode.
2477		if (FS_CALL(vnode, get_vnode_name, buffer->d_name,
2478			(char*)buffer + bufferSize - buffer->d_name) == B_OK)
2479			return B_OK;
2480	}
2481
2482	// The FS doesn't support getting the name of a vnode. So we search the
2483	// parent directory for the vnode, if the caller let us.
2484
2485	if (parent == NULL || !HAS_FS_CALL(parent, read_dir))
2486		return B_UNSUPPORTED;
2487
2488	void* cookie;
2489
2490	status_t status = FS_CALL(parent, open_dir, &cookie);
2491	if (status >= B_OK) {
2492		while (true) {
2493			uint32 num = 1;
2494			// We use the FS hook directly instead of dir_read(), since we don't
2495			// want the entries to be fixed. We have already resolved vnode to
2496			// the covered node.
2497			status = FS_CALL(parent, read_dir, cookie, buffer, bufferSize,
2498				&num);
2499			if (status != B_OK)
2500				break;
2501			if (num == 0) {
2502				status = B_ENTRY_NOT_FOUND;
2503				break;
2504			}
2505
2506			if (vnode->id == buffer->d_ino) {
2507				// found correct entry!
2508				break;
2509			}
2510		}
2511
		// the cookie was opened on the parent directory
		FS_CALL(parent, close_dir, cookie);
		FS_CALL(parent, free_dir_cookie, cookie);
2514	}
2515	return status;
2516}
2517
2518
2519static status_t
2520get_vnode_name(struct vnode* vnode, struct vnode* parent, char* name,
2521	size_t nameSize, bool kernel)
2522{
2523	char buffer[sizeof(struct dirent) + B_FILE_NAME_LENGTH];
2524	struct dirent* dirent = (struct dirent*)buffer;
2525
2526	status_t status = get_vnode_name(vnode, parent, dirent, sizeof(buffer),
2527		get_current_io_context(kernel));
2528	if (status != B_OK)
2529		return status;
2530
2531	if (strlcpy(name, dirent->d_name, nameSize) >= nameSize)
2532		return B_BUFFER_OVERFLOW;
2533
2534	return B_OK;
2535}
2536
2537
2538/*!	Gets the full path to a given directory vnode.
2539	It uses the fs_get_vnode_name() call to get the name of a vnode; if a
2540	file system doesn't support this call, it will fall back to iterating
2541	through the parent directory to get the name of the child.
2542
2543	To protect against circular loops, it supports a maximum tree depth
2544	of 256 levels.
2545
	Note that the path may no longer be correct by the time this function
	returns! It doesn't use any locking to guarantee a consistent path, as
	paths aren't safe anyway: the path to a file can change at any time.

	It might be a good idea, though, to check if the returned path exists
	in the calling function (it's not done here for efficiency reasons).
2552*/
2553static status_t
2554dir_vnode_to_path(struct vnode* vnode, char* buffer, size_t bufferSize,
2555	bool kernel)
2556{
2557	FUNCTION(("dir_vnode_to_path(%p, %p, %lu)\n", vnode, buffer, bufferSize));
2558
2559	if (vnode == NULL || buffer == NULL || bufferSize == 0)
2560		return B_BAD_VALUE;
2561
2562	if (!S_ISDIR(vnode->Type()))
2563		return B_NOT_A_DIRECTORY;
2564
2565	char* path = buffer;
2566	int32 insert = bufferSize;
2567	int32 maxLevel = 256;
2568	int32 length;
2569	status_t status;
2570	struct io_context* ioContext = get_current_io_context(kernel);
2571
2572	// we don't use get_vnode() here because this call is more
2573	// efficient and does all we need from get_vnode()
2574	inc_vnode_ref_count(vnode);
2575
2576	if (vnode != ioContext->root) {
		// we haven't hit the IO context root yet - resolve the vnode
		// to its covered vnode, if any
2579		if (Vnode* coveredVnode = get_covered_vnode(vnode)) {
2580			put_vnode(vnode);
2581			vnode = coveredVnode;
2582		}
2583	}
2584
2585	path[--insert] = '\0';
2586		// the path is filled right to left
2587
2588	while (true) {
2589		// the name buffer is also used for fs_read_dir()
2590		char nameBuffer[sizeof(struct dirent) + B_FILE_NAME_LENGTH];
2591		char* name = &((struct dirent*)nameBuffer)->d_name[0];
2592		struct vnode* parentVnode;
2593
2594		// lookup the parent vnode
2595		if (vnode == ioContext->root) {
2596			// we hit the IO context root
2597			parentVnode = vnode;
2598			inc_vnode_ref_count(vnode);
2599		} else {
2600			status = lookup_dir_entry(vnode, "..", &parentVnode);
2601			if (status != B_OK)
2602				goto out;
2603		}
2604
2605		// get the node's name
2606		status = get_vnode_name(vnode, parentVnode, (struct dirent*)nameBuffer,
2607			sizeof(nameBuffer), ioContext);
2608
2609		if (vnode != ioContext->root) {
			// we haven't hit the IO context root yet - resolve the vnode
			// to its covered vnode, if any
2612			if (Vnode* coveredVnode = get_covered_vnode(parentVnode)) {
2613				put_vnode(parentVnode);
2614				parentVnode = coveredVnode;
2615			}
2616		}
2617
2618		bool hitRoot = (parentVnode == vnode);
2619
2620		// release the current vnode, we only need its parent from now on
2621		put_vnode(vnode);
2622		vnode = parentVnode;
2623
2624		if (status != B_OK)
2625			goto out;
2626
2627		if (hitRoot) {
2628			// we have reached "/", which means we have constructed the full
2629			// path
2630			break;
2631		}
2632
2633		// TODO: add an explicit check for loops in about 10 levels to do
2634		// real loop detection
2635
		// don't go deeper than 'maxLevel' to prevent circular loops
2637		if (maxLevel-- < 0) {
2638			status = B_LINK_LIMIT;
2639			goto out;
2640		}
2641
2642		// add the name in front of the current path
2643		name[B_FILE_NAME_LENGTH - 1] = '\0';
2644		length = strlen(name);
2645		insert -= length;
2646		if (insert <= 0) {
2647			status = B_RESULT_NOT_REPRESENTABLE;
2648			goto out;
2649		}
2650		memcpy(path + insert, name, length);
2651		path[--insert] = '/';
2652	}
2653
2654	// the root dir will result in an empty path: fix it
2655	if (path[insert] == '\0')
2656		path[--insert] = '/';
2657
2658	TRACE(("  path is: %s\n", path + insert));
2659
2660	// move the path to the start of the buffer
2661	length = bufferSize - insert;
2662	memmove(buffer, path + insert, length);
2663
2664out:
2665	put_vnode(vnode);
2666	return status;
2667}
2668
2669
2670/*!	Checks the length of every path component, and adds a '.'
2671	if the path ends in a slash.
2672	The given path buffer must be able to store at least one
2673	additional character.
2674*/
2675static status_t
2676check_path(char* to)
2677{
2678	int32 length = 0;
2679
2680	// check length of every path component
2681
2682	while (*to) {
2683		char* begin;
2684		if (*to == '/')
2685			to++, length++;
2686
2687		begin = to;
2688		while (*to != '/' && *to)
2689			to++, length++;
2690
2691		if (to - begin > B_FILE_NAME_LENGTH)
2692			return B_NAME_TOO_LONG;
2693	}
2694
2695	if (length == 0)
2696		return B_ENTRY_NOT_FOUND;
2697
2698	// complete path if there is a slash at the end
2699
2700	if (*(to - 1) == '/') {
2701		if (length > B_PATH_NAME_LENGTH - 2)
2702			return B_NAME_TOO_LONG;
2703
2704		to[0] = '.';
2705		to[1] = '\0';
2706	}
2707
2708	return B_OK;
2709}
2710
2711
2712static struct file_descriptor*
2713get_fd_and_vnode(int fd, struct vnode** _vnode, bool kernel)
2714{
2715	struct file_descriptor* descriptor
2716		= get_fd(get_current_io_context(kernel), fd);
2717	if (descriptor == NULL)
2718		return NULL;
2719
2720	struct vnode* vnode = fd_vnode(descriptor);
2721	if (vnode == NULL) {
2722		put_fd(descriptor);
2723		return NULL;
2724	}
2725
2726	// ToDo: when we can close a file descriptor at any point, investigate
2727	//	if this is still valid to do (accessing the vnode without ref_count
2728	//	or locking)
2729	*_vnode = vnode;
2730	return descriptor;
2731}
2732
2733
2734static struct vnode*
2735get_vnode_from_fd(int fd, bool kernel)
2736{
2737	struct file_descriptor* descriptor;
2738	struct vnode* vnode;
2739
2740	descriptor = get_fd(get_current_io_context(kernel), fd);
2741	if (descriptor == NULL)
2742		return NULL;
2743
2744	vnode = fd_vnode(descriptor);
2745	if (vnode != NULL)
2746		inc_vnode_ref_count(vnode);
2747
2748	put_fd(descriptor);
2749	return vnode;
2750}
2751
2752
2753/*!	Gets the vnode from an FD + path combination. If \a fd is lower than zero,
2754	only the path will be considered. In this case, the \a path must not be
2755	NULL.
2756	If \a fd is a valid file descriptor, \a path may be NULL for directories,
2757	and should be NULL for files.
2758*/
2759static status_t
2760fd_and_path_to_vnode(int fd, char* path, bool traverseLeafLink,
2761	struct vnode** _vnode, ino_t* _parentID, bool kernel)
2762{
2763	if (fd < 0 && !path)
2764		return B_BAD_VALUE;
2765
2766	if (path != NULL && *path == '\0')
2767		return B_ENTRY_NOT_FOUND;
2768
2769	if (fd < 0 || (path != NULL && path[0] == '/')) {
2770		// no FD or absolute path
2771		return path_to_vnode(path, traverseLeafLink, _vnode, _parentID, kernel);
2772	}
2773
2774	// FD only, or FD + relative path
2775	struct vnode* vnode = get_vnode_from_fd(fd, kernel);
2776	if (!vnode)
2777		return B_FILE_ERROR;
2778
2779	if (path != NULL) {
2780		return vnode_path_to_vnode(vnode, path, traverseLeafLink, 0, kernel,
2781			_vnode, _parentID);
2782	}
2783
2784	// there is no relative path to take into account
2785
2786	*_vnode = vnode;
2787	if (_parentID)
2788		*_parentID = -1;
2789
2790	return B_OK;
2791}
2792
2793
2794static int
2795get_new_fd(int type, struct fs_mount* mount, struct vnode* vnode,
2796	void* cookie, int openMode, bool kernel)
2797{
2798	struct file_descriptor* descriptor;
2799	int fd;
2800
2801	// If the vnode is locked, we don't allow creating a new file/directory
2802	// file_descriptor for it
2803	if (vnode && vnode->mandatory_locked_by != NULL
2804		&& (type == FDTYPE_FILE || type == FDTYPE_DIR))
2805		return B_BUSY;
2806
2807	descriptor = alloc_fd();
2808	if (!descriptor)
2809		return B_NO_MEMORY;
2810
2811	if (vnode)
2812		descriptor->u.vnode = vnode;
2813	else
2814		descriptor->u.mount = mount;
2815	descriptor->cookie = cookie;
2816
2817	switch (type) {
2818		// vnode types
2819		case FDTYPE_FILE:
2820			descriptor->ops = &sFileOps;
2821			break;
2822		case FDTYPE_DIR:
2823			descriptor->ops = &sDirectoryOps;
2824			break;
2825		case FDTYPE_ATTR:
2826			descriptor->ops = &sAttributeOps;
2827			break;
2828		case FDTYPE_ATTR_DIR:
2829			descriptor->ops = &sAttributeDirectoryOps;
2830			break;
2831
2832		// mount types
2833		case FDTYPE_INDEX_DIR:
2834			descriptor->ops = &sIndexDirectoryOps;
2835			break;
2836		case FDTYPE_QUERY:
2837			descriptor->ops = &sQueryOps;
2838			break;
2839
2840		default:
2841			panic("get_new_fd() called with unknown type %d\n", type);
2842			break;
2843	}
2844	descriptor->type = type;
2845	descriptor->open_mode = openMode;
2846
2847	io_context* context = get_current_io_context(kernel);
2848	fd = new_fd(context, descriptor);
2849	if (fd < 0) {
2850		free(descriptor);
2851		return B_NO_MORE_FDS;
2852	}
2853
2854	mutex_lock(&context->io_mutex);
2855	fd_set_close_on_exec(context, fd, (openMode & O_CLOEXEC) != 0);
2856	mutex_unlock(&context->io_mutex);
2857
2858	return fd;
2859}
2860
2861
2862/*!	In-place normalizes \a path. It's otherwise semantically equivalent to
2863	vfs_normalize_path(). See there for more documentation.
2864*/
2865static status_t
2866normalize_path(char* path, size_t pathSize, bool traverseLink, bool kernel)
2867{
2868	VNodePutter dirPutter;
2869	struct vnode* dir = NULL;
2870	status_t error;
2871
2872	for (int i = 0; i < B_MAX_SYMLINKS; i++) {
2873		// get dir vnode + leaf name
2874		struct vnode* nextDir;
2875		char leaf[B_FILE_NAME_LENGTH];
2876		error = vnode_and_path_to_dir_vnode(dir, path, &nextDir, leaf, kernel);
2877		if (error != B_OK)
2878			return error;
2879
2880		dir = nextDir;
2881		strcpy(path, leaf);
2882		dirPutter.SetTo(dir);
2883
2884		// get file vnode, if we shall resolve links
2885		bool fileExists = false;
2886		struct vnode* fileVnode;
2887		VNodePutter fileVnodePutter;
2888		if (traverseLink) {
2889			inc_vnode_ref_count(dir);
2890			if (vnode_path_to_vnode(dir, path, false, 0, kernel, &fileVnode,
2891					NULL) == B_OK) {
2892				fileVnodePutter.SetTo(fileVnode);
2893				fileExists = true;
2894			}
2895		}
2896
2897		if (!fileExists || !traverseLink || !S_ISLNK(fileVnode->Type())) {
2898			// we're done -- construct the path
2899			bool hasLeaf = true;
2900			if (strcmp(leaf, ".") == 0 || strcmp(leaf, "..") == 0) {
2901				// special cases "." and ".." -- get the dir, forget the leaf
2902				inc_vnode_ref_count(dir);
2903				error = vnode_path_to_vnode(dir, leaf, false, 0, kernel,
2904					&nextDir, NULL);
2905				if (error != B_OK)
2906					return error;
2907				dir = nextDir;
2908				dirPutter.SetTo(dir);
2909				hasLeaf = false;
2910			}
2911
2912			// get the directory path
2913			error = dir_vnode_to_path(dir, path, B_PATH_NAME_LENGTH, kernel);
2914			if (error != B_OK)
2915				return error;
2916
2917			// append the leaf name
2918			if (hasLeaf) {
2919				// insert a directory separator if this is not the file system
2920				// root
2921				if ((strcmp(path, "/") != 0
2922					&& strlcat(path, "/", pathSize) >= pathSize)
2923					|| strlcat(path, leaf, pathSize) >= pathSize) {
2924					return B_NAME_TOO_LONG;
2925				}
2926			}
2927
2928			return B_OK;
2929		}
2930
2931		// read link
2932		if (HAS_FS_CALL(fileVnode, read_symlink)) {
2933			size_t bufferSize = B_PATH_NAME_LENGTH - 1;
2934			error = FS_CALL(fileVnode, read_symlink, path, &bufferSize);
2935			if (error != B_OK)
2936				return error;
2937			path[bufferSize] = '\0';
2938		} else
2939			return B_BAD_VALUE;
2940	}
2941
2942	return B_LINK_LIMIT;
2943}
2944
2945
2946#ifdef ADD_DEBUGGER_COMMANDS
2947
2948
2949static void
2950_dump_advisory_locking(advisory_locking* locking)
2951{
2952	if (locking == NULL)
2953		return;
2954
2955	kprintf("   lock:        %" B_PRId32, locking->lock);
2956	kprintf("   wait_sem:    %" B_PRId32, locking->wait_sem);
2957
2958	int32 index = 0;
2959	LockList::Iterator iterator = locking->locks.GetIterator();
2960	while (iterator.HasNext()) {
2961		struct advisory_lock* lock = iterator.Next();
2962
2963		kprintf("   [%2" B_PRId32 "] team:   %" B_PRId32 "\n", index++, lock->team);
2964		kprintf("        start:  %" B_PRIdOFF "\n", lock->start);
2965		kprintf("        end:    %" B_PRIdOFF "\n", lock->end);
2966		kprintf("        shared? %s\n", lock->shared ? "yes" : "no");
2967	}
2968}
2969
2970
2971static void
2972_dump_mount(struct fs_mount* mount)
2973{
2974	kprintf("MOUNT: %p\n", mount);
2975	kprintf(" id:            %" B_PRIdDEV "\n", mount->id);
2976	kprintf(" device_name:   %s\n", mount->device_name);
2977	kprintf(" root_vnode:    %p\n", mount->root_vnode);
2978	kprintf(" covers:        %p\n", mount->root_vnode->covers);
2979	kprintf(" partition:     %p\n", mount->partition);
2980	kprintf(" lock:          %p\n", &mount->rlock);
2981	kprintf(" flags:        %s%s\n", mount->unmounting ? " unmounting" : "",
2982		mount->owns_file_device ? " owns_file_device" : "");
2983
2984	fs_volume* volume = mount->volume;
2985	while (volume != NULL) {
2986		kprintf(" volume %p:\n", volume);
2987		kprintf("  layer:            %" B_PRId32 "\n", volume->layer);
2988		kprintf("  private_volume:   %p\n", volume->private_volume);
2989		kprintf("  ops:              %p\n", volume->ops);
2990		kprintf("  file_system:      %p\n", volume->file_system);
2991		kprintf("  file_system_name: %s\n", volume->file_system_name);
2992		volume = volume->super_volume;
2993	}
2994
2995	set_debug_variable("_volume", (addr_t)mount->volume->private_volume);
2996	set_debug_variable("_root", (addr_t)mount->root_vnode);
2997	set_debug_variable("_covers", (addr_t)mount->root_vnode->covers);
2998	set_debug_variable("_partition", (addr_t)mount->partition);
2999}
3000
3001
3002static bool
3003debug_prepend_vnode_name_to_path(char* buffer, size_t& bufferSize,
3004	const char* name)
3005{
3006	bool insertSlash = buffer[bufferSize] != '\0';
3007	size_t nameLength = strlen(name);
3008
3009	if (bufferSize < nameLength + (insertSlash ? 1 : 0))
3010		return false;
3011
3012	if (insertSlash)
3013		buffer[--bufferSize] = '/';
3014
3015	bufferSize -= nameLength;
3016	memcpy(buffer + bufferSize, name, nameLength);
3017
3018	return true;
3019}
3020
3021
3022static bool
3023debug_prepend_vnode_id_to_path(char* buffer, size_t& bufferSize, dev_t devID,
3024	ino_t nodeID)
3025{
3026	if (bufferSize == 0)
3027		return false;
3028
3029	bool insertSlash = buffer[bufferSize] != '\0';
3030	if (insertSlash)
3031		buffer[--bufferSize] = '/';
3032
3033	size_t size = snprintf(buffer, bufferSize,
3034		"<%" B_PRIdDEV ",%" B_PRIdINO ">", devID, nodeID);
3035	if (size > bufferSize) {
3036		if (insertSlash)
3037			bufferSize++;
3038		return false;
3039	}
3040
3041	if (size < bufferSize)
3042		memmove(buffer + bufferSize - size, buffer, size);
3043
3044	bufferSize -= size;
3045	return true;
3046}
3047
3048
3049static char*
3050debug_resolve_vnode_path(struct vnode* vnode, char* buffer, size_t bufferSize,
3051	bool& _truncated)
3052{
3053	// null-terminate the path
3054	buffer[--bufferSize] = '\0';
3055
3056	while (true) {
3057		while (vnode->covers != NULL)
3058			vnode = vnode->covers;
3059
3060		if (vnode == sRoot) {
3061			_truncated = bufferSize == 0;
3062			if (!_truncated)
3063				buffer[--bufferSize] = '/';
3064			return buffer + bufferSize;
3065		}
3066
3067		// resolve the name
3068		ino_t dirID;
3069		const char* name = vnode->mount->entry_cache.DebugReverseLookup(
3070			vnode->id, dirID);
3071		if (name == NULL) {
3072			// Failed to resolve the name -- prepend "<dev,node>/".
3073			_truncated = !debug_prepend_vnode_id_to_path(buffer, bufferSize,
3074				vnode->mount->id, vnode->id);
3075			return buffer + bufferSize;
3076		}
3077
3078		// prepend the name
3079		if (!debug_prepend_vnode_name_to_path(buffer, bufferSize, name)) {
3080			_truncated = true;
3081			return buffer + bufferSize;
3082		}
3083
3084		// resolve the directory node
3085		struct vnode* nextVnode = lookup_vnode(vnode->mount->id, dirID);
3086		if (nextVnode == NULL) {
3087			_truncated = !debug_prepend_vnode_id_to_path(buffer, bufferSize,
3088				vnode->mount->id, dirID);
3089			return buffer + bufferSize;
3090		}
3091
3092		vnode = nextVnode;
3093	}
3094}
3095
3096
3097static void
3098_dump_vnode(struct vnode* vnode, bool printPath)
3099{
3100	kprintf("VNODE: %p\n", vnode);
3101	kprintf(" device:        %" B_PRIdDEV "\n", vnode->device);
3102	kprintf(" id:            %" B_PRIdINO "\n", vnode->id);
3103	kprintf(" ref_count:     %" B_PRId32 "\n", vnode->ref_count);
3104	kprintf(" private_node:  %p\n", vnode->private_node);
3105	kprintf(" mount:         %p\n", vnode->mount);
3106	kprintf(" covered_by:    %p\n", vnode->covered_by);
3107	kprintf(" covers:        %p\n", vnode->covers);
3108	kprintf(" cache:         %p\n", vnode->cache);
3109	kprintf(" type:          %#" B_PRIx32 "\n", vnode->Type());
3110	kprintf(" flags:         %s%s%s\n", vnode->IsRemoved() ? "r" : "-",
3111		vnode->IsBusy() ? "b" : "-", vnode->IsUnpublished() ? "u" : "-");
3112	kprintf(" advisory_lock: %p\n", vnode->advisory_locking);
3113
3114	_dump_advisory_locking(vnode->advisory_locking);
3115
3116	if (printPath) {
3117		void* buffer = debug_malloc(B_PATH_NAME_LENGTH);
3118		if (buffer != NULL) {
3119			bool truncated;
3120			char* path = debug_resolve_vnode_path(vnode, (char*)buffer,
3121				B_PATH_NAME_LENGTH, truncated);
3122			if (path != NULL) {
3123				kprintf(" path:          ");
3124				if (truncated)
3125					kputs("<truncated>/");
3126				kputs(path);
3127				kputs("\n");
3128			} else
3129				kprintf("Failed to resolve vnode path.\n");
3130
3131			debug_free(buffer);
3132		} else
3133			kprintf("Failed to allocate memory for constructing the path.\n");
3134	}
3135
3136	set_debug_variable("_node", (addr_t)vnode->private_node);
3137	set_debug_variable("_mount", (addr_t)vnode->mount);
3138	set_debug_variable("_covered_by", (addr_t)vnode->covered_by);
3139	set_debug_variable("_covers", (addr_t)vnode->covers);
3140	set_debug_variable("_adv_lock", (addr_t)vnode->advisory_locking);
3141}
3142
3143
3144static int
3145dump_mount(int argc, char** argv)
3146{
3147	if (argc != 2 || !strcmp(argv[1], "--help")) {
3148		kprintf("usage: %s [id|address]\n", argv[0]);
3149		return 0;
3150	}
3151
3152	ulong val = parse_expression(argv[1]);
3153	uint32 id = val;
3154
3155	struct fs_mount* mount = (fs_mount*)hash_lookup(sMountsTable, (void*)&id);
3156	if (mount == NULL) {
3157		if (IS_USER_ADDRESS(id)) {
3158			kprintf("fs_mount not found\n");
3159			return 0;
3160		}
3161		mount = (fs_mount*)val;
3162	}
3163
3164	_dump_mount(mount);
3165	return 0;
3166}
3167
3168
3169static int
3170dump_mounts(int argc, char** argv)
3171{
3172	if (argc != 1) {
3173		kprintf("usage: %s\n", argv[0]);
3174		return 0;
3175	}
3176
3177	kprintf("%-*s    id %-*s   %-*s   %-*s   fs_name\n",
3178		B_PRINTF_POINTER_WIDTH, "address", B_PRINTF_POINTER_WIDTH, "root",
3179		B_PRINTF_POINTER_WIDTH, "covers", B_PRINTF_POINTER_WIDTH, "cookie");
3180
3181	struct hash_iterator iterator;
3182	struct fs_mount* mount;
3183
3184	hash_open(sMountsTable, &iterator);
3185	while ((mount = (struct fs_mount*)hash_next(sMountsTable, &iterator))
3186			!= NULL) {
3187		kprintf("%p%4" B_PRIdDEV " %p %p %p %s\n", mount, mount->id, mount->root_vnode,
3188			mount->root_vnode->covers, mount->volume->private_volume,
3189			mount->volume->file_system_name);
3190
3191		fs_volume* volume = mount->volume;
3192		while (volume->super_volume != NULL) {
3193			volume = volume->super_volume;
3194			kprintf("                                     %p %s\n",
3195				volume->private_volume, volume->file_system_name);
3196		}
3197	}
3198
3199	hash_close(sMountsTable, &iterator, false);
3200	return 0;
3201}
3202
3203
3204static int
3205dump_vnode(int argc, char** argv)
3206{
3207	bool printPath = false;
3208	int argi = 1;
3209	if (argc >= 2 && strcmp(argv[argi], "-p") == 0) {
3210		printPath = true;
3211		argi++;
3212	}
3213
3214	if (argi >= argc || argi + 2 < argc) {
3215		print_debugger_command_usage(argv[0]);
3216		return 0;
3217	}
3218
3219	struct vnode* vnode = NULL;
3220
3221	if (argi + 1 == argc) {
3222		vnode = (struct vnode*)parse_expression(argv[argi]);
3223		if (IS_USER_ADDRESS(vnode)) {
3224			kprintf("invalid vnode address\n");
3225			return 0;
3226		}
3227		_dump_vnode(vnode, printPath);
3228		return 0;
3229	}
3230
3231	struct hash_iterator iterator;
3232	dev_t device = parse_expression(argv[argi]);
3233	ino_t id = parse_expression(argv[argi + 1]);
3234
3235	hash_open(sVnodeTable, &iterator);
3236	while ((vnode = (struct vnode*)hash_next(sVnodeTable, &iterator)) != NULL) {
3237		if (vnode->id != id || vnode->device != device)
3238			continue;
3239
3240		_dump_vnode(vnode, printPath);
3241	}
3242
3243	hash_close(sVnodeTable, &iterator, false);
3244	return 0;
3245}
3246
3247
3248static int
3249dump_vnodes(int argc, char** argv)
3250{
3251	if (argc != 2 || !strcmp(argv[1], "--help")) {
3252		kprintf("usage: %s [device]\n", argv[0]);
3253		return 0;
3254	}
3255
3256	// restrict dumped nodes to a certain device if requested
3257	dev_t device = parse_expression(argv[1]);
3258
3259	struct hash_iterator iterator;
3260	struct vnode* vnode;
3261
3262	kprintf("%-*s   dev     inode  ref %-*s   %-*s   %-*s   flags\n",
3263		B_PRINTF_POINTER_WIDTH, "address", B_PRINTF_POINTER_WIDTH, "cache",
3264		B_PRINTF_POINTER_WIDTH, "fs-node", B_PRINTF_POINTER_WIDTH, "locking");
3265
3266	hash_open(sVnodeTable, &iterator);
3267	while ((vnode = (struct vnode*)hash_next(sVnodeTable, &iterator)) != NULL) {
3268		if (vnode->device != device)
3269			continue;
3270
3271		kprintf("%p%4" B_PRIdDEV "%10" B_PRIdINO "%5" B_PRId32 " %p %p %p %s%s%s\n",
3272			vnode, vnode->device, vnode->id, vnode->ref_count, vnode->cache,
3273			vnode->private_node, vnode->advisory_locking,
3274			vnode->IsRemoved() ? "r" : "-", vnode->IsBusy() ? "b" : "-",
3275			vnode->IsUnpublished() ? "u" : "-");
3276	}
3277
3278	hash_close(sVnodeTable, &iterator, false);
3279	return 0;
3280}
3281
3282
3283static int
3284dump_vnode_caches(int argc, char** argv)
3285{
3286	struct hash_iterator iterator;
3287	struct vnode* vnode;
3288
	if (argc > 2 || (argc == 2 && !strcmp(argv[1], "--help"))) {
3290		kprintf("usage: %s [device]\n", argv[0]);
3291		return 0;
3292	}
3293
3294	// restrict dumped nodes to a certain device if requested
3295	dev_t device = -1;
3296	if (argc > 1)
3297		device = parse_expression(argv[1]);
3298
3299	kprintf("%-*s   dev     inode %-*s       size   pages\n",
3300		B_PRINTF_POINTER_WIDTH, "address", B_PRINTF_POINTER_WIDTH, "cache");
3301
3302	hash_open(sVnodeTable, &iterator);
3303	while ((vnode = (struct vnode*)hash_next(sVnodeTable, &iterator)) != NULL) {
3304		if (vnode->cache == NULL)
3305			continue;
3306		if (device != -1 && vnode->device != device)
3307			continue;
3308
3309		kprintf("%p%4" B_PRIdDEV "%10" B_PRIdINO " %p %8" B_PRIdOFF "%8" B_PRId32 "\n",
3310			vnode, vnode->device, vnode->id, vnode->cache,
3311			(vnode->cache->virtual_end + B_PAGE_SIZE - 1) / B_PAGE_SIZE,
3312			vnode->cache->page_count);
3313	}
3314
3315	hash_close(sVnodeTable, &iterator, false);
3316	return 0;
3317}
3318
3319
3320int
3321dump_io_context(int argc, char** argv)
3322{
	if (argc > 2 || (argc == 2 && !strcmp(argv[1], "--help"))) {
3324		kprintf("usage: %s [team-id|address]\n", argv[0]);
3325		return 0;
3326	}
3327
3328	struct io_context* context = NULL;
3329
3330	if (argc > 1) {
3331		ulong num = parse_expression(argv[1]);
3332		if (IS_KERNEL_ADDRESS(num))
3333			context = (struct io_context*)num;
3334		else {
3335			Team* team = team_get_team_struct_locked(num);
3336			if (team == NULL) {
3337				kprintf("could not find team with ID %lu\n", num);
3338				return 0;
3339			}
3340			context = (struct io_context*)team->io_context;
3341		}
3342	} else
3343		context = get_current_io_context(true);
3344
3345	kprintf("I/O CONTEXT: %p\n", context);
3346	kprintf(" root vnode:\t%p\n", context->root);
3347	kprintf(" cwd vnode:\t%p\n", context->cwd);
3348	kprintf(" used fds:\t%" B_PRIu32 "\n", context->num_used_fds);
3349	kprintf(" max fds:\t%" B_PRIu32 "\n", context->table_size);
3350
3351	if (context->num_used_fds) {
3352		kprintf("   no.  type    %*s  ref  open  mode         pos    %*s\n",
3353			B_PRINTF_POINTER_WIDTH, "ops", B_PRINTF_POINTER_WIDTH, "cookie");
3354	}
3355
3356	for (uint32 i = 0; i < context->table_size; i++) {
3357		struct file_descriptor* fd = context->fds[i];
3358		if (fd == NULL)
3359			continue;
3360
3361		kprintf("  %3" B_PRIu32 ":  %4" B_PRId32 "  %p  %3" B_PRId32 "  %4"
3362			B_PRIu32 "  %4" B_PRIx32 "  %10" B_PRIdOFF "  %p  %s %p\n", i,
3363			fd->type, fd->ops, fd->ref_count, fd->open_count, fd->open_mode,
3364			fd->pos, fd->cookie,
3365			fd->type >= FDTYPE_INDEX && fd->type <= FDTYPE_QUERY
3366				? "mount" : "vnode",
3367			fd->u.vnode);
3368	}
3369
3370	kprintf(" used monitors:\t%" B_PRIu32 "\n", context->num_monitors);
3371	kprintf(" max monitors:\t%" B_PRIu32 "\n", context->max_monitors);
3372
3373	set_debug_variable("_cwd", (addr_t)context->cwd);
3374
3375	return 0;
3376}
3377
3378
3379int
3380dump_vnode_usage(int argc, char** argv)
3381{
3382	if (argc != 1) {
3383		kprintf("usage: %s\n", argv[0]);
3384		return 0;
3385	}
3386
3387	kprintf("Unused vnodes: %" B_PRIu32 " (max unused %" B_PRIu32 ")\n",
3388		sUnusedVnodes, kMaxUnusedVnodes);
3389
3390	struct hash_iterator iterator;
3391	hash_open(sVnodeTable, &iterator);
3392
3393	uint32 count = 0;
3394	struct vnode* vnode;
3395	while ((vnode = (struct vnode*)hash_next(sVnodeTable, &iterator)) != NULL) {
3396		count++;
3397	}
3398
3399	hash_close(sVnodeTable, &iterator, false);
3400
3401	kprintf("%" B_PRIu32 " vnodes total (%" B_PRIu32 " in use).\n", count,
3402		count - sUnusedVnodes);
3403	return 0;
3404}
3405
3406#endif	// ADD_DEBUGGER_COMMANDS
3407
/*!	Zeroes the ranges of physical memory described by an iovec array.
	Returns in \a _bytes the number of bytes successfully cleared.
3410*/
3411static status_t
3412zero_pages(const iovec* vecs, size_t vecCount, size_t* _bytes)
3413{
3414	size_t bytes = *_bytes;
3415	size_t index = 0;
3416
3417	while (bytes > 0) {
3418		size_t length = min_c(vecs[index].iov_len, bytes);
3419
3420		status_t status = vm_memset_physical((addr_t)vecs[index].iov_base, 0,
3421			length);
3422		if (status != B_OK) {
3423			*_bytes -= bytes;
3424			return status;
3425		}
3426
		bytes -= length;
		index++;
	}
3429
3430	return B_OK;
3431}
3432
3433
3434/*!	Does the dirty work of combining the file_io_vecs with the iovecs
3435	and calls the file system hooks to read/write the request to disk.
3436*/
3437static status_t
3438common_file_io_vec_pages(struct vnode* vnode, void* cookie,
3439	const file_io_vec* fileVecs, size_t fileVecCount, const iovec* vecs,
3440	size_t vecCount, uint32* _vecIndex, size_t* _vecOffset, size_t* _numBytes,
3441	bool doWrite)
3442{
3443	if (fileVecCount == 0) {
3444		// There are no file vecs at this offset, so we're obviously trying
3445		// to access the file outside of its bounds
3446		return B_BAD_VALUE;
3447	}
3448
3449	size_t numBytes = *_numBytes;
3450	uint32 fileVecIndex;
3451	size_t vecOffset = *_vecOffset;
3452	uint32 vecIndex = *_vecIndex;
3453	status_t status;
3454	size_t size;
3455
3456	if (!doWrite && vecOffset == 0) {
3457		// now directly read the data from the device
3458		// the first file_io_vec can be read directly
3459
3460		if (fileVecs[0].length < (off_t)numBytes)
3461			size = fileVecs[0].length;
3462		else
3463			size = numBytes;
3464
3465		if (fileVecs[0].offset >= 0) {
3466			status = FS_CALL(vnode, read_pages, cookie, fileVecs[0].offset,
3467				&vecs[vecIndex], vecCount - vecIndex, &size);
3468		} else {
3469			// sparse read
3470			status = zero_pages(&vecs[vecIndex], vecCount - vecIndex, &size);
3471		}
3472		if (status != B_OK)
3473			return status;
3474
3475		// TODO: this is a work-around for buggy device drivers!
3476		//	When our own drivers honour the length, we can:
3477		//	a) also use this direct I/O for writes (otherwise, it would
3478		//	   overwrite precious data)
3479		//	b) panic if the term below is true (at least for writes)
3480		if ((off_t)size > fileVecs[0].length) {
3481			//dprintf("warning: device driver %p doesn't respect total length "
3482			//	"in read_pages() call!\n", ref->device);
3483			size = fileVecs[0].length;
3484		}
3485
3486		ASSERT((off_t)size <= fileVecs[0].length);
3487
3488		// If the file portion was contiguous, we're already done now
3489		if (size == numBytes)
3490			return B_OK;
3491
3492		// if we reached the end of the file, we can return as well
3493		if ((off_t)size != fileVecs[0].length) {
3494			*_numBytes = size;
3495			return B_OK;
3496		}
3497
3498		fileVecIndex = 1;
3499
3500		// first, find out where we have to continue in our iovecs
3501		for (; vecIndex < vecCount; vecIndex++) {
3502			if (size < vecs[vecIndex].iov_len)
3503				break;
3504
3505			size -= vecs[vecIndex].iov_len;
3506		}
3507
3508		vecOffset = size;
3509	} else {
3510		fileVecIndex = 0;
3511		size = 0;
3512	}
3513
3514	// Too bad, let's process the rest of the file_io_vecs
3515
3516	size_t totalSize = size;
3517	size_t bytesLeft = numBytes - size;
3518
3519	for (; fileVecIndex < fileVecCount; fileVecIndex++) {
3520		const file_io_vec &fileVec = fileVecs[fileVecIndex];
3521		off_t fileOffset = fileVec.offset;
3522		off_t fileLeft = min_c(fileVec.length, bytesLeft);
3523
3524		TRACE(("FILE VEC [%lu] length %Ld\n", fileVecIndex, fileLeft));
3525
3526		// process the complete fileVec
3527		while (fileLeft > 0) {
3528			iovec tempVecs[MAX_TEMP_IO_VECS];
3529			uint32 tempCount = 0;
3530
3531			// size tracks how much of what is left of the current fileVec
3532			// (fileLeft) has been assigned to tempVecs
3533			size = 0;
3534
3535			// assign what is left of the current fileVec to the tempVecs
3536			for (size = 0; (off_t)size < fileLeft && vecIndex < vecCount
3537					&& tempCount < MAX_TEMP_IO_VECS;) {
3538				// try to satisfy one iovec per iteration (or as much as
3539				// possible)
3540
3541				// bytes left of the current iovec
3542				size_t vecLeft = vecs[vecIndex].iov_len - vecOffset;
3543				if (vecLeft == 0) {
3544					vecOffset = 0;
3545					vecIndex++;
3546					continue;
3547				}
3548
3549				TRACE(("fill vec %ld, offset = %lu, size = %lu\n",
3550					vecIndex, vecOffset, size));
3551
3552				// actually available bytes
3553				size_t tempVecSize = min_c(vecLeft, fileLeft - size);
3554
3555				tempVecs[tempCount].iov_base
3556					= (void*)((addr_t)vecs[vecIndex].iov_base + vecOffset);
3557				tempVecs[tempCount].iov_len = tempVecSize;
3558				tempCount++;
3559
3560				size += tempVecSize;
3561				vecOffset += tempVecSize;
3562			}
3563
3564			size_t bytes = size;
3565
3566			if (fileOffset == -1) {
3567				if (doWrite) {
3568					panic("sparse write attempt: vnode %p", vnode);
3569					status = B_IO_ERROR;
3570				} else {
3571					// sparse read
3572					status = zero_pages(tempVecs, tempCount, &bytes);
3573				}
3574			} else if (doWrite) {
3575				status = FS_CALL(vnode, write_pages, cookie, fileOffset,
3576					tempVecs, tempCount, &bytes);
3577			} else {
3578				status = FS_CALL(vnode, read_pages, cookie, fileOffset,
3579					tempVecs, tempCount, &bytes);
3580			}
3581			if (status != B_OK)
3582				return status;
3583
3584			totalSize += bytes;
3585			bytesLeft -= size;
3586			if (fileOffset >= 0)
3587				fileOffset += size;
3588			fileLeft -= size;
3589			//dprintf("-> file left = %Lu\n", fileLeft);
3590
3591			if (size != bytes || vecIndex >= vecCount) {
3592				// there are no more bytes or iovecs, let's bail out
3593				*_numBytes = totalSize;
3594				return B_OK;
3595			}
3596		}
3597	}
3598
3599	*_vecIndex = vecIndex;
3600	*_vecOffset = vecOffset;
3601	*_numBytes = totalSize;
3602	return B_OK;
3603}
3604
3605
3606//	#pragma mark - public API for file systems
3607
3608
3609extern "C" status_t
3610new_vnode(fs_volume* volume, ino_t vnodeID, void* privateNode,
3611	fs_vnode_ops* ops)
3612{
3613	FUNCTION(("new_vnode(volume = %p (%ld), vnodeID = %Ld, node = %p)\n",
3614		volume, volume->id, vnodeID, privateNode));
3615
3616	if (privateNode == NULL)
3617		return B_BAD_VALUE;
3618
3619	// create the node
3620	bool nodeCreated;
3621	struct vnode* vnode;
3622	status_t status = create_new_vnode_and_lock(volume->id, vnodeID, vnode,
3623		nodeCreated);
3624	if (status != B_OK)
3625		return status;
3626
3627	WriteLocker nodeLocker(sVnodeLock, true);
3628		// create_new_vnode_and_lock() has locked for us
3629
3630	// file system integrity check:
3631	// test if the vnode already exists and bail out if this is the case!
3632	if (!nodeCreated) {
3633		panic("vnode %" B_PRIdDEV ":%" B_PRIdINO " already exists (node = %p, "
3634			"vnode->node = %p)!", volume->id, vnodeID, privateNode,
3635			vnode->private_node);
3636		return B_ERROR;
3637	}
3638
3639	vnode->private_node = privateNode;
3640	vnode->ops = ops;
3641	vnode->SetUnpublished(true);
3642
3643	TRACE(("returns: %s\n", strerror(status)));
3644
3645	return status;
3646}
3647
3648
3649extern "C" status_t
3650publish_vnode(fs_volume* volume, ino_t vnodeID, void* privateNode,
3651	fs_vnode_ops* ops, int type, uint32 flags)
3652{
3653	FUNCTION(("publish_vnode()\n"));
3654
3655	WriteLocker locker(sVnodeLock);
3656
3657	struct vnode* vnode = lookup_vnode(volume->id, vnodeID);
3658
3659	bool nodeCreated = false;
3660	if (vnode == NULL) {
3661		if (privateNode == NULL)
3662			return B_BAD_VALUE;
3663
3664		// create the node
3665		locker.Unlock();
3666			// create_new_vnode_and_lock() will re-lock for us on success
3667		status_t status = create_new_vnode_and_lock(volume->id, vnodeID, vnode,
3668			nodeCreated);
3669		if (status != B_OK)
3670			return status;
3671
3672		locker.SetTo(sVnodeLock, true);
3673	}
3674
3675	if (nodeCreated) {
3676		vnode->private_node = privateNode;
3677		vnode->ops = ops;
3678		vnode->SetUnpublished(true);
3679	} else if (vnode->IsBusy() && vnode->IsUnpublished()
3680		&& vnode->private_node == privateNode && vnode->ops == ops) {
3681		// already known, but not published
3682	} else
3683		return B_BAD_VALUE;
3684
3685	bool publishSpecialSubNode = false;
3686
3687	vnode->SetType(type);
3688	vnode->SetRemoved((flags & B_VNODE_PUBLISH_REMOVED) != 0);
3689	publishSpecialSubNode = is_special_node_type(type)
3690		&& (flags & B_VNODE_DONT_CREATE_SPECIAL_SUB_NODE) == 0;
3691
3692	status_t status = B_OK;
3693
3694	// create sub vnodes, if necessary
3695	if (volume->sub_volume != NULL || publishSpecialSubNode) {
3696		locker.Unlock();
3697
3698		fs_volume* subVolume = volume;
3699		if (volume->sub_volume != NULL) {
3700			while (status == B_OK && subVolume->sub_volume != NULL) {
3701				subVolume = subVolume->sub_volume;
3702				status = subVolume->ops->create_sub_vnode(subVolume, vnodeID,
3703					vnode);
3704			}
3705		}
3706
3707		if (status == B_OK && publishSpecialSubNode)
3708			status = create_special_sub_node(vnode, flags);
3709
3710		if (status != B_OK) {
3711			// error -- clean up the created sub vnodes
3712			while (subVolume->super_volume != volume) {
3713				subVolume = subVolume->super_volume;
3714				subVolume->ops->delete_sub_vnode(subVolume, vnode);
3715			}
3716		}
3717
3718		if (status == B_OK) {
3719			ReadLocker vnodesReadLocker(sVnodeLock);
3720			AutoLocker<Vnode> nodeLocker(vnode);
3721			vnode->SetBusy(false);
3722			vnode->SetUnpublished(false);
3723		} else {
3724			locker.Lock();
3725			hash_remove(sVnodeTable, vnode);
3726			remove_vnode_from_mount_list(vnode, vnode->mount);
3727			free(vnode);
3728		}
3729	} else {
3730		// we still hold the write lock -- mark the node unbusy and published
3731		vnode->SetBusy(false);
3732		vnode->SetUnpublished(false);
3733	}
3734
3735	TRACE(("returns: %s\n", strerror(status)));
3736
3737	return status;
3738}
3739
3740
3741extern "C" status_t
3742get_vnode(fs_volume* volume, ino_t vnodeID, void** _privateNode)
3743{
3744	struct vnode* vnode;
3745
3746	if (volume == NULL)
3747		return B_BAD_VALUE;
3748
3749	status_t status = get_vnode(volume->id, vnodeID, &vnode, true, true);
3750	if (status != B_OK)
3751		return status;
3752
3753	// If this is a layered FS, we need to get the node cookie for the requested
3754	// layer.
3755	if (HAS_FS_CALL(vnode, get_super_vnode)) {
3756		fs_vnode resolvedNode;
3757		status_t status = FS_CALL(vnode, get_super_vnode, volume,
3758			&resolvedNode);
3759		if (status != B_OK) {
3760			panic("get_vnode(): Failed to get super node for vnode %p, "
3761				"volume: %p", vnode, volume);
3762			put_vnode(vnode);
3763			return status;
3764		}
3765
3766		if (_privateNode != NULL)
3767			*_privateNode = resolvedNode.private_node;
3768	} else if (_privateNode != NULL)
3769		*_privateNode = vnode->private_node;
3770
3771	return B_OK;
3772}
3773
3774
3775extern "C" status_t
3776acquire_vnode(fs_volume* volume, ino_t vnodeID)
3777{
3778	struct vnode* vnode;
3779
3780	rw_lock_read_lock(&sVnodeLock);
3781	vnode = lookup_vnode(volume->id, vnodeID);
3782	rw_lock_read_unlock(&sVnodeLock);
3783
3784	if (vnode == NULL)
3785		return B_BAD_VALUE;
3786
3787	inc_vnode_ref_count(vnode);
3788	return B_OK;
3789}
3790
3791
3792extern "C" status_t
3793put_vnode(fs_volume* volume, ino_t vnodeID)
3794{
3795	struct vnode* vnode;
3796
3797	rw_lock_read_lock(&sVnodeLock);
3798	vnode = lookup_vnode(volume->id, vnodeID);
3799	rw_lock_read_unlock(&sVnodeLock);
3800
3801	if (vnode == NULL)
3802		return B_BAD_VALUE;
3803
3804	dec_vnode_ref_count(vnode, false, true);
3805	return B_OK;
3806}
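

// A minimal sketch (not compiled) of the matching get/put pattern a file
// system would use to temporarily access one of its own nodes; someNodeID is
// a hypothetical value.
#if 0
	void* privateNode;
	if (get_vnode(volume, someNodeID, &privateNode) == B_OK) {
		// ... work with privateNode ...
		put_vnode(volume, someNodeID);
	}
#endif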
3807
3808
3809extern "C" status_t
3810remove_vnode(fs_volume* volume, ino_t vnodeID)
3811{
3812	ReadLocker locker(sVnodeLock);
3813
3814	struct vnode* vnode = lookup_vnode(volume->id, vnodeID);
3815	if (vnode == NULL)
3816		return B_ENTRY_NOT_FOUND;
3817
3818	if (vnode->covered_by != NULL || vnode->covers != NULL) {
3819		// this vnode is in use
3820		return B_BUSY;
3821	}
3822
3823	vnode->Lock();
3824
3825	vnode->SetRemoved(true);
3826	bool removeUnpublished = false;
3827
3828	if (vnode->IsUnpublished()) {
3829		// prepare the vnode for deletion
3830		removeUnpublished = true;
3831		vnode->SetBusy(true);
3832	}
3833
3834	vnode->Unlock();
3835	locker.Unlock();
3836
3837	if (removeUnpublished) {
3838		// If the vnode hasn't been published yet, we delete it here
3839		atomic_add(&vnode->ref_count, -1);
3840		free_vnode(vnode, true);
3841	}
3842
3843	return B_OK;
3844}
3845
3846
3847extern "C" status_t
3848unremove_vnode(fs_volume* volume, ino_t vnodeID)
3849{
3850	struct vnode* vnode;
3851
3852	rw_lock_read_lock(&sVnodeLock);
3853
3854	vnode = lookup_vnode(volume->id, vnodeID);
3855	if (vnode) {
3856		AutoLocker<Vnode> nodeLocker(vnode);
3857		vnode->SetRemoved(false);
3858	}
3859
3860	rw_lock_read_unlock(&sVnodeLock);
3861	return B_OK;
3862}
3863
3864
3865extern "C" status_t
3866get_vnode_removed(fs_volume* volume, ino_t vnodeID, bool* _removed)
3867{
3868	ReadLocker _(sVnodeLock);
3869
3870	if (struct vnode* vnode = lookup_vnode(volume->id, vnodeID)) {
3871		if (_removed != NULL)
3872			*_removed = vnode->IsRemoved();
3873		return B_OK;
3874	}
3875
3876	return B_BAD_VALUE;
3877}
3878
3879
3880extern "C" fs_volume*
3881volume_for_vnode(fs_vnode* _vnode)
3882{
3883	if (_vnode == NULL)
3884		return NULL;
3885
3886	struct vnode* vnode = static_cast<struct vnode*>(_vnode);
3887	return vnode->mount->volume;
3888}
3889
3890
3891#if 0
3892extern "C" status_t
3893read_pages(int fd, off_t pos, const iovec* vecs, size_t count,
3894	size_t* _numBytes)
3895{
3896	struct file_descriptor* descriptor;
3897	struct vnode* vnode;
3898
3899	descriptor = get_fd_and_vnode(fd, &vnode, true);
3900	if (descriptor == NULL)
3901		return B_FILE_ERROR;
3902
3903	status_t status = vfs_read_pages(vnode, descriptor->cookie, pos, vecs,
3904		count, 0, _numBytes);
3905
3906	put_fd(descriptor);
3907	return status;
3908}
3909
3910
3911extern "C" status_t
3912write_pages(int fd, off_t pos, const iovec* vecs, size_t count,
3913	size_t* _numBytes)
3914{
3915	struct file_descriptor* descriptor;
3916	struct vnode* vnode;
3917
3918	descriptor = get_fd_and_vnode(fd, &vnode, true);
3919	if (descriptor == NULL)
3920		return B_FILE_ERROR;
3921
3922	status_t status = vfs_write_pages(vnode, descriptor->cookie, pos, vecs,
3923		count, 0, _numBytes);
3924
3925	put_fd(descriptor);
3926	return status;
3927}
3928#endif
3929
3930
3931extern "C" status_t
3932read_file_io_vec_pages(int fd, const file_io_vec* fileVecs, size_t fileVecCount,
3933	const iovec* vecs, size_t vecCount, uint32* _vecIndex, size_t* _vecOffset,
3934	size_t* _bytes)
3935{
3936	struct file_descriptor* descriptor;
3937	struct vnode* vnode;
3938
3939	descriptor = get_fd_and_vnode(fd, &vnode, true);
3940	if (descriptor == NULL)
3941		return B_FILE_ERROR;
3942
3943	status_t status = common_file_io_vec_pages(vnode, descriptor->cookie,
3944		fileVecs, fileVecCount, vecs, vecCount, _vecIndex, _vecOffset, _bytes,
3945		false);
3946
3947	put_fd(descriptor);
3948	return status;
3949}
3950
3951
3952extern "C" status_t
3953write_file_io_vec_pages(int fd, const file_io_vec* fileVecs, size_t fileVecCount,
3954	const iovec* vecs, size_t vecCount, uint32* _vecIndex, size_t* _vecOffset,
3955	size_t* _bytes)
3956{
3957	struct file_descriptor* descriptor;
3958	struct vnode* vnode;
3959
3960	descriptor = get_fd_and_vnode(fd, &vnode, true);
3961	if (descriptor == NULL)
3962		return B_FILE_ERROR;
3963
3964	status_t status = common_file_io_vec_pages(vnode, descriptor->cookie,
3965		fileVecs, fileVecCount, vecs, vecCount, _vecIndex, _vecOffset, _bytes,
3966		true);
3967
3968	put_fd(descriptor);
3969	return status;
3970}
3971
3972
3973extern "C" status_t
3974entry_cache_add(dev_t mountID, ino_t dirID, const char* name, ino_t nodeID)
3975{
3976	// lookup mount -- the caller is required to make sure that the mount
3977	// won't go away
3978	MutexLocker locker(sMountMutex);
3979	struct fs_mount* mount = find_mount(mountID);
3980	if (mount == NULL)
3981		return B_BAD_VALUE;
3982	locker.Unlock();
3983
3984	return mount->entry_cache.Add(dirID, name, nodeID);
3985}
3986
3987
3988extern "C" status_t
3989entry_cache_remove(dev_t mountID, ino_t dirID, const char* name)
3990{
3991	// lookup mount -- the caller is required to make sure that the mount
3992	// won't go away
3993	MutexLocker locker(sMountMutex);
3994	struct fs_mount* mount = find_mount(mountID);
3995	if (mount == NULL)
3996		return B_BAD_VALUE;
3997	locker.Unlock();
3998
3999	return mount->entry_cache.Remove(dirID, name);
4000}
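

// A minimal sketch (not compiled) of the intended pairing: a file system
// adds an entry to the cache once it has resolved it, and removes it again
// when the entry goes away. my_fs_remove_entry() and my_fs_inode_id() are
// hypothetical helpers.
#if 0
static status_t
my_fs_unlink(fs_volume* volume, fs_vnode* dir, const char* name)
{
	status_t status = my_fs_remove_entry(volume, dir, name);
	if (status == B_OK)
		entry_cache_remove(volume->id, my_fs_inode_id(dir), name);
	return status;
}
#endif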
4001
4002
4003//	#pragma mark - private VFS API
4004//	Functions the VFS exports for other parts of the kernel
4005
4006
4007/*! Acquires another reference to the vnode that has to be released
4008	by calling vfs_put_vnode().
4009*/
4010void
4011vfs_acquire_vnode(struct vnode* vnode)
4012{
4013	inc_vnode_ref_count(vnode);
4014}
4015
4016
4017/*! This is currently called from file_cache_create() only.
4018	It's probably a temporary solution as long as devfs requires that
4019	fs_read_pages()/fs_write_pages() are called with the standard
4020	open cookie and not with a device cookie.
4021	If that's done differently, remove this call; it has no other
4022	purpose.
4023*/
4024extern "C" status_t
4025vfs_get_cookie_from_fd(int fd, void** _cookie)
4026{
4027	struct file_descriptor* descriptor;
4028
4029	descriptor = get_fd(get_current_io_context(true), fd);
4030	if (descriptor == NULL)
4031		return B_FILE_ERROR;
4032
4033	*_cookie = descriptor->cookie;
4034	return B_OK;
4035}
4036
4037
4038extern "C" status_t
4039vfs_get_vnode_from_fd(int fd, bool kernel, struct vnode** vnode)
4040{
4041	*vnode = get_vnode_from_fd(fd, kernel);
4042
4043	if (*vnode == NULL)
4044		return B_FILE_ERROR;
4045
	return B_OK;
4047}
4048
4049
4050extern "C" status_t
4051vfs_get_vnode_from_path(const char* path, bool kernel, struct vnode** _vnode)
4052{
4053	TRACE(("vfs_get_vnode_from_path: entry. path = '%s', kernel %d\n",
4054		path, kernel));
4055
4056	KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
4057	if (pathBuffer.InitCheck() != B_OK)
4058		return B_NO_MEMORY;
4059
4060	char* buffer = pathBuffer.LockBuffer();
4061	strlcpy(buffer, path, pathBuffer.BufferSize());
4062
4063	struct vnode* vnode;
4064	status_t status = path_to_vnode(buffer, true, &vnode, NULL, kernel);
4065	if (status != B_OK)
4066		return status;
4067
4068	*_vnode = vnode;
4069	return B_OK;
4070}
4071
4072
4073extern "C" status_t
4074vfs_get_vnode(dev_t mountID, ino_t vnodeID, bool canWait, struct vnode** _vnode)
4075{
4076	struct vnode* vnode;
4077
4078	status_t status = get_vnode(mountID, vnodeID, &vnode, canWait, false);
4079	if (status != B_OK)
4080		return status;
4081
4082	*_vnode = vnode;
4083	return B_OK;
4084}
4085
4086
4087extern "C" status_t
4088vfs_entry_ref_to_vnode(dev_t mountID, ino_t directoryID,
4089	const char* name, struct vnode** _vnode)
4090{
4091	return entry_ref_to_vnode(mountID, directoryID, name, false, true, _vnode);
4092}
4093
4094
4095extern "C" void
4096vfs_vnode_to_node_ref(struct vnode* vnode, dev_t* _mountID, ino_t* _vnodeID)
4097{
4098	*_mountID = vnode->device;
4099	*_vnodeID = vnode->id;
4100}
4101
4102
4103/*!
4104	Helper function abstracting the process of "converting" a given
4105	vnode-pointer to a fs_vnode-pointer.
4106	Currently only used in bindfs.
4107*/
4108extern "C" fs_vnode*
4109vfs_fsnode_for_vnode(struct vnode* vnode)
4110{
4111	return vnode;
4112}
4113
4114
4115/*!
4116	Calls fs_open() on the given vnode and returns a new
4117	file descriptor for it
4118*/
4119int
4120vfs_open_vnode(struct vnode* vnode, int openMode, bool kernel)
4121{
4122	return open_vnode(vnode, openMode, kernel);
4123}
4124
4125
4126/*!	Looks up a vnode with the given mount and vnode ID.
4127	Must only be used with "in-use" vnodes as it doesn't grab a reference
4128	to the node.
	It's currently only used by file_cache_create().
4130*/
4131extern "C" status_t
4132vfs_lookup_vnode(dev_t mountID, ino_t vnodeID, struct vnode** _vnode)
4133{
4134	rw_lock_read_lock(&sVnodeLock);
4135	struct vnode* vnode = lookup_vnode(mountID, vnodeID);
4136	rw_lock_read_unlock(&sVnodeLock);
4137
4138	if (vnode == NULL)
4139		return B_ERROR;
4140
4141	*_vnode = vnode;
4142	return B_OK;
4143}
4144
4145
4146extern "C" status_t
4147vfs_get_fs_node_from_path(fs_volume* volume, const char* path,
4148	bool traverseLeafLink, bool kernel, void** _node)
4149{
4150	TRACE(("vfs_get_fs_node_from_path(volume = %p, path = \"%s\", kernel %d)\n",
4151		volume, path, kernel));
4152
4153	KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
4154	if (pathBuffer.InitCheck() != B_OK)
4155		return B_NO_MEMORY;
4156
4157	fs_mount* mount;
4158	status_t status = get_mount(volume->id, &mount);
4159	if (status != B_OK)
4160		return status;
4161
4162	char* buffer = pathBuffer.LockBuffer();
4163	strlcpy(buffer, path, pathBuffer.BufferSize());
4164
4165	struct vnode* vnode = mount->root_vnode;
4166
4167	if (buffer[0] == '/')
4168		status = path_to_vnode(buffer, traverseLeafLink, &vnode, NULL, kernel);
4169	else {
4170		inc_vnode_ref_count(vnode);
4171			// vnode_path_to_vnode() releases a reference to the starting vnode
4172		status = vnode_path_to_vnode(vnode, buffer, traverseLeafLink, 0,
4173			kernel, &vnode, NULL);
4174	}
4175
4176	put_mount(mount);
4177
4178	if (status != B_OK)
4179		return status;
4180
4181	if (vnode->device != volume->id) {
4182		// wrong mount ID - must not gain access on foreign file system nodes
4183		put_vnode(vnode);
4184		return B_BAD_VALUE;
4185	}
4186
4187	// Use get_vnode() to resolve the cookie for the right layer.
4188	status = get_vnode(volume, vnode->id, _node);
4189	put_vnode(vnode);
4190
4191	return status;
4192}
4193
4194
4195status_t
4196vfs_read_stat(int fd, const char* path, bool traverseLeafLink,
4197	struct stat* stat, bool kernel)
4198{
4199	status_t status;
4200
4201	if (path) {
4202		// path given: get the stat of the node referred to by (fd, path)
4203		KPath pathBuffer(path, false, B_PATH_NAME_LENGTH + 1);
4204		if (pathBuffer.InitCheck() != B_OK)
4205			return B_NO_MEMORY;
4206
4207		status = common_path_read_stat(fd, pathBuffer.LockBuffer(),
4208			traverseLeafLink, stat, kernel);
4209	} else {
4210		// no path given: get the FD and use the FD operation
4211		struct file_descriptor* descriptor
4212			= get_fd(get_current_io_context(kernel), fd);
4213		if (descriptor == NULL)
4214			return B_FILE_ERROR;
4215
4216		if (descriptor->ops->fd_read_stat)
4217			status = descriptor->ops->fd_read_stat(descriptor, stat);
4218		else
4219			status = B_UNSUPPORTED;
4220
4221		put_fd(descriptor);
4222	}
4223
4224	return status;
4225}
4226
4227
4228/*!	Finds the full path to the file that contains the module \a moduleName,
4229	puts it into \a pathBuffer, and returns B_OK for success.
	If \a pathBuffer was too small, it returns \c B_BUFFER_OVERFLOW, and
	\c B_ENTRY_NOT_FOUND if no file could be found.
	\a pathBuffer is clobbered in any case and must not be relied on if this
	function returns unsuccessfully.
4234	\a basePath and \a pathBuffer must not point to the same space.
4235*/
4236status_t
4237vfs_get_module_path(const char* basePath, const char* moduleName,
4238	char* pathBuffer, size_t bufferSize)
4239{
4240	struct vnode* dir;
4241	struct vnode* file;
4242	status_t status;
4243	size_t length;
4244	char* path;
4245
4246	if (bufferSize == 0
4247		|| strlcpy(pathBuffer, basePath, bufferSize) >= bufferSize)
4248		return B_BUFFER_OVERFLOW;
4249
4250	status = path_to_vnode(pathBuffer, true, &dir, NULL, true);
4251	if (status != B_OK)
4252		return status;
4253
	// the path buffer has been clobbered by the above call
4255	length = strlcpy(pathBuffer, basePath, bufferSize);
4256	if (pathBuffer[length - 1] != '/')
4257		pathBuffer[length++] = '/';
4258
4259	path = pathBuffer + length;
4260	bufferSize -= length;
4261
4262	while (moduleName) {
4263		char* nextPath = strchr(moduleName, '/');
4264		if (nextPath == NULL)
4265			length = strlen(moduleName);
4266		else {
4267			length = nextPath - moduleName;
4268			nextPath++;
4269		}
4270
4271		if (length + 1 >= bufferSize) {
4272			status = B_BUFFER_OVERFLOW;
4273			goto err;
4274		}
4275
4276		memcpy(path, moduleName, length);
4277		path[length] = '\0';
4278		moduleName = nextPath;
4279
4280		status = vnode_path_to_vnode(dir, path, true, 0, true, &file, NULL);
4281		if (status != B_OK) {
4282			// vnode_path_to_vnode() has already released the reference to dir
4283			return status;
4284		}
4285
4286		if (S_ISDIR(file->Type())) {
			// go to the next directory
4288			path[length] = '/';
4289			path[length + 1] = '\0';
4290			path += length + 1;
4291			bufferSize -= length + 1;
4292
4293			dir = file;
4294		} else if (S_ISREG(file->Type())) {
4295			// it's a file so it should be what we've searched for
4296			put_vnode(file);
4297
4298			return B_OK;
4299		} else {
4300			TRACE(("vfs_get_module_path(): something is strange here: "
4301				"0x%08lx...\n", file->Type()));
4302			status = B_ERROR;
4303			dir = file;
4304			goto err;
4305		}
4306	}
4307
4308	// if we got here, the moduleName just pointed to a directory, not to
4309	// a real module - what should we do in this case?
4310	status = B_ENTRY_NOT_FOUND;
4311
4312err:
4313	put_vnode(dir);
4314	return status;
4315}
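

// A minimal usage sketch (not compiled); the base path and module name are
// just examples.
#if 0
	char modulePath[B_PATH_NAME_LENGTH];
	status_t status = vfs_get_module_path("/boot/system/add-ons/kernel",
		"bus_managers/pci/v1", modulePath, sizeof(modulePath));
	// on success, modulePath names the file that contains the module
#endif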
4316
4317
4318/*!	\brief Normalizes a given path.
4319
4320	The path must refer to an existing or non-existing entry in an existing
4321	directory, that is chopping off the leaf component the remaining path must
4322	refer to an existing directory.
4323
	The returned path will be canonical in that it will be absolute, will not
	contain any "." or ".." components or duplicate occurrences of '/'s,
	and none of the directory components will be symbolic links.

	Any two paths referring to the same entry will result in the same
	normalized path (well, that is pretty much the definition of `normalized',
	isn't it :-).
4331
4332	\param path The path to be normalized.
4333	\param buffer The buffer into which the normalized path will be written.
4334		   May be the same one as \a path.
4335	\param bufferSize The size of \a buffer.
4336	\param traverseLink If \c true, the function also resolves leaf symlinks.
4337	\param kernel \c true, if the IO context of the kernel shall be used,
4338		   otherwise that of the team this thread belongs to. Only relevant,
4339		   if the path is relative (to get the CWD).
4340	\return \c B_OK if everything went fine, another error code otherwise.
4341*/
4342status_t
4343vfs_normalize_path(const char* path, char* buffer, size_t bufferSize,
4344	bool traverseLink, bool kernel)
4345{
4346	if (!path || !buffer || bufferSize < 1)
4347		return B_BAD_VALUE;
4348
4349	if (path != buffer) {
4350		if (strlcpy(buffer, path, bufferSize) >= bufferSize)
4351			return B_BUFFER_OVERFLOW;
4352	}
4353
4354	return normalize_path(buffer, bufferSize, traverseLink, kernel);
4355}
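

// A minimal usage sketch (not compiled): normalization may happen in place;
// assuming no symlinks are involved, the path below comes back as
// "/boot/home/Desktop".
#if 0
	char path[B_PATH_NAME_LENGTH] = "/boot/home/../home/./Desktop";
	status_t status = vfs_normalize_path(path, path, sizeof(path), true, true);
#endif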
4356
4357
4358/*!	\brief Creates a special node in the file system.
4359
4360	The caller gets a reference to the newly created node (which is passed
4361	back through \a _createdVnode) and is responsible for releasing it.
4362
4363	\param path The path where to create the entry for the node. Can be \c NULL,
4364		in which case the node is created without an entry in the root FS -- it
4365		will automatically be deleted when the last reference has been released.
4366	\param subVnode The definition of the subnode. Can be \c NULL, in which case
4367		the target file system will just create the node with its standard
4368		operations. Depending on the type of the node a subnode might be created
4369		automatically, though.
4370	\param mode The type and permissions for the node to be created.
4371	\param flags Flags to be passed to the creating FS.
4372	\param kernel \c true, if called in the kernel context (relevant only if
4373		\a path is not \c NULL and not absolute).
4374	\param _superVnode Pointer to a pre-allocated structure to be filled by the
4375		file system creating the node, with the private data pointer and
4376		operations for the super node. Can be \c NULL.
	\param _createdVnode Pointer to pre-allocated storage where to store the
4378		pointer to the newly created node.
4379	\return \c B_OK, if everything went fine, another error code otherwise.
4380*/
4381status_t
4382vfs_create_special_node(const char* path, fs_vnode* subVnode, mode_t mode,
4383	uint32 flags, bool kernel, fs_vnode* _superVnode,
4384	struct vnode** _createdVnode)
4385{
4386	struct vnode* dirNode;
4387	char _leaf[B_FILE_NAME_LENGTH];
4388	char* leaf = NULL;
4389
4390	if (path) {
4391		// We've got a path. Get the dir vnode and the leaf name.
4392		KPath tmpPathBuffer(B_PATH_NAME_LENGTH + 1);
4393		if (tmpPathBuffer.InitCheck() != B_OK)
4394			return B_NO_MEMORY;
4395
4396		char* tmpPath = tmpPathBuffer.LockBuffer();
4397		if (strlcpy(tmpPath, path, B_PATH_NAME_LENGTH) >= B_PATH_NAME_LENGTH)
4398			return B_NAME_TOO_LONG;
4399
4400		// get the dir vnode and the leaf name
4401		leaf = _leaf;
4402		status_t error = path_to_dir_vnode(tmpPath, &dirNode, leaf, kernel);
4403		if (error != B_OK)
4404			return error;
4405	} else {
4406		// No path. Create the node in the root FS.
4407		dirNode = sRoot;
4408		inc_vnode_ref_count(dirNode);
4409	}
4410
4411	VNodePutter _(dirNode);
4412
4413	// check support for creating special nodes
4414	if (!HAS_FS_CALL(dirNode, create_special_node))
4415		return B_UNSUPPORTED;
4416
4417	// create the node
4418	fs_vnode superVnode;
4419	ino_t nodeID;
4420	status_t status = FS_CALL(dirNode, create_special_node, leaf, subVnode,
4421		mode, flags, _superVnode != NULL ? _superVnode : &superVnode, &nodeID);
4422	if (status != B_OK)
4423		return status;
4424
4425	// lookup the node
4426	rw_lock_read_lock(&sVnodeLock);
4427	*_createdVnode = lookup_vnode(dirNode->mount->id, nodeID);
4428	rw_lock_read_unlock(&sVnodeLock);
4429
4430	if (*_createdVnode == NULL) {
4431		panic("vfs_create_special_node(): lookup of node failed");
4432		return B_ERROR;
4433	}
4434
4435	return B_OK;
4436}
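

// A minimal sketch (not compiled): create an anonymous FIFO-type node in the
// root FS -- without an entry it goes away with its last reference.
#if 0
	struct vnode* fifoVnode;
	if (vfs_create_special_node(NULL, NULL, S_IFIFO | 0666, 0, true, NULL,
			&fifoVnode) == B_OK) {
		// ... use the node ...
		vfs_put_vnode(fifoVnode);
	}
#endif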
4437
4438
4439extern "C" void
4440vfs_put_vnode(struct vnode* vnode)
4441{
4442	put_vnode(vnode);
4443}
4444
4445
4446extern "C" status_t
4447vfs_get_cwd(dev_t* _mountID, ino_t* _vnodeID)
4448{
4449	// Get current working directory from io context
4450	struct io_context* context = get_current_io_context(false);
4451	status_t status = B_OK;
4452
4453	mutex_lock(&context->io_mutex);
4454
4455	if (context->cwd != NULL) {
4456		*_mountID = context->cwd->device;
4457		*_vnodeID = context->cwd->id;
4458	} else
4459		status = B_ERROR;
4460
4461	mutex_unlock(&context->io_mutex);
4462	return status;
4463}
4464
4465
4466status_t
4467vfs_unmount(dev_t mountID, uint32 flags)
4468{
4469	return fs_unmount(NULL, mountID, flags, true);
4470}
4471
4472
4473extern "C" status_t
4474vfs_disconnect_vnode(dev_t mountID, ino_t vnodeID)
4475{
4476	struct vnode* vnode;
4477
4478	status_t status = get_vnode(mountID, vnodeID, &vnode, true, true);
4479	if (status != B_OK)
4480		return status;
4481
4482	disconnect_mount_or_vnode_fds(vnode->mount, vnode);
4483	put_vnode(vnode);
4484	return B_OK;
4485}
4486
4487
4488extern "C" void
4489vfs_free_unused_vnodes(int32 level)
4490{
4491	vnode_low_resource_handler(NULL,
4492		B_KERNEL_RESOURCE_PAGES | B_KERNEL_RESOURCE_MEMORY
4493			| B_KERNEL_RESOURCE_ADDRESS_SPACE,
4494		level);
4495}
4496
4497
4498extern "C" bool
4499vfs_can_page(struct vnode* vnode, void* cookie)
4500{
	FUNCTION(("vfs_can_page: vnode 0x%p\n", vnode));
4502
4503	if (HAS_FS_CALL(vnode, can_page))
4504		return FS_CALL(vnode, can_page, cookie);
4505	return false;
4506}
4507
4508
4509extern "C" status_t
4510vfs_read_pages(struct vnode* vnode, void* cookie, off_t pos,
4511	const generic_io_vec* vecs, size_t count, uint32 flags,
4512	generic_size_t* _numBytes)
4513{
4514	FUNCTION(("vfs_read_pages: vnode %p, vecs %p, pos %Ld\n", vnode, vecs,
4515		pos));
4516
4517#if VFS_PAGES_IO_TRACING
4518	generic_size_t bytesRequested = *_numBytes;
4519#endif
4520
4521	IORequest request;
4522	status_t status = request.Init(pos, vecs, count, *_numBytes, false, flags);
4523	if (status == B_OK) {
4524		status = vfs_vnode_io(vnode, cookie, &request);
4525		if (status == B_OK)
4526			status = request.Wait();
4527		*_numBytes = request.TransferredBytes();
4528	}
4529
4530	TPIO(ReadPages(vnode, cookie, pos, vecs, count, flags, bytesRequested,
4531		status, *_numBytes));
4532
4533	return status;
4534}
4535
4536
4537extern "C" status_t
4538vfs_write_pages(struct vnode* vnode, void* cookie, off_t pos,
4539	const generic_io_vec* vecs, size_t count, uint32 flags,
4540	generic_size_t* _numBytes)
4541{
4542	FUNCTION(("vfs_write_pages: vnode %p, vecs %p, pos %Ld\n", vnode, vecs,
4543		pos));
4544
4545#if VFS_PAGES_IO_TRACING
4546	generic_size_t bytesRequested = *_numBytes;
4547#endif
4548
4549	IORequest request;
4550	status_t status = request.Init(pos, vecs, count, *_numBytes, true, flags);
4551	if (status == B_OK) {
4552		status = vfs_vnode_io(vnode, cookie, &request);
4553		if (status == B_OK)
4554			status = request.Wait();
4555		*_numBytes = request.TransferredBytes();
4556	}
4557
4558	TPIO(WritePages(vnode, cookie, pos, vecs, count, flags, bytesRequested,
4559		status, *_numBytes));
4560
4561	return status;
4562}
4563
4564
/*!	Gets the vnode's VMCache object. If the vnode doesn't have a cache yet,
	one will be created, but only if \a allocate is \c true.
	On success a reference to the cache is acquired on behalf of the caller,
	who must release it again.
*/
4570extern "C" status_t
4571vfs_get_vnode_cache(struct vnode* vnode, VMCache** _cache, bool allocate)
4572{
4573	if (vnode->cache != NULL) {
4574		vnode->cache->AcquireRef();
4575		*_cache = vnode->cache;
4576		return B_OK;
4577	}
4578
4579	rw_lock_read_lock(&sVnodeLock);
4580	vnode->Lock();
4581
4582	status_t status = B_OK;
4583
4584	// The cache could have been created in the meantime
4585	if (vnode->cache == NULL) {
4586		if (allocate) {
4587			// TODO: actually the vnode needs to be busy already here, or
4588			//	else this won't work...
4589			bool wasBusy = vnode->IsBusy();
4590			vnode->SetBusy(true);
4591
4592			vnode->Unlock();
4593			rw_lock_read_unlock(&sVnodeLock);
4594
4595			status = vm_create_vnode_cache(vnode, &vnode->cache);
4596
4597			rw_lock_read_lock(&sVnodeLock);
4598			vnode->Lock();
4599			vnode->SetBusy(wasBusy);
4600		} else
4601			status = B_BAD_VALUE;
4602	}
4603
4604	vnode->Unlock();
4605	rw_lock_read_unlock(&sVnodeLock);
4606
4607	if (status == B_OK) {
4608		vnode->cache->AcquireRef();
4609		*_cache = vnode->cache;
4610	}
4611
4612	return status;
4613}
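

// A minimal usage sketch (not compiled): the reference acquired by
// vfs_get_vnode_cache() must be released by the caller when done.
#if 0
	VMCache* cache;
	if (vfs_get_vnode_cache(vnode, &cache, true) == B_OK) {
		// ... map or transfer pages through the cache ...
		cache->ReleaseRef();
	}
#endif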
4614
4615
4616status_t
4617vfs_get_file_map(struct vnode* vnode, off_t offset, size_t size,
4618	file_io_vec* vecs, size_t* _count)
4619{
4620	FUNCTION(("vfs_get_file_map: vnode %p, vecs %p, offset %" B_PRIdOFF
4621		", size = %" B_PRIuSIZE "\n", vnode, vecs, offset, size));
4622
4623	return FS_CALL(vnode, get_file_map, offset, size, vecs, _count);
4624}
4625
4626
4627status_t
4628vfs_stat_vnode(struct vnode* vnode, struct stat* stat)
4629{
4630	status_t status = FS_CALL(vnode, read_stat, stat);
4631
4632	// fill in the st_dev and st_ino fields
4633	if (status == B_OK) {
4634		stat->st_dev = vnode->device;
4635		stat->st_ino = vnode->id;
4636		stat->st_rdev = -1;
4637	}
4638
4639	return status;
4640}
4641
4642
4643status_t
4644vfs_stat_node_ref(dev_t device, ino_t inode, struct stat* stat)
4645{
4646	struct vnode* vnode;
4647	status_t status = get_vnode(device, inode, &vnode, true, false);
4648	if (status != B_OK)
4649		return status;
4650
4651	status = FS_CALL(vnode, read_stat, stat);
4652
4653	// fill in the st_dev and st_ino fields
4654	if (status == B_OK) {
4655		stat->st_dev = vnode->device;
4656		stat->st_ino = vnode->id;
4657		stat->st_rdev = -1;
4658	}
4659
4660	put_vnode(vnode);
4661	return status;
4662}
4663
4664
4665status_t
4666vfs_get_vnode_name(struct vnode* vnode, char* name, size_t nameSize)
4667{
4668	return get_vnode_name(vnode, NULL, name, nameSize, true);
4669}
4670
4671
4672status_t
4673vfs_entry_ref_to_path(dev_t device, ino_t inode, const char* leaf,
4674	char* path, size_t pathLength)
4675{
4676	struct vnode* vnode;
4677	status_t status;
4678
4679	// filter invalid leaf names
4680	if (leaf != NULL && (leaf[0] == '\0' || strchr(leaf, '/')))
4681		return B_BAD_VALUE;
4682
4683	// get the vnode matching the dir's node_ref
4684	if (leaf && (strcmp(leaf, ".") == 0 || strcmp(leaf, "..") == 0)) {
4685		// special cases "." and "..": we can directly get the vnode of the
4686		// referenced directory
4687		status = entry_ref_to_vnode(device, inode, leaf, false, true, &vnode);
4688		leaf = NULL;
4689	} else
4690		status = get_vnode(device, inode, &vnode, true, false);
4691	if (status != B_OK)
4692		return status;
4693
4694	// get the directory path
4695	status = dir_vnode_to_path(vnode, path, pathLength, true);
4696	put_vnode(vnode);
4697		// we don't need the vnode anymore
4698	if (status != B_OK)
4699		return status;
4700
4701	// append the leaf name
4702	if (leaf) {
4703		// insert a directory separator if this is not the file system root
4704		if ((strcmp(path, "/") && strlcat(path, "/", pathLength)
4705				>= pathLength)
4706			|| strlcat(path, leaf, pathLength) >= pathLength) {
4707			return B_NAME_TOO_LONG;
4708		}
4709	}
4710
4711	return B_OK;
4712}
4713
4714
4715/*!	If the given descriptor locked its vnode, that lock will be released. */
4716void
4717vfs_unlock_vnode_if_locked(struct file_descriptor* descriptor)
4718{
4719	struct vnode* vnode = fd_vnode(descriptor);
4720
4721	if (vnode != NULL && vnode->mandatory_locked_by == descriptor)
4722		vnode->mandatory_locked_by = NULL;
4723}
4724
4725
4726/*!	Closes all file descriptors of the specified I/O context that
4727	have the O_CLOEXEC flag set.
4728*/
4729void
4730vfs_exec_io_context(io_context* context)
4731{
4732	uint32 i;
4733
4734	for (i = 0; i < context->table_size; i++) {
4735		mutex_lock(&context->io_mutex);
4736
4737		struct file_descriptor* descriptor = context->fds[i];
4738		bool remove = false;
4739
4740		if (descriptor != NULL && fd_close_on_exec(context, i)) {
4741			context->fds[i] = NULL;
4742			context->num_used_fds--;
4743
4744			remove = true;
4745		}
4746
4747		mutex_unlock(&context->io_mutex);
4748
4749		if (remove) {
4750			close_fd(descriptor);
4751			put_fd(descriptor);
4752		}
4753	}
4754}
4755
4756
/*! Sets up a new io_context structure, and inherits the properties
	of the parent io_context if one is given.
*/
4760io_context*
4761vfs_new_io_context(io_context* parentContext, bool purgeCloseOnExec)
4762{
4763	io_context* context = (io_context*)malloc(sizeof(io_context));
4764	if (context == NULL)
4765		return NULL;
4766
4767	TIOC(NewIOContext(context, parentContext));
4768
4769	memset(context, 0, sizeof(io_context));
4770	context->ref_count = 1;
4771
4772	MutexLocker parentLocker;
4773
4774	size_t tableSize;
4775	if (parentContext) {
4776		parentLocker.SetTo(parentContext->io_mutex, false);
4777		tableSize = parentContext->table_size;
4778	} else
4779		tableSize = DEFAULT_FD_TABLE_SIZE;
4780
	// allocate space for the FDs, the select infos, and the close-on-exec
	// flags
4782	context->fds = (file_descriptor**)malloc(
4783		sizeof(struct file_descriptor*) * tableSize
4784		+ sizeof(struct select_sync*) * tableSize
4785		+ (tableSize + 7) / 8);
4786	if (context->fds == NULL) {
4787		free(context);
4788		return NULL;
4789	}
4790
4791	context->select_infos = (select_info**)(context->fds + tableSize);
4792	context->fds_close_on_exec = (uint8*)(context->select_infos + tableSize);
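	// All three tables live in the single allocation above; e.g. with
	// tableSize == 128 on a 32-bit machine that is 128 * 4 bytes of FD
	// pointers, followed by 128 * 4 bytes of select_info pointers, followed
	// by (128 + 7) / 8 == 16 bytes of close-on-exec flags.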
4793
4794	memset(context->fds, 0, sizeof(struct file_descriptor*) * tableSize
4795		+ sizeof(struct select_sync*) * tableSize
4796		+ (tableSize + 7) / 8);
4797
4798	mutex_init(&context->io_mutex, "I/O context");
4799
4800	// Copy all parent file descriptors
4801
4802	if (parentContext) {
4803		size_t i;
4804
4805		mutex_lock(&sIOContextRootLock);
4806		context->root = parentContext->root;
4807		if (context->root)
4808			inc_vnode_ref_count(context->root);
4809		mutex_unlock(&sIOContextRootLock);
4810
4811		context->cwd = parentContext->cwd;
4812		if (context->cwd)
4813			inc_vnode_ref_count(context->cwd);
4814
4815		for (i = 0; i < tableSize; i++) {
4816			struct file_descriptor* descriptor = parentContext->fds[i];
4817
4818			if (descriptor != NULL) {
4819				bool closeOnExec = fd_close_on_exec(parentContext, i);
4820				if (closeOnExec && purgeCloseOnExec)
4821					continue;
4822
4823				TFD(InheritFD(context, i, descriptor, parentContext));
4824
4825				context->fds[i] = descriptor;
4826				context->num_used_fds++;
4827				atomic_add(&descriptor->ref_count, 1);
4828				atomic_add(&descriptor->open_count, 1);
4829
4830				if (closeOnExec)
4831					fd_set_close_on_exec(context, i, true);
4832			}
4833		}
4834
4835		parentLocker.Unlock();
4836	} else {
4837		context->root = sRoot;
4838		context->cwd = sRoot;
4839
4840		if (context->root)
4841			inc_vnode_ref_count(context->root);
4842
4843		if (context->cwd)
4844			inc_vnode_ref_count(context->cwd);
4845	}
4846
4847	context->table_size = tableSize;
4848
4849	list_init(&context->node_monitors);
4850	context->max_monitors = DEFAULT_NODE_MONITORS;
4851
4852	return context;
4853}
4854
4855
4856static status_t
4857vfs_free_io_context(io_context* context)
4858{
4859	uint32 i;
4860
4861	TIOC(FreeIOContext(context));
4862
4863	if (context->root)
4864		put_vnode(context->root);
4865
4866	if (context->cwd)
4867		put_vnode(context->cwd);
4868
4869	mutex_lock(&context->io_mutex);
4870
4871	for (i = 0; i < context->table_size; i++) {
4872		if (struct file_descriptor* descriptor = context->fds[i]) {
4873			close_fd(descriptor);
4874			put_fd(descriptor);
4875		}
4876	}
4877
4878	mutex_destroy(&context->io_mutex);
4879
4880	remove_node_monitors(context);
4881	free(context->fds);
4882	free(context);
4883
4884	return B_OK;
4885}
4886
4887
4888void
4889vfs_get_io_context(io_context* context)
4890{
4891	atomic_add(&context->ref_count, 1);
4892}
4893
4894
4895void
4896vfs_put_io_context(io_context* context)
4897{
4898	if (atomic_add(&context->ref_count, -1) == 1)
4899		vfs_free_io_context(context);
4900}
4901
4902
4903static status_t
4904vfs_resize_fd_table(struct io_context* context, const int newSize)
4905{
4906	if (newSize <= 0 || newSize > MAX_FD_TABLE_SIZE)
4907		return B_BAD_VALUE;
4908
4909	TIOC(ResizeIOContext(context, newSize));
4910
4911	MutexLocker _(context->io_mutex);
4912
4913	int oldSize = context->table_size;
	int oldCloseOnExecBitmapSize = (oldSize + 7) / 8;
	int newCloseOnExecBitmapSize = (newSize + 7) / 8;
4916
4917	// If the tables shrink, make sure none of the fds being dropped are in use.
4918	if (newSize < oldSize) {
4919		for (int i = oldSize; i-- > newSize;) {
4920			if (context->fds[i])
4921				return B_BUSY;
4922		}
4923	}
4924
4925	// store pointers to the old tables
4926	file_descriptor** oldFDs = context->fds;
4927	select_info** oldSelectInfos = context->select_infos;
4928	uint8* oldCloseOnExecTable = context->fds_close_on_exec;
4929
4930	// allocate new tables
4931	file_descriptor** newFDs = (file_descriptor**)malloc(
4932		sizeof(struct file_descriptor*) * newSize
4933		+ sizeof(struct select_sync*) * newSize
		+ newCloseOnExecBitmapSize);
4935	if (newFDs == NULL)
4936		return B_NO_MEMORY;
4937
4938	context->fds = newFDs;
4939	context->select_infos = (select_info**)(context->fds + newSize);
4940	context->fds_close_on_exec = (uint8*)(context->select_infos + newSize);
4941	context->table_size = newSize;
4942
4943	// copy entries from old tables
4944	int toCopy = min_c(oldSize, newSize);
4945
4946	memcpy(context->fds, oldFDs, sizeof(void*) * toCopy);
4947	memcpy(context->select_infos, oldSelectInfos, sizeof(void*) * toCopy);
4948	memcpy(context->fds_close_on_exec, oldCloseOnExecTable,
		min_c(oldCloseOnExecBitmapSize, newCloseOnExecBitmapSize));
4950
4951	// clear additional entries, if the tables grow
4952	if (newSize > oldSize) {
4953		memset(context->fds + oldSize, 0, sizeof(void*) * (newSize - oldSize));
4954		memset(context->select_infos + oldSize, 0,
4955			sizeof(void*) * (newSize - oldSize));
		memset(context->fds_close_on_exec + oldCloseOnExecBitmapSize, 0,
			newCloseOnExecBitmapSize - oldCloseOnExecBitmapSize);
4958	}
4959
4960	free(oldFDs);
4961
4962	return B_OK;
4963}
4964
4965
4966static status_t
4967vfs_resize_monitor_table(struct io_context* context, const int newSize)
4968{
	status_t status = B_OK;
4970
4971	if (newSize <= 0 || newSize > MAX_NODE_MONITORS)
4972		return B_BAD_VALUE;
4973
4974	mutex_lock(&context->io_mutex);
4975
4976	if ((size_t)newSize < context->num_monitors) {
4977		status = B_BUSY;
4978		goto out;
4979	}
4980	context->max_monitors = newSize;
4981
4982out:
4983	mutex_unlock(&context->io_mutex);
4984	return status;
4985}
4986
4987
4988status_t
4989vfs_get_mount_point(dev_t mountID, dev_t* _mountPointMountID,
4990	ino_t* _mountPointNodeID)
4991{
4992	ReadLocker nodeLocker(sVnodeLock);
4993	MutexLocker mountLocker(sMountMutex);
4994
4995	struct fs_mount* mount = find_mount(mountID);
4996	if (mount == NULL)
4997		return B_BAD_VALUE;
4998
4999	Vnode* mountPoint = mount->covers_vnode;
5000
5001	*_mountPointMountID = mountPoint->device;
5002	*_mountPointNodeID = mountPoint->id;
5003
5004	return B_OK;
5005}
5006
5007
5008status_t
5009vfs_bind_mount_directory(dev_t mountID, ino_t nodeID, dev_t coveredMountID,
5010	ino_t coveredNodeID)
5011{
5012	// get the vnodes
5013	Vnode* vnode;
5014	status_t error = get_vnode(mountID, nodeID, &vnode, true, false);
5015	if (error != B_OK)
5016		return B_BAD_VALUE;
5017	VNodePutter vnodePutter(vnode);
5018
5019	Vnode* coveredVnode;
5020	error = get_vnode(coveredMountID, coveredNodeID, &coveredVnode, true,
5021		false);
5022	if (error != B_OK)
5023		return B_BAD_VALUE;
5024	VNodePutter coveredVnodePutter(coveredVnode);
5025
5026	// establish the covered/covering links
5027	WriteLocker locker(sVnodeLock);
5028
5029	if (vnode->covers != NULL || coveredVnode->covered_by != NULL
5030		|| vnode->mount->unmounting || coveredVnode->mount->unmounting) {
5031		return B_BUSY;
5032	}
5033
5034	vnode->covers = coveredVnode;
5035	vnode->SetCovering(true);
5036
5037	coveredVnode->covered_by = vnode;
5038	coveredVnode->SetCovered(true);
5039
	// the vnodes now reference each other
5041	inc_vnode_ref_count(vnode);
5042	inc_vnode_ref_count(coveredVnode);
5043
5044	return B_OK;
5045}
5046
5047
5048int
5049vfs_getrlimit(int resource, struct rlimit* rlp)
5050{
5051	if (!rlp)
5052		return B_BAD_ADDRESS;
5053
5054	switch (resource) {
5055		case RLIMIT_NOFILE:
5056		{
5057			struct io_context* context = get_current_io_context(false);
5058			MutexLocker _(context->io_mutex);
5059
5060			rlp->rlim_cur = context->table_size;
5061			rlp->rlim_max = MAX_FD_TABLE_SIZE;
5062			return 0;
5063		}
5064
5065		case RLIMIT_NOVMON:
5066		{
5067			struct io_context* context = get_current_io_context(false);
5068			MutexLocker _(context->io_mutex);
5069
5070			rlp->rlim_cur = context->max_monitors;
5071			rlp->rlim_max = MAX_NODE_MONITORS;
5072			return 0;
5073		}
5074
5075		default:
5076			return B_BAD_VALUE;
5077	}
5078}
5079
5080
5081int
5082vfs_setrlimit(int resource, const struct rlimit* rlp)
5083{
5084	if (!rlp)
5085		return B_BAD_ADDRESS;
5086
5087	switch (resource) {
5088		case RLIMIT_NOFILE:
5089			/* TODO: check getuid() */
5090			if (rlp->rlim_max != RLIM_SAVED_MAX
5091				&& rlp->rlim_max != MAX_FD_TABLE_SIZE)
5092				return B_NOT_ALLOWED;
5093
5094			return vfs_resize_fd_table(get_current_io_context(false),
5095				rlp->rlim_cur);
5096
5097		case RLIMIT_NOVMON:
5098			/* TODO: check getuid() */
5099			if (rlp->rlim_max != RLIM_SAVED_MAX
5100				&& rlp->rlim_max != MAX_NODE_MONITORS)
5101				return B_NOT_ALLOWED;
5102
5103			return vfs_resize_monitor_table(get_current_io_context(false),
5104				rlp->rlim_cur);
5105
5106		default:
5107			return B_BAD_VALUE;
5108	}
5109}
5110
5111
5112status_t
5113vfs_init(kernel_args* args)
5114{
5115	vnode::StaticInit();
5116
5117	struct vnode dummyVnode;
5118	sVnodeTable = hash_init(VNODE_HASH_TABLE_SIZE,
5119		offset_of_member(dummyVnode, next), &vnode_compare, &vnode_hash);
5120	if (sVnodeTable == NULL)
5121		panic("vfs_init: error creating vnode hash table\n");
5122
5123	list_init_etc(&sUnusedVnodeList, offset_of_member(dummyVnode, unused_link));
5124
5125	struct fs_mount dummyMount;
5126	sMountsTable = hash_init(MOUNTS_HASH_TABLE_SIZE,
5127		offset_of_member(dummyMount, next), &mount_compare, &mount_hash);
5128	if (sMountsTable == NULL)
5129		panic("vfs_init: error creating mounts hash table\n");
5130
5131	node_monitor_init();
5132
5133	sRoot = NULL;
5134
5135	recursive_lock_init(&sMountOpLock, "vfs_mount_op_lock");
5136
5137	if (block_cache_init() != B_OK)
5138		return B_ERROR;
5139
5140#ifdef ADD_DEBUGGER_COMMANDS
5141	// add some debugger commands
5142	add_debugger_command_etc("vnode", &dump_vnode,
5143		"Print info about the specified vnode",
5144		"[ \"-p\" ] ( <vnode> | <devID> <nodeID> )\n"
5145		"Prints information about the vnode specified by address <vnode> or\n"
5146		"<devID>, <vnodeID> pair. If \"-p\" is given, a path of the vnode is\n"
5147		"constructed and printed. It might not be possible to construct a\n"
5148		"complete path, though.\n",
5149		0);
5150	add_debugger_command("vnodes", &dump_vnodes,
5151		"list all vnodes (from the specified device)");
5152	add_debugger_command("vnode_caches", &dump_vnode_caches,
5153		"list all vnode caches");
5154	add_debugger_command("mount", &dump_mount,
5155		"info about the specified fs_mount");
5156	add_debugger_command("mounts", &dump_mounts, "list all fs_mounts");
5157	add_debugger_command("io_context", &dump_io_context,
5158		"info about the I/O context");
5159	add_debugger_command("vnode_usage", &dump_vnode_usage,
5160		"info about vnode usage");
5161#endif
5162
5163	register_low_resource_handler(&vnode_low_resource_handler, NULL,
5164		B_KERNEL_RESOURCE_PAGES | B_KERNEL_RESOURCE_MEMORY
5165			| B_KERNEL_RESOURCE_ADDRESS_SPACE,
5166		0);
5167
5168	file_map_init();
5169
5170	return file_cache_init();
5171}
5172
5173
5174//	#pragma mark - fd_ops implementations
5175
5176
5177/*!
5178	Calls fs_open() on the given vnode and returns a new
5179	file descriptor for it
5180*/
5181static int
5182open_vnode(struct vnode* vnode, int openMode, bool kernel)
5183{
5184	void* cookie;
5185	status_t status = FS_CALL(vnode, open, openMode, &cookie);
5186	if (status != B_OK)
5187		return status;
5188
5189	int fd = get_new_fd(FDTYPE_FILE, NULL, vnode, cookie, openMode, kernel);
5190	if (fd < 0) {
5191		FS_CALL(vnode, close, cookie);
5192		FS_CALL(vnode, free_cookie, cookie);
5193	}
5194	return fd;
5195}
5196
5197
/*!
	Creates a new file, calling the FS's create() hook, and returns a new
	file descriptor for it. If the entry already exists, it is just opened
	(unless \c O_EXCL was specified).
*/
5202static int
5203create_vnode(struct vnode* directory, const char* name, int openMode,
5204	int perms, bool kernel)
5205{
5206	bool traverse = ((openMode & (O_NOTRAVERSE | O_NOFOLLOW)) == 0);
5207	status_t status = B_ERROR;
5208	struct vnode* vnode;
5209	void* cookie;
5210	ino_t newID;
5211
5212	// This is somewhat tricky: If the entry already exists, the FS responsible
5213	// for the directory might not necessarily also be the one responsible for
5214	// the node the entry refers to (e.g. in case of mount points or FIFOs). So
5215	// we can actually never call the create() hook without O_EXCL. Instead we
5216	// try to look the entry up first. If it already exists, we just open the
5217	// node (unless O_EXCL), otherwise we call create() with O_EXCL. This
5218	// introduces a race condition, since someone else might have created the
	// entry in the meantime. If that happens, we expect the respective FS to
	// return the appropriate error code, and we retry (up to 3 times).
5221
5222	for (int i = 0; i < 3 && status != B_OK; i++) {
5223		// look the node up
5224		status = lookup_dir_entry(directory, name, &vnode);
5225		if (status == B_OK) {
5226			VNodePutter putter(vnode);
5227
5228			if ((openMode & O_EXCL) != 0)
5229				return B_FILE_EXISTS;
5230
5231			// If the node is a symlink, we have to follow it, unless
5232			// O_NOTRAVERSE is set.
5233			if (S_ISLNK(vnode->Type()) && traverse) {
5234				putter.Put();
5235				char clonedName[B_FILE_NAME_LENGTH + 1];
5236				if (strlcpy(clonedName, name, B_FILE_NAME_LENGTH)
5237						>= B_FILE_NAME_LENGTH) {
5238					return B_NAME_TOO_LONG;
5239				}
5240
5241				inc_vnode_ref_count(directory);
5242				status = vnode_path_to_vnode(directory, clonedName, true, 0,
5243					kernel, &vnode, NULL);
5244				if (status != B_OK)
5245					return status;
5246
5247				putter.SetTo(vnode);
5248			}
5249
5250			if ((openMode & O_NOFOLLOW) != 0 && S_ISLNK(vnode->Type())) {
5251				put_vnode(vnode);
5252				return B_LINK_LIMIT;
5253			}
5254
5255			int fd = open_vnode(vnode, openMode & ~O_CREAT, kernel);
5256			// on success keep the vnode reference for the FD
5257			if (fd >= 0)
5258				putter.Detach();
5259
5260			return fd;
5261		}
5262
5263		// it doesn't exist yet -- try to create it
5264
5265		if (!HAS_FS_CALL(directory, create))
5266			return B_READ_ONLY_DEVICE;
5267
5268		status = FS_CALL(directory, create, name, openMode | O_EXCL, perms,
5269			&cookie, &newID);
5270		if (status != B_OK
5271			&& ((openMode & O_EXCL) != 0 || status != B_FILE_EXISTS)) {
5272			return status;
5273		}
5274	}
5275
5276	if (status != B_OK)
5277		return status;
5278
5279	// the node has been created successfully
5280
5281	rw_lock_read_lock(&sVnodeLock);
5282	vnode = lookup_vnode(directory->device, newID);
5283	rw_lock_read_unlock(&sVnodeLock);
5284
5285	if (vnode == NULL) {
5286		panic("vfs: fs_create() returned success but there is no vnode, "
5287			"mount ID %" B_PRIdDEV "!\n", directory->device);
5288		return B_BAD_VALUE;
5289	}
5290
5291	int fd = get_new_fd(FDTYPE_FILE, NULL, vnode, cookie, openMode, kernel);
5292	if (fd >= 0)
5293		return fd;
5294
5295	status = fd;
5296
5297	// something went wrong, clean up
5298
5299	FS_CALL(vnode, close, cookie);
5300	FS_CALL(vnode, free_cookie, cookie);
5301	put_vnode(vnode);
5302
5303	FS_CALL(directory, unlink, name);
5304
5305	return status;
5306}
5307
5308
5309/*! Calls fs open_dir() on the given vnode and returns a new
5310	file descriptor for it
5311*/
5312static int
5313open_dir_vnode(struct vnode* vnode, bool kernel)
5314{
5315	void* cookie;
5316	status_t status = FS_CALL(vnode, open_dir, &cookie);
5317	if (status != B_OK)
5318		return status;
5319
5320	// directory is opened, create a fd
5321	status = get_new_fd(FDTYPE_DIR, NULL, vnode, cookie, O_CLOEXEC, kernel);
5322	if (status >= 0)
5323		return status;
5324
5325	FS_CALL(vnode, close_dir, cookie);
5326	FS_CALL(vnode, free_dir_cookie, cookie);
5327
5328	return status;
5329}
5330
5331
5332/*! Calls fs open_attr_dir() on the given vnode and returns a new
5333	file descriptor for it.
	Used by attr_dir_open() and attr_dir_open_fd().
5335*/
5336static int
5337open_attr_dir_vnode(struct vnode* vnode, bool kernel)
5338{
5339	if (!HAS_FS_CALL(vnode, open_attr_dir))
5340		return B_UNSUPPORTED;
5341
5342	void* cookie;
5343	status_t status = FS_CALL(vnode, open_attr_dir, &cookie);
5344	if (status != B_OK)
5345		return status;
5346
5347	// directory is opened, create a fd
5348	status = get_new_fd(FDTYPE_ATTR_DIR, NULL, vnode, cookie, O_CLOEXEC,
5349		kernel);
5350	if (status >= 0)
5351		return status;
5352
5353	FS_CALL(vnode, close_attr_dir, cookie);
5354	FS_CALL(vnode, free_attr_dir_cookie, cookie);
5355
5356	return status;
5357}
5358
5359
5360static int
5361file_create_entry_ref(dev_t mountID, ino_t directoryID, const char* name,
5362	int openMode, int perms, bool kernel)
5363{
5364	FUNCTION(("file_create_entry_ref: name = '%s', omode %x, perms %d, "
5365		"kernel %d\n", name, openMode, perms, kernel));
5366
5367	// get directory to put the new file in
5368	struct vnode* directory;
5369	status_t status = get_vnode(mountID, directoryID, &directory, true, false);
5370	if (status != B_OK)
5371		return status;
5372
5373	status = create_vnode(directory, name, openMode, perms, kernel);
5374	put_vnode(directory);
5375
5376	return status;
5377}
5378
5379
5380static int
5381file_create(int fd, char* path, int openMode, int perms, bool kernel)
5382{
5383	FUNCTION(("file_create: path '%s', omode %x, perms %d, kernel %d\n", path,
5384		openMode, perms, kernel));
5385
5386	// get directory to put the new file in
5387	char name[B_FILE_NAME_LENGTH];
5388	struct vnode* directory;
5389	status_t status = fd_and_path_to_dir_vnode(fd, path, &directory, name,
5390		kernel);
5391	if (status < 0)
5392		return status;
5393
5394	status = create_vnode(directory, name, openMode, perms, kernel);
5395
5396	put_vnode(directory);
5397	return status;
5398}
5399
5400
5401static int
5402file_open_entry_ref(dev_t mountID, ino_t directoryID, const char* name,
5403	int openMode, bool kernel)
5404{
5405	if (name == NULL || *name == '\0')
5406		return B_BAD_VALUE;
5407
5408	FUNCTION(("file_open_entry_ref(ref = (%ld, %Ld, %s), openMode = %d)\n",
5409		mountID, directoryID, name, openMode));
5410
5411	bool traverse = (openMode & (O_NOTRAVERSE | O_NOFOLLOW)) == 0;
5412
5413	// get the vnode matching the entry_ref
5414	struct vnode* vnode;
5415	status_t status = entry_ref_to_vnode(mountID, directoryID, name, traverse,
5416		kernel, &vnode);
5417	if (status != B_OK)
5418		return status;
5419
5420	if ((openMode & O_NOFOLLOW) != 0 && S_ISLNK(vnode->Type())) {
5421		put_vnode(vnode);
5422		return B_LINK_LIMIT;
5423	}
5424
5425	int newFD = open_vnode(vnode, openMode, kernel);
5426	if (newFD >= 0) {
5427		// The vnode reference has been transferred to the FD
5428		cache_node_opened(vnode, FDTYPE_FILE, vnode->cache, mountID,
5429			directoryID, vnode->id, name);
5430	} else
5431		put_vnode(vnode);
5432
5433	return newFD;
5434}
5435
5436
5437static int
5438file_open(int fd, char* path, int openMode, bool kernel)
5439{
5440	bool traverse = (openMode & (O_NOTRAVERSE | O_NOFOLLOW)) == 0;
5441
5442	FUNCTION(("file_open: fd: %d, entry path = '%s', omode %d, kernel %d\n",
5443		fd, path, openMode, kernel));
5444
5445	// get the vnode matching the vnode + path combination
5446	struct vnode* vnode;
5447	ino_t parentID;
5448	status_t status = fd_and_path_to_vnode(fd, path, traverse, &vnode,
5449		&parentID, kernel);
5450	if (status != B_OK)
5451		return status;
5452
5453	if ((openMode & O_NOFOLLOW) != 0 && S_ISLNK(vnode->Type())) {
5454		put_vnode(vnode);
5455		return B_LINK_LIMIT;
5456	}
5457
5458	// open the vnode
5459	int newFD = open_vnode(vnode, openMode, kernel);
5460	if (newFD >= 0) {
5461		// The vnode reference has been transferred to the FD
5462		cache_node_opened(vnode, FDTYPE_FILE, vnode->cache,
5463			vnode->device, parentID, vnode->id, NULL);
5464	} else
5465		put_vnode(vnode);
5466
5467	return newFD;
5468}
5469
5470
5471static status_t
5472file_close(struct file_descriptor* descriptor)
5473{
5474	struct vnode* vnode = descriptor->u.vnode;
5475	status_t status = B_OK;
5476
5477	FUNCTION(("file_close(descriptor = %p)\n", descriptor));
5478
5479	cache_node_closed(vnode, FDTYPE_FILE, vnode->cache, vnode->device,
5480		vnode->id);
5481	if (HAS_FS_CALL(vnode, close)) {
5482		status = FS_CALL(vnode, close, descriptor->cookie);
5483	}
5484
5485	if (status == B_OK) {
5486		// remove all outstanding locks for this team
5487		release_advisory_lock(vnode, NULL);
5488	}
5489	return status;
5490}
5491
5492
5493static void
5494file_free_fd(struct file_descriptor* descriptor)
5495{
5496	struct vnode* vnode = descriptor->u.vnode;
5497
5498	if (vnode != NULL) {
5499		FS_CALL(vnode, free_cookie, descriptor->cookie);
5500		put_vnode(vnode);
5501	}
5502}
5503
5504
5505static status_t
5506file_read(struct file_descriptor* descriptor, off_t pos, void* buffer,
5507	size_t* length)
5508{
5509	struct vnode* vnode = descriptor->u.vnode;
5510	FUNCTION(("file_read: buf %p, pos %Ld, len %p = %ld\n", buffer, pos, length,
5511		*length));
5512
5513	if (S_ISDIR(vnode->Type()))
5514		return B_IS_A_DIRECTORY;
5515
5516	return FS_CALL(vnode, read, descriptor->cookie, pos, buffer, length);
5517}
5518
5519
5520static status_t
5521file_write(struct file_descriptor* descriptor, off_t pos, const void* buffer,
5522	size_t* length)
5523{
5524	struct vnode* vnode = descriptor->u.vnode;
5525	FUNCTION(("file_write: buf %p, pos %Ld, len %p\n", buffer, pos, length));
5526
5527	if (S_ISDIR(vnode->Type()))
5528		return B_IS_A_DIRECTORY;
5529	if (!HAS_FS_CALL(vnode, write))
5530		return B_READ_ONLY_DEVICE;
5531
5532	return FS_CALL(vnode, write, descriptor->cookie, pos, buffer, length);
5533}
5534
5535
5536static off_t
5537file_seek(struct file_descriptor* descriptor, off_t pos, int seekType)
5538{
5539	struct vnode* vnode = descriptor->u.vnode;
5540	off_t offset;
5541
5542	FUNCTION(("file_seek(pos = %Ld, seekType = %d)\n", pos, seekType));
5543
5544	// some kinds of files are not seekable
5545	switch (vnode->Type() & S_IFMT) {
5546		case S_IFIFO:
5547		case S_IFSOCK:
5548			return ESPIPE;
5549
		// The Open Group Base Specs don't treat any file types besides pipes,
		// FIFOs, and sockets specially, so we allow seeking all other types.
5552		case S_IFREG:
5553		case S_IFBLK:
5554		case S_IFDIR:
5555		case S_IFLNK:
5556		case S_IFCHR:
5557			break;
5558	}
5559
5560	switch (seekType) {
5561		case SEEK_SET:
5562			offset = 0;
5563			break;
5564		case SEEK_CUR:
5565			offset = descriptor->pos;
5566			break;
5567		case SEEK_END:
5568		{
5569			// stat() the node
5570			if (!HAS_FS_CALL(vnode, read_stat))
5571				return B_UNSUPPORTED;
5572
5573			struct stat stat;
5574			status_t status = FS_CALL(vnode, read_stat, &stat);
5575			if (status != B_OK)
5576				return status;
5577
5578			offset = stat.st_size;
5579			break;
5580		}
5581		default:
5582			return B_BAD_VALUE;
5583	}
5584
5585	// assumes off_t is 64 bits wide
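	// (e.g. SEEK_END with pos == 1 on a file of size LONGLONG_MAX would
	// otherwise wrap around to a negative position)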
5586	if (offset > 0 && LONGLONG_MAX - offset < pos)
5587		return B_BUFFER_OVERFLOW;
5588
5589	pos += offset;
5590	if (pos < 0)
5591		return B_BAD_VALUE;
5592
5593	return descriptor->pos = pos;
5594}
5595
5596
5597static status_t
5598file_select(struct file_descriptor* descriptor, uint8 event,
5599	struct selectsync* sync)
5600{
5601	FUNCTION(("file_select(%p, %u, %p)\n", descriptor, event, sync));
5602
5603	struct vnode* vnode = descriptor->u.vnode;
5604
5605	// If the FS has no select() hook, notify select() now.
5606	if (!HAS_FS_CALL(vnode, select))
5607		return notify_select_event(sync, event);
5608
5609	return FS_CALL(vnode, select, descriptor->cookie, event, sync);
5610}
5611
5612
5613static status_t
5614file_deselect(struct file_descriptor* descriptor, uint8 event,
5615	struct selectsync* sync)
5616{
5617	struct vnode* vnode = descriptor->u.vnode;
5618
5619	if (!HAS_FS_CALL(vnode, deselect))
5620		return B_OK;
5621
5622	return FS_CALL(vnode, deselect, descriptor->cookie, event, sync);
5623}
5624
5625
5626static status_t
5627dir_create_entry_ref(dev_t mountID, ino_t parentID, const char* name, int perms,
5628	bool kernel)
5629{
5630	struct vnode* vnode;
5631	status_t status;
5632
5633	if (name == NULL || *name == '\0')
5634		return B_BAD_VALUE;
5635
5636	FUNCTION(("dir_create_entry_ref(dev = %ld, ino = %Ld, name = '%s', "
5637		"perms = %d)\n", mountID, parentID, name, perms));
5638
5639	status = get_vnode(mountID, parentID, &vnode, true, false);
5640	if (status != B_OK)
5641		return status;
5642
5643	if (HAS_FS_CALL(vnode, create_dir))
5644		status = FS_CALL(vnode, create_dir, name, perms);
5645	else
5646		status = B_READ_ONLY_DEVICE;
5647
5648	put_vnode(vnode);
5649	return status;
5650}
5651
5652
5653static status_t
5654dir_create(int fd, char* path, int perms, bool kernel)
5655{
5656	char filename[B_FILE_NAME_LENGTH];
5657	struct vnode* vnode;
5658	status_t status;
5659
5660	FUNCTION(("dir_create: path '%s', perms %d, kernel %d\n", path, perms,
5661		kernel));
5662
5663	status = fd_and_path_to_dir_vnode(fd, path, &vnode, filename, kernel);
5664	if (status < 0)
5665		return status;
5666
5667	if (HAS_FS_CALL(vnode, create_dir)) {
5668		status = FS_CALL(vnode, create_dir, filename, perms);
5669	} else
5670		status = B_READ_ONLY_DEVICE;
5671
5672	put_vnode(vnode);
5673	return status;
5674}
5675
5676
5677static int
5678dir_open_entry_ref(dev_t mountID, ino_t parentID, const char* name, bool kernel)
5679{
5680	FUNCTION(("dir_open_entry_ref()\n"));
5681
5682	if (name && name[0] == '\0')
5683		return B_BAD_VALUE;
5684
5685	// get the vnode matching the entry_ref/node_ref
5686	struct vnode* vnode;
5687	status_t status;
5688	if (name) {
5689		status = entry_ref_to_vnode(mountID, parentID, name, true, kernel,
5690			&vnode);
5691	} else
5692		status = get_vnode(mountID, parentID, &vnode, true, false);
5693	if (status != B_OK)
5694		return status;
5695
5696	int newFD = open_dir_vnode(vnode, kernel);
5697	if (newFD >= 0) {
5698		// The vnode reference has been transferred to the FD
5699		cache_node_opened(vnode, FDTYPE_DIR, vnode->cache, mountID, parentID,
5700			vnode->id, name);
5701	} else
5702		put_vnode(vnode);
5703
5704	return newFD;
5705}
5706
5707
5708static int
5709dir_open(int fd, char* path, bool kernel)
5710{
5711	FUNCTION(("dir_open: fd: %d, entry path = '%s', kernel %d\n", fd, path,
5712		kernel));
5713
5714	// get the vnode matching the vnode + path combination
5715	struct vnode* vnode = NULL;
5716	ino_t parentID;
5717	status_t status = fd_and_path_to_vnode(fd, path, true, &vnode, &parentID,
5718		kernel);
5719	if (status != B_OK)
5720		return status;
5721
5722	// open the dir
5723	int newFD = open_dir_vnode(vnode, kernel);
5724	if (newFD >= 0) {
5725		// The vnode reference has been transferred to the FD
5726		cache_node_opened(vnode, FDTYPE_DIR, vnode->cache, vnode->device,
5727			parentID, vnode->id, NULL);
5728	} else
5729		put_vnode(vnode);
5730
5731	return newFD;
5732}
5733
5734
5735static status_t
5736dir_close(struct file_descriptor* descriptor)
5737{
5738	struct vnode* vnode = descriptor->u.vnode;
5739
5740	FUNCTION(("dir_close(descriptor = %p)\n", descriptor));
5741
5742	cache_node_closed(vnode, FDTYPE_DIR, vnode->cache, vnode->device,
5743		vnode->id);
5744	if (HAS_FS_CALL(vnode, close_dir))
5745		return FS_CALL(vnode, close_dir, descriptor->cookie);
5746
5747	return B_OK;
5748}
5749
5750
5751static void
5752dir_free_fd(struct file_descriptor* descriptor)
5753{
5754	struct vnode* vnode = descriptor->u.vnode;
5755
5756	if (vnode != NULL) {
5757		FS_CALL(vnode, free_dir_cookie, descriptor->cookie);
5758		put_vnode(vnode);
5759	}
5760}
5761
5762
5763static status_t
5764dir_read(struct io_context* ioContext, struct file_descriptor* descriptor,
5765	struct dirent* buffer, size_t bufferSize, uint32* _count)
5766{
5767	return dir_read(ioContext, descriptor->u.vnode, descriptor->cookie, buffer,
5768		bufferSize, _count);
5769}
5770
5771
5772static status_t
5773fix_dirent(struct vnode* parent, struct dirent* entry,
5774	struct io_context* ioContext)
5775{
5776	// set d_pdev and d_pino
5777	entry->d_pdev = parent->device;
5778	entry->d_pino = parent->id;
5779
	// If this is the ".." entry and the directory is covering another vnode,
5781	// we need to replace d_dev and d_ino with the actual values.
5782	if (strcmp(entry->d_name, "..") == 0 && parent->IsCovering()) {
5783		// Make sure the IO context root is not bypassed.
5784		if (parent == ioContext->root) {
5785			entry->d_dev = parent->device;
5786			entry->d_ino = parent->id;
5787		} else {
5788			inc_vnode_ref_count(parent);
5789				// vnode_path_to_vnode() puts the node
5790
5791			// ".." is guaranteed not to be clobbered by this call
5792			struct vnode* vnode;
5793			status_t status = vnode_path_to_vnode(parent, (char*)"..", false, 0,
5794				ioContext, &vnode, NULL);
5795
5796			if (status == B_OK) {
5797				entry->d_dev = vnode->device;
5798				entry->d_ino = vnode->id;
5799				put_vnode(vnode);
5800			}
5801		}
5802	} else {
5803		// resolve covered vnodes
5804		ReadLocker _(&sVnodeLock);
5805
5806		struct vnode* vnode = lookup_vnode(entry->d_dev, entry->d_ino);
5807		if (vnode != NULL && vnode->covered_by != NULL) {
5808			do {
5809				vnode = vnode->covered_by;
5810			} while (vnode->covered_by != NULL);
5811
5812			entry->d_dev = vnode->device;
5813			entry->d_ino = vnode->id;
5814		}
5815	}
5816
5817	return B_OK;
5818}
5819
5820
5821static status_t
5822dir_read(struct io_context* ioContext, struct vnode* vnode, void* cookie,
5823	struct dirent* buffer, size_t bufferSize, uint32* _count)
5824{
5825	if (!HAS_FS_CALL(vnode, read_dir))
5826		return B_UNSUPPORTED;
5827
5828	status_t error = FS_CALL(vnode, read_dir, cookie, buffer, bufferSize,
5829		_count);
5830	if (error != B_OK)
5831		return error;
5832
5833	// we need to adjust the read dirents
5834	uint32 count = *_count;
5835	for (uint32 i = 0; i < count; i++) {
5836		error = fix_dirent(vnode, buffer, ioContext);
5837		if (error != B_OK)
5838			return error;
5839
5840		buffer = (struct dirent*)((uint8*)buffer + buffer->d_reclen);
5841	}
5842
5843	return error;
5844}
5845
5846
5847static status_t
5848dir_rewind(struct file_descriptor* descriptor)
5849{
5850	struct vnode* vnode = descriptor->u.vnode;
5851
5852	if (HAS_FS_CALL(vnode, rewind_dir)) {
5853		return FS_CALL(vnode, rewind_dir, descriptor->cookie);
5854	}
5855
5856	return B_UNSUPPORTED;
5857}
5858
5859
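/*!	\brief Removes the directory specified by a FD + path pair.

	Trailing slashes and "." components are stripped from \a path first;
	removing "." or ".." is refused with \c B_NOT_ALLOWED.
*/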
5860static status_t
5861dir_remove(int fd, char* path, bool kernel)
5862{
5863	char name[B_FILE_NAME_LENGTH];
5864	struct vnode* directory;
5865	status_t status;
5866
5867	if (path != NULL) {
		// we need to make sure our path name doesn't end in "/", ".",
		// or ".."
5870		char* lastSlash;
5871		while ((lastSlash = strrchr(path, '/')) != NULL) {
5872			char* leaf = lastSlash + 1;
5873			if (!strcmp(leaf, ".."))
5874				return B_NOT_ALLOWED;
5875
5876			// omit multiple slashes
5877			while (lastSlash > path && lastSlash[-1] == '/')
5878				lastSlash--;
5879
			if (leaf[0] != '\0' && strcmp(leaf, ".") != 0)
				break;
5884			// "name/" -> "name", or "name/." -> "name"
5885			lastSlash[0] = '\0';
5886		}
5887
5888		if (!strcmp(path, ".") || !strcmp(path, ".."))
5889			return B_NOT_ALLOWED;
5890	}
5891
5892	status = fd_and_path_to_dir_vnode(fd, path, &directory, name, kernel);
5893	if (status != B_OK)
5894		return status;
5895
5896	if (HAS_FS_CALL(directory, remove_dir))
5897		status = FS_CALL(directory, remove_dir, name);
5898	else
5899		status = B_READ_ONLY_DEVICE;
5900
5901	put_vnode(directory);
5902	return status;
5903}
5904
5905
5906static status_t
5907common_ioctl(struct file_descriptor* descriptor, ulong op, void* buffer,
5908	size_t length)
5909{
5910	struct vnode* vnode = descriptor->u.vnode;
5911
5912	if (HAS_FS_CALL(vnode, ioctl))
5913		return FS_CALL(vnode, ioctl, descriptor->cookie, op, buffer, length);
5914
5915	return B_DEV_INVALID_IOCTL;
5916}
5917
5918
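/*!	\brief Back end of the fcntl() syscall.

	Supported ops: F_SETFD/F_GETFD (close-on-exec flag), F_SETFL/F_GETFL
	(only O_APPEND and O_NONBLOCK may be changed), F_DUPFD, and the advisory
	locking ops F_GETLK, F_SETLK, and F_SETLKW.

	A minimal (purely illustrative) kernel-side use:
	\code
	common_fcntl(fd, F_SETFD, FD_CLOEXEC, true);
		// mark the FD close-on-exec
	\endcode
*/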
5919static status_t
5920common_fcntl(int fd, int op, size_t argument, bool kernel)
5921{
5922	struct flock flock;
5923
5924	FUNCTION(("common_fcntl(fd = %d, op = %d, argument = %lx, %s)\n",
5925		fd, op, argument, kernel ? "kernel" : "user"));
5926
5927	struct file_descriptor* descriptor = get_fd(get_current_io_context(kernel),
5928		fd);
5929	if (descriptor == NULL)
5930		return B_FILE_ERROR;
5931
5932	struct vnode* vnode = fd_vnode(descriptor);
5933
5934	status_t status = B_OK;
5935
5936	if (op == F_SETLK || op == F_SETLKW || op == F_GETLK) {
5937		if (descriptor->type != FDTYPE_FILE)
5938			status = B_BAD_VALUE;
5939		else if (user_memcpy(&flock, (struct flock*)argument,
5940				sizeof(struct flock)) != B_OK)
5941			status = B_BAD_ADDRESS;
5942
5943		if (status != B_OK) {
5944			put_fd(descriptor);
5945			return status;
5946		}
5947	}
5948
5949	switch (op) {
5950		case F_SETFD:
5951		{
5952			struct io_context* context = get_current_io_context(kernel);
5953			// Set file descriptor flags
5954
			// FD_CLOEXEC is the only flag available at this time
5956			mutex_lock(&context->io_mutex);
5957			fd_set_close_on_exec(context, fd, (argument & FD_CLOEXEC) != 0);
5958			mutex_unlock(&context->io_mutex);
5959
5960			status = B_OK;
5961			break;
5962		}
5963
5964		case F_GETFD:
5965		{
5966			struct io_context* context = get_current_io_context(kernel);
5967
5968			// Get file descriptor flags
5969			mutex_lock(&context->io_mutex);
5970			status = fd_close_on_exec(context, fd) ? FD_CLOEXEC : 0;
5971			mutex_unlock(&context->io_mutex);
5972			break;
5973		}
5974
5975		case F_SETFL:
5976			// Set file descriptor open mode
5977
5978			// we only accept changes to O_APPEND and O_NONBLOCK
5979			argument &= O_APPEND | O_NONBLOCK;
5980			if (descriptor->ops->fd_set_flags != NULL) {
5981				status = descriptor->ops->fd_set_flags(descriptor, argument);
5982			} else if (vnode != NULL && HAS_FS_CALL(vnode, set_flags)) {
5983				status = FS_CALL(vnode, set_flags, descriptor->cookie,
5984					(int)argument);
5985			} else
5986				status = B_UNSUPPORTED;
5987
5988			if (status == B_OK) {
5989				// update this descriptor's open_mode field
5990				descriptor->open_mode = (descriptor->open_mode
5991					& ~(O_APPEND | O_NONBLOCK)) | argument;
5992			}
5993
5994			break;
5995
5996		case F_GETFL:
5997			// Get file descriptor open mode
5998			status = descriptor->open_mode;
5999			break;
6000
6001		case F_DUPFD:
6002		{
6003			struct io_context* context = get_current_io_context(kernel);
6004
6005			status = new_fd_etc(context, descriptor, (int)argument);
6006			if (status >= 0) {
6007				mutex_lock(&context->io_mutex);
				// POSIX requires the new FD not to have close-on-exec set
				fd_set_close_on_exec(context, status, false);
6009				mutex_unlock(&context->io_mutex);
6010
6011				atomic_add(&descriptor->ref_count, 1);
6012			}
6013			break;
6014		}
6015
6016		case F_GETLK:
6017			if (vnode != NULL) {
6018				status = get_advisory_lock(vnode, &flock);
6019				if (status == B_OK) {
6020					// copy back flock structure
6021					status = user_memcpy((struct flock*)argument, &flock,
6022						sizeof(struct flock));
6023				}
6024			} else
6025				status = B_BAD_VALUE;
6026			break;
6027
6028		case F_SETLK:
6029		case F_SETLKW:
6030			status = normalize_flock(descriptor, &flock);
6031			if (status != B_OK)
6032				break;
6033
6034			if (vnode == NULL) {
6035				status = B_BAD_VALUE;
6036			} else if (flock.l_type == F_UNLCK) {
6037				status = release_advisory_lock(vnode, &flock);
6038			} else {
6039				// the open mode must match the lock type
6040				if (((descriptor->open_mode & O_RWMASK) == O_RDONLY
6041						&& flock.l_type == F_WRLCK)
6042					|| ((descriptor->open_mode & O_RWMASK) == O_WRONLY
6043						&& flock.l_type == F_RDLCK))
6044					status = B_FILE_ERROR;
6045				else {
6046					status = acquire_advisory_lock(vnode, -1,
6047						&flock, op == F_SETLKW);
6048				}
6049			}
6050			break;
6051
6052		// ToDo: add support for more ops?
6053
6054		default:
6055			status = B_BAD_VALUE;
6056	}
6057
6058	put_fd(descriptor);
6059	return status;
6060}
6061
6062
6063static status_t
6064common_sync(int fd, bool kernel)
6065{
6066	struct file_descriptor* descriptor;
6067	struct vnode* vnode;
6068	status_t status;
6069
	FUNCTION(("common_sync: entry. fd %d kernel %d\n", fd, kernel));
6071
6072	descriptor = get_fd_and_vnode(fd, &vnode, kernel);
6073	if (descriptor == NULL)
6074		return B_FILE_ERROR;
6075
6076	if (HAS_FS_CALL(vnode, fsync))
6077		status = FS_CALL_NO_PARAMS(vnode, fsync);
6078	else
6079		status = B_UNSUPPORTED;
6080
6081	put_fd(descriptor);
6082	return status;
6083}
6084
6085
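/*!	\brief Locks the node the given FD refers to against concurrent access.

	The owning descriptor is recorded in vnode::mandatory_locked_by with an
	atomic test-and-set, so two racing lockers cannot both succeed; if the
	node is already locked, \c B_BUSY is returned.
*/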
6086static status_t
6087common_lock_node(int fd, bool kernel)
6088{
6089	struct file_descriptor* descriptor;
6090	struct vnode* vnode;
6091
6092	descriptor = get_fd_and_vnode(fd, &vnode, kernel);
6093	if (descriptor == NULL)
6094		return B_FILE_ERROR;
6095
6096	status_t status = B_OK;
6097
	// We need to set the lock atomically - someone else
	// might try to set one at the same time
6100	if (atomic_pointer_test_and_set(&vnode->mandatory_locked_by, descriptor,
6101			(file_descriptor*)NULL) != NULL)
6102		status = B_BUSY;
6103
6104	put_fd(descriptor);
6105	return status;
6106}
6107
6108
6109static status_t
6110common_unlock_node(int fd, bool kernel)
6111{
6112	struct file_descriptor* descriptor;
6113	struct vnode* vnode;
6114
6115	descriptor = get_fd_and_vnode(fd, &vnode, kernel);
6116	if (descriptor == NULL)
6117		return B_FILE_ERROR;
6118
6119	status_t status = B_OK;
6120
	// We need to clear the lock atomically - someone else
	// might set one at the same time
6123	if (atomic_pointer_test_and_set(&vnode->mandatory_locked_by,
6124			(file_descriptor*)NULL, descriptor) != descriptor)
6125		status = B_BAD_VALUE;
6126
6127	put_fd(descriptor);
6128	return status;
6129}
6130
6131
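/*!	\brief Reads the contents of the symlink specified by a FD + path pair
	into \a buffer. If the node is no symlink (the file system has no
	read_symlink() hook), \c B_BAD_VALUE is returned.
*/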
6132static status_t
6133common_read_link(int fd, char* path, char* buffer, size_t* _bufferSize,
6134	bool kernel)
6135{
6136	struct vnode* vnode;
6137	status_t status;
6138
6139	status = fd_and_path_to_vnode(fd, path, false, &vnode, NULL, kernel);
6140	if (status != B_OK)
6141		return status;
6142
6143	if (HAS_FS_CALL(vnode, read_symlink)) {
6144		status = FS_CALL(vnode, read_symlink, buffer, _bufferSize);
6145	} else
6146		status = B_BAD_VALUE;
6147
6148	put_vnode(vnode);
6149	return status;
6150}
6151
6152
6153static status_t
6154common_create_symlink(int fd, char* path, const char* toPath, int mode,
6155	bool kernel)
6156{
6157	// path validity checks have to be in the calling function!
6158	char name[B_FILE_NAME_LENGTH];
6159	struct vnode* vnode;
6160	status_t status;
6161
6162	FUNCTION(("common_create_symlink(fd = %d, path = %s, toPath = %s, "
6163		"mode = %d, kernel = %d)\n", fd, path, toPath, mode, kernel));
6164
6165	status = fd_and_path_to_dir_vnode(fd, path, &vnode, name, kernel);
6166	if (status != B_OK)
6167		return status;
6168
6169	if (HAS_FS_CALL(vnode, create_symlink))
6170		status = FS_CALL(vnode, create_symlink, name, toPath, mode);
6171	else {
6172		status = HAS_FS_CALL(vnode, write)
6173			? B_UNSUPPORTED : B_READ_ONLY_DEVICE;
6174	}
6175
6176	put_vnode(vnode);
6177
6178	return status;
6179}
6180
6181
6182static status_t
6183common_create_link(int pathFD, char* path, int toFD, char* toPath,
6184	bool traverseLeafLink, bool kernel)
6185{
6186	// path validity checks have to be in the calling function!
6187
6188	FUNCTION(("common_create_link(path = %s, toPath = %s, kernel = %d)\n", path,
6189		toPath, kernel));
6190
6191	char name[B_FILE_NAME_LENGTH];
6192	struct vnode* directory;
6193	status_t status = fd_and_path_to_dir_vnode(pathFD, path, &directory, name,
6194		kernel);
6195	if (status != B_OK)
6196		return status;
6197
6198	struct vnode* vnode;
6199	status = fd_and_path_to_vnode(toFD, toPath, traverseLeafLink, &vnode, NULL,
6200		kernel);
6201	if (status != B_OK)
6202		goto err;
6203
6204	if (directory->mount != vnode->mount) {
6205		status = B_CROSS_DEVICE_LINK;
6206		goto err1;
6207	}
6208
6209	if (HAS_FS_CALL(directory, link))
6210		status = FS_CALL(directory, link, name, vnode);
6211	else
6212		status = B_READ_ONLY_DEVICE;
6213
6214err1:
6215	put_vnode(vnode);
6216err:
6217	put_vnode(directory);
6218
6219	return status;
6220}
6221
6222
6223static status_t
6224common_unlink(int fd, char* path, bool kernel)
6225{
6226	char filename[B_FILE_NAME_LENGTH];
6227	struct vnode* vnode;
6228	status_t status;
6229
6230	FUNCTION(("common_unlink: fd: %d, path '%s', kernel %d\n", fd, path,
6231		kernel));
6232
6233	status = fd_and_path_to_dir_vnode(fd, path, &vnode, filename, kernel);
6234	if (status < 0)
6235		return status;
6236
6237	if (HAS_FS_CALL(vnode, unlink))
6238		status = FS_CALL(vnode, unlink, filename);
6239	else
6240		status = B_READ_ONLY_DEVICE;
6241
6242	put_vnode(vnode);
6243
6244	return status;
6245}
6246
6247
6248static status_t
6249common_access(int fd, char* path, int mode, bool effectiveUserGroup, bool kernel)
6250{
6251	struct vnode* vnode;
6252	status_t status;
6253
6254	// TODO: honor effectiveUserGroup argument
6255
6256	status = fd_and_path_to_vnode(fd, path, true, &vnode, NULL, kernel);
6257	if (status != B_OK)
6258		return status;
6259
6260	if (HAS_FS_CALL(vnode, access))
6261		status = FS_CALL(vnode, access, mode);
6262	else
6263		status = B_OK;
6264
6265	put_vnode(vnode);
6266
6267	return status;
6268}
6269
6270
6271static status_t
6272common_rename(int fd, char* path, int newFD, char* newPath, bool kernel)
6273{
6274	struct vnode* fromVnode;
6275	struct vnode* toVnode;
6276	char fromName[B_FILE_NAME_LENGTH];
6277	char toName[B_FILE_NAME_LENGTH];
6278	status_t status;
6279
6280	FUNCTION(("common_rename(fd = %d, path = %s, newFD = %d, newPath = %s, "
6281		"kernel = %d)\n", fd, path, newFD, newPath, kernel));
6282
6283	status = fd_and_path_to_dir_vnode(fd, path, &fromVnode, fromName, kernel);
6284	if (status != B_OK)
6285		return status;
6286
6287	status = fd_and_path_to_dir_vnode(newFD, newPath, &toVnode, toName, kernel);
6288	if (status != B_OK)
6289		goto err1;
6290
6291	if (fromVnode->device != toVnode->device) {
6292		status = B_CROSS_DEVICE_LINK;
6293		goto err2;
6294	}
6295
6296	if (fromName[0] == '\0' || toName[0] == '\0'
6297		|| !strcmp(fromName, ".") || !strcmp(fromName, "..")
6298		|| !strcmp(toName, ".") || !strcmp(toName, "..")
6299		|| (fromVnode == toVnode && !strcmp(fromName, toName))) {
6300		status = B_BAD_VALUE;
6301		goto err2;
6302	}
6303
6304	if (HAS_FS_CALL(fromVnode, rename))
6305		status = FS_CALL(fromVnode, rename, fromName, toVnode, toName);
6306	else
6307		status = B_READ_ONLY_DEVICE;
6308
6309err2:
6310	put_vnode(toVnode);
6311err1:
6312	put_vnode(fromVnode);
6313
6314	return status;
6315}
6316
6317
6318static status_t
6319common_read_stat(struct file_descriptor* descriptor, struct stat* stat)
6320{
6321	struct vnode* vnode = descriptor->u.vnode;
6322
6323	FUNCTION(("common_read_stat: stat %p\n", stat));
6324
6325	// TODO: remove this once all file systems properly set them!
6326	stat->st_crtim.tv_nsec = 0;
6327	stat->st_ctim.tv_nsec = 0;
6328	stat->st_mtim.tv_nsec = 0;
6329	stat->st_atim.tv_nsec = 0;
6330
6331	status_t status = FS_CALL(vnode, read_stat, stat);
6332
6333	// fill in the st_dev and st_ino fields
6334	if (status == B_OK) {
6335		stat->st_dev = vnode->device;
6336		stat->st_ino = vnode->id;
6337		stat->st_rdev = -1;
6338	}
6339
6340	return status;
6341}
6342
6343
6344static status_t
6345common_write_stat(struct file_descriptor* descriptor, const struct stat* stat,
6346	int statMask)
6347{
6348	struct vnode* vnode = descriptor->u.vnode;
6349
6350	FUNCTION(("common_write_stat(vnode = %p, stat = %p, statMask = %d)\n",
6351		vnode, stat, statMask));
6352
6353	if (!HAS_FS_CALL(vnode, write_stat))
6354		return B_READ_ONLY_DEVICE;
6355
6356	return FS_CALL(vnode, write_stat, stat, statMask);
6357}
6358
6359
6360static status_t
6361common_path_read_stat(int fd, char* path, bool traverseLeafLink,
6362	struct stat* stat, bool kernel)
6363{
6364	FUNCTION(("common_path_read_stat: fd: %d, path '%s', stat %p,\n", fd, path,
6365		stat));
6366
6367	struct vnode* vnode;
6368	status_t status = fd_and_path_to_vnode(fd, path, traverseLeafLink, &vnode,
6369		NULL, kernel);
6370	if (status != B_OK)
6371		return status;
6372
6373	status = FS_CALL(vnode, read_stat, stat);
6374
6375	// fill in the st_dev and st_ino fields
6376	if (status == B_OK) {
6377		stat->st_dev = vnode->device;
6378		stat->st_ino = vnode->id;
6379		stat->st_rdev = -1;
6380	}
6381
6382	put_vnode(vnode);
6383	return status;
6384}
6385
6386
6387static status_t
6388common_path_write_stat(int fd, char* path, bool traverseLeafLink,
6389	const struct stat* stat, int statMask, bool kernel)
6390{
	FUNCTION(("common_path_write_stat: fd: %d, path '%s', stat %p, "
		"stat_mask %d, kernel %d\n", fd, path, stat, statMask, kernel));
6393
6394	struct vnode* vnode;
6395	status_t status = fd_and_path_to_vnode(fd, path, traverseLeafLink, &vnode,
6396		NULL, kernel);
6397	if (status != B_OK)
6398		return status;
6399
6400	if (HAS_FS_CALL(vnode, write_stat))
6401		status = FS_CALL(vnode, write_stat, stat, statMask);
6402	else
6403		status = B_READ_ONLY_DEVICE;
6404
6405	put_vnode(vnode);
6406
6407	return status;
6408}
6409
6410
6411static int
6412attr_dir_open(int fd, char* path, bool traverseLeafLink, bool kernel)
6413{
6414	FUNCTION(("attr_dir_open(fd = %d, path = '%s', kernel = %d)\n", fd, path,
6415		kernel));
6416
6417	struct vnode* vnode;
6418	status_t status = fd_and_path_to_vnode(fd, path, traverseLeafLink, &vnode,
6419		NULL, kernel);
6420	if (status != B_OK)
6421		return status;
6422
6423	status = open_attr_dir_vnode(vnode, kernel);
6424	if (status < 0)
6425		put_vnode(vnode);
6426
6427	return status;
6428}
6429
6430
6431static status_t
6432attr_dir_close(struct file_descriptor* descriptor)
6433{
6434	struct vnode* vnode = descriptor->u.vnode;
6435
6436	FUNCTION(("attr_dir_close(descriptor = %p)\n", descriptor));
6437
6438	if (HAS_FS_CALL(vnode, close_attr_dir))
6439		return FS_CALL(vnode, close_attr_dir, descriptor->cookie);
6440
6441	return B_OK;
6442}
6443
6444
6445static void
6446attr_dir_free_fd(struct file_descriptor* descriptor)
6447{
6448	struct vnode* vnode = descriptor->u.vnode;
6449
6450	if (vnode != NULL) {
6451		FS_CALL(vnode, free_attr_dir_cookie, descriptor->cookie);
6452		put_vnode(vnode);
6453	}
6454}
6455
6456
6457static status_t
6458attr_dir_read(struct io_context* ioContext, struct file_descriptor* descriptor,
6459	struct dirent* buffer, size_t bufferSize, uint32* _count)
6460{
6461	struct vnode* vnode = descriptor->u.vnode;
6462
6463	FUNCTION(("attr_dir_read(descriptor = %p)\n", descriptor));
6464
6465	if (HAS_FS_CALL(vnode, read_attr_dir))
6466		return FS_CALL(vnode, read_attr_dir, descriptor->cookie, buffer,
6467			bufferSize, _count);
6468
6469	return B_UNSUPPORTED;
6470}
6471
6472
6473static status_t
6474attr_dir_rewind(struct file_descriptor* descriptor)
6475{
6476	struct vnode* vnode = descriptor->u.vnode;
6477
6478	FUNCTION(("attr_dir_rewind(descriptor = %p)\n", descriptor));
6479
6480	if (HAS_FS_CALL(vnode, rewind_attr_dir))
6481		return FS_CALL(vnode, rewind_attr_dir, descriptor->cookie);
6482
6483	return B_UNSUPPORTED;
6484}
6485
6486
6487static int
6488attr_create(int fd, char* path, const char* name, uint32 type,
6489	int openMode, bool kernel)
6490{
6491	if (name == NULL || *name == '\0')
6492		return B_BAD_VALUE;
6493
6494	bool traverse = (openMode & (O_NOTRAVERSE | O_NOFOLLOW)) == 0;
6495	struct vnode* vnode;
6496	status_t status = fd_and_path_to_vnode(fd, path, traverse, &vnode, NULL,
6497		kernel);
6498	if (status != B_OK)
6499		return status;
6500
6501	if ((openMode & O_NOFOLLOW) != 0 && S_ISLNK(vnode->Type())) {
6502		status = B_LINK_LIMIT;
6503		goto err;
6504	}
6505
6506	if (!HAS_FS_CALL(vnode, create_attr)) {
6507		status = B_READ_ONLY_DEVICE;
6508		goto err;
6509	}
6510
6511	void* cookie;
6512	status = FS_CALL(vnode, create_attr, name, type, openMode, &cookie);
6513	if (status != B_OK)
6514		goto err;
6515
6516	fd = get_new_fd(FDTYPE_ATTR, NULL, vnode, cookie, openMode, kernel);
6517	if (fd >= 0)
6518		return fd;
6519
6520	status = fd;
6521
6522	FS_CALL(vnode, close_attr, cookie);
6523	FS_CALL(vnode, free_attr_cookie, cookie);
6524
6525	FS_CALL(vnode, remove_attr, name);
6526
6527err:
6528	put_vnode(vnode);
6529
6530	return status;
6531}
6532
6533
6534static int
6535attr_open(int fd, char* path, const char* name, int openMode, bool kernel)
6536{
6537	if (name == NULL || *name == '\0')
6538		return B_BAD_VALUE;
6539
6540	bool traverse = (openMode & (O_NOTRAVERSE | O_NOFOLLOW)) == 0;
6541	struct vnode* vnode;
6542	status_t status = fd_and_path_to_vnode(fd, path, traverse, &vnode, NULL,
6543		kernel);
6544	if (status != B_OK)
6545		return status;
6546
6547	if ((openMode & O_NOFOLLOW) != 0 && S_ISLNK(vnode->Type())) {
6548		status = B_LINK_LIMIT;
6549		goto err;
6550	}
6551
6552	if (!HAS_FS_CALL(vnode, open_attr)) {
6553		status = B_UNSUPPORTED;
6554		goto err;
6555	}
6556
6557	void* cookie;
6558	status = FS_CALL(vnode, open_attr, name, openMode, &cookie);
6559	if (status != B_OK)
6560		goto err;
6561
6562	// now we only need a file descriptor for this attribute and we're done
6563	fd = get_new_fd(FDTYPE_ATTR, NULL, vnode, cookie, openMode, kernel);
6564	if (fd >= 0)
6565		return fd;
6566
6567	status = fd;
6568
6569	FS_CALL(vnode, close_attr, cookie);
6570	FS_CALL(vnode, free_attr_cookie, cookie);
6571
6572err:
6573	put_vnode(vnode);
6574
6575	return status;
6576}
6577
6578
6579static status_t
6580attr_close(struct file_descriptor* descriptor)
6581{
6582	struct vnode* vnode = descriptor->u.vnode;
6583
6584	FUNCTION(("attr_close(descriptor = %p)\n", descriptor));
6585
6586	if (HAS_FS_CALL(vnode, close_attr))
6587		return FS_CALL(vnode, close_attr, descriptor->cookie);
6588
6589	return B_OK;
6590}
6591
6592
6593static void
6594attr_free_fd(struct file_descriptor* descriptor)
6595{
6596	struct vnode* vnode = descriptor->u.vnode;
6597
6598	if (vnode != NULL) {
6599		FS_CALL(vnode, free_attr_cookie, descriptor->cookie);
6600		put_vnode(vnode);
6601	}
6602}
6603
6604
6605static status_t
6606attr_read(struct file_descriptor* descriptor, off_t pos, void* buffer,
6607	size_t* length)
6608{
6609	struct vnode* vnode = descriptor->u.vnode;
6610
6611	FUNCTION(("attr_read: buf %p, pos %Ld, len %p = %ld\n", buffer, pos, length,
6612		*length));
6613
6614	if (!HAS_FS_CALL(vnode, read_attr))
6615		return B_UNSUPPORTED;
6616
6617	return FS_CALL(vnode, read_attr, descriptor->cookie, pos, buffer, length);
6618}
6619
6620
6621static status_t
6622attr_write(struct file_descriptor* descriptor, off_t pos, const void* buffer,
6623	size_t* length)
6624{
6625	struct vnode* vnode = descriptor->u.vnode;
6626
6627	FUNCTION(("attr_write: buf %p, pos %Ld, len %p\n", buffer, pos, length));
6628	if (!HAS_FS_CALL(vnode, write_attr))
6629		return B_UNSUPPORTED;
6630
6631	return FS_CALL(vnode, write_attr, descriptor->cookie, pos, buffer, length);
6632}
6633
6634
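/*!	\brief Computes and sets the new position of an attribute FD, lseek()
	style: relative to the beginning, the current position, or the
	attribute's size (as reported by read_attr_stat()). An overflow of the
	64 bit off_t range is answered with \c B_BUFFER_OVERFLOW.
*/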
6635static off_t
6636attr_seek(struct file_descriptor* descriptor, off_t pos, int seekType)
6637{
6638	off_t offset;
6639
6640	switch (seekType) {
6641		case SEEK_SET:
6642			offset = 0;
6643			break;
6644		case SEEK_CUR:
6645			offset = descriptor->pos;
6646			break;
6647		case SEEK_END:
6648		{
6649			struct vnode* vnode = descriptor->u.vnode;
			if (!HAS_FS_CALL(vnode, read_attr_stat))
				return B_UNSUPPORTED;
6652
6653			struct stat stat;
6654			status_t status = FS_CALL(vnode, read_attr_stat, descriptor->cookie,
6655				&stat);
6656			if (status != B_OK)
6657				return status;
6658
6659			offset = stat.st_size;
6660			break;
6661		}
6662		default:
6663			return B_BAD_VALUE;
6664	}
6665
6666	// assumes off_t is 64 bits wide
6667	if (offset > 0 && LONGLONG_MAX - offset < pos)
6668		return B_BUFFER_OVERFLOW;
6669
6670	pos += offset;
6671	if (pos < 0)
6672		return B_BAD_VALUE;
6673
6674	return descriptor->pos = pos;
6675}
6676
6677
6678static status_t
6679attr_read_stat(struct file_descriptor* descriptor, struct stat* stat)
6680{
6681	struct vnode* vnode = descriptor->u.vnode;
6682
6683	FUNCTION(("attr_read_stat: stat 0x%p\n", stat));
6684
6685	if (!HAS_FS_CALL(vnode, read_attr_stat))
6686		return B_UNSUPPORTED;
6687
6688	return FS_CALL(vnode, read_attr_stat, descriptor->cookie, stat);
6689}
6690
6691
6692static status_t
6693attr_write_stat(struct file_descriptor* descriptor, const struct stat* stat,
6694	int statMask)
6695{
6696	struct vnode* vnode = descriptor->u.vnode;
6697
6698	FUNCTION(("attr_write_stat: stat = %p, statMask %d\n", stat, statMask));
6699
6700	if (!HAS_FS_CALL(vnode, write_attr_stat))
6701		return B_READ_ONLY_DEVICE;
6702
6703	return FS_CALL(vnode, write_attr_stat, descriptor->cookie, stat, statMask);
6704}
6705
6706
6707static status_t
6708attr_remove(int fd, const char* name, bool kernel)
6709{
6710	struct file_descriptor* descriptor;
6711	struct vnode* vnode;
6712	status_t status;
6713
6714	if (name == NULL || *name == '\0')
6715		return B_BAD_VALUE;
6716
6717	FUNCTION(("attr_remove: fd = %d, name = \"%s\", kernel %d\n", fd, name,
6718		kernel));
6719
6720	descriptor = get_fd_and_vnode(fd, &vnode, kernel);
6721	if (descriptor == NULL)
6722		return B_FILE_ERROR;
6723
6724	if (HAS_FS_CALL(vnode, remove_attr))
6725		status = FS_CALL(vnode, remove_attr, name);
6726	else
6727		status = B_READ_ONLY_DEVICE;
6728
6729	put_fd(descriptor);
6730
6731	return status;
6732}
6733
6734
6735static status_t
6736attr_rename(int fromFD, const char* fromName, int toFD, const char* toName,
6737	bool kernel)
6738{
6739	struct file_descriptor* fromDescriptor;
6740	struct file_descriptor* toDescriptor;
6741	struct vnode* fromVnode;
6742	struct vnode* toVnode;
6743	status_t status;
6744
6745	if (fromName == NULL || *fromName == '\0' || toName == NULL
6746		|| *toName == '\0')
6747		return B_BAD_VALUE;
6748
6749	FUNCTION(("attr_rename: from fd = %d, from name = \"%s\", to fd = %d, to "
6750		"name = \"%s\", kernel %d\n", fromFD, fromName, toFD, toName, kernel));
6751
6752	fromDescriptor = get_fd_and_vnode(fromFD, &fromVnode, kernel);
6753	if (fromDescriptor == NULL)
6754		return B_FILE_ERROR;
6755
6756	toDescriptor = get_fd_and_vnode(toFD, &toVnode, kernel);
6757	if (toDescriptor == NULL) {
6758		status = B_FILE_ERROR;
6759		goto err;
6760	}
6761
6762	// are the files on the same volume?
6763	if (fromVnode->device != toVnode->device) {
6764		status = B_CROSS_DEVICE_LINK;
6765		goto err1;
6766	}
6767
6768	if (HAS_FS_CALL(fromVnode, rename_attr)) {
6769		status = FS_CALL(fromVnode, rename_attr, fromName, toVnode, toName);
6770	} else
6771		status = B_READ_ONLY_DEVICE;
6772
6773err1:
6774	put_fd(toDescriptor);
6775err:
6776	put_fd(fromDescriptor);
6777
6778	return status;
6779}
6780
6781
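/*!	\brief Opens the index directory of the volume specified by \a mountID
	and returns a new FD for it, or \c B_UNSUPPORTED, if the file system
	doesn't support indices.
*/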
6782static int
6783index_dir_open(dev_t mountID, bool kernel)
6784{
6785	struct fs_mount* mount;
6786	void* cookie;
6787
6788	FUNCTION(("index_dir_open(mountID = %ld, kernel = %d)\n", mountID, kernel));
6789
6790	status_t status = get_mount(mountID, &mount);
6791	if (status != B_OK)
6792		return status;
6793
6794	if (!HAS_FS_MOUNT_CALL(mount, open_index_dir)) {
6795		status = B_UNSUPPORTED;
6796		goto error;
6797	}
6798
6799	status = FS_MOUNT_CALL(mount, open_index_dir, &cookie);
6800	if (status != B_OK)
6801		goto error;
6802
6803	// get fd for the index directory
6804	int fd;
6805	fd = get_new_fd(FDTYPE_INDEX_DIR, mount, NULL, cookie, O_CLOEXEC, kernel);
6806	if (fd >= 0)
6807		return fd;
6808
6809	// something went wrong
6810	FS_MOUNT_CALL(mount, close_index_dir, cookie);
6811	FS_MOUNT_CALL(mount, free_index_dir_cookie, cookie);
6812
6813	status = fd;
6814
6815error:
6816	put_mount(mount);
6817	return status;
6818}
6819
6820
6821static status_t
6822index_dir_close(struct file_descriptor* descriptor)
6823{
6824	struct fs_mount* mount = descriptor->u.mount;
6825
6826	FUNCTION(("index_dir_close(descriptor = %p)\n", descriptor));
6827
6828	if (HAS_FS_MOUNT_CALL(mount, close_index_dir))
6829		return FS_MOUNT_CALL(mount, close_index_dir, descriptor->cookie);
6830
6831	return B_OK;
6832}
6833
6834
6835static void
6836index_dir_free_fd(struct file_descriptor* descriptor)
6837{
6838	struct fs_mount* mount = descriptor->u.mount;
6839
6840	if (mount != NULL) {
6841		FS_MOUNT_CALL(mount, free_index_dir_cookie, descriptor->cookie);
6842		put_mount(mount);
6843	}
6844}
6845
6846
6847static status_t
6848index_dir_read(struct io_context* ioContext, struct file_descriptor* descriptor,
6849	struct dirent* buffer, size_t bufferSize, uint32* _count)
6850{
6851	struct fs_mount* mount = descriptor->u.mount;
6852
6853	if (HAS_FS_MOUNT_CALL(mount, read_index_dir)) {
6854		return FS_MOUNT_CALL(mount, read_index_dir, descriptor->cookie, buffer,
6855			bufferSize, _count);
6856	}
6857
6858	return B_UNSUPPORTED;
6859}
6860
6861
6862static status_t
6863index_dir_rewind(struct file_descriptor* descriptor)
6864{
6865	struct fs_mount* mount = descriptor->u.mount;
6866
6867	if (HAS_FS_MOUNT_CALL(mount, rewind_index_dir))
6868		return FS_MOUNT_CALL(mount, rewind_index_dir, descriptor->cookie);
6869
6870	return B_UNSUPPORTED;
6871}
6872
6873
6874static status_t
6875index_create(dev_t mountID, const char* name, uint32 type, uint32 flags,
6876	bool kernel)
6877{
6878	FUNCTION(("index_create(mountID = %ld, name = %s, kernel = %d)\n", mountID,
6879		name, kernel));
6880
6881	struct fs_mount* mount;
6882	status_t status = get_mount(mountID, &mount);
6883	if (status != B_OK)
6884		return status;
6885
6886	if (!HAS_FS_MOUNT_CALL(mount, create_index)) {
6887		status = B_READ_ONLY_DEVICE;
6888		goto out;
6889	}
6890
6891	status = FS_MOUNT_CALL(mount, create_index, name, type, flags);
6892
6893out:
6894	put_mount(mount);
6895	return status;
6896}
6897
6898
6899#if 0
6900static status_t
6901index_read_stat(struct file_descriptor* descriptor, struct stat* stat)
6902{
6903	struct vnode* vnode = descriptor->u.vnode;
6904
6905	// ToDo: currently unused!
6906	FUNCTION(("index_read_stat: stat 0x%p\n", stat));
6907	if (!HAS_FS_CALL(vnode, read_index_stat))
6908		return B_UNSUPPORTED;
6909
6910	return B_UNSUPPORTED;
6911	//return FS_CALL(vnode, read_index_stat, descriptor->cookie, stat);
6912}
6913
6914
6915static void
6916index_free_fd(struct file_descriptor* descriptor)
6917{
6918	struct vnode* vnode = descriptor->u.vnode;
6919
6920	if (vnode != NULL) {
6921		FS_CALL(vnode, free_index_cookie, descriptor->cookie);
6922		put_vnode(vnode);
6923	}
6924}
6925#endif
6926
6927
6928static status_t
6929index_name_read_stat(dev_t mountID, const char* name, struct stat* stat,
6930	bool kernel)
6931{
	FUNCTION(("index_name_read_stat(mountID = %ld, name = %s, kernel = %d)\n",
		mountID, name, kernel));
6934
6935	struct fs_mount* mount;
6936	status_t status = get_mount(mountID, &mount);
6937	if (status != B_OK)
6938		return status;
6939
6940	if (!HAS_FS_MOUNT_CALL(mount, read_index_stat)) {
6941		status = B_UNSUPPORTED;
6942		goto out;
6943	}
6944
6945	status = FS_MOUNT_CALL(mount, read_index_stat, name, stat);
6946
6947out:
6948	put_mount(mount);
6949	return status;
6950}
6951
6952
6953static status_t
6954index_remove(dev_t mountID, const char* name, bool kernel)
6955{
6956	FUNCTION(("index_remove(mountID = %ld, name = %s, kernel = %d)\n", mountID,
6957		name, kernel));
6958
6959	struct fs_mount* mount;
6960	status_t status = get_mount(mountID, &mount);
6961	if (status != B_OK)
6962		return status;
6963
6964	if (!HAS_FS_MOUNT_CALL(mount, remove_index)) {
6965		status = B_READ_ONLY_DEVICE;
6966		goto out;
6967	}
6968
6969	status = FS_MOUNT_CALL(mount, remove_index, name);
6970
6971out:
6972	put_mount(mount);
6973	return status;
6974}
6975
6976
/*!	TODO: the query FS API is still pretty much the same as in R5.
		It would be nice if queries got some more kernel support;
		for example, query parsing should be moved into the kernel.
6981*/
6982static int
6983query_open(dev_t device, const char* query, uint32 flags, port_id port,
6984	int32 token, bool kernel)
6985{
6986	struct fs_mount* mount;
6987	void* cookie;
6988
6989	FUNCTION(("query_open(device = %ld, query = \"%s\", kernel = %d)\n", device,
6990		query, kernel));
6991
6992	status_t status = get_mount(device, &mount);
6993	if (status != B_OK)
6994		return status;
6995
6996	if (!HAS_FS_MOUNT_CALL(mount, open_query)) {
6997		status = B_UNSUPPORTED;
6998		goto error;
6999	}
7000
7001	status = FS_MOUNT_CALL(mount, open_query, query, flags, port, token,
7002		&cookie);
7003	if (status != B_OK)
7004		goto error;
7005
	// get fd for the query
7007	int fd;
7008	fd = get_new_fd(FDTYPE_QUERY, mount, NULL, cookie, O_CLOEXEC, kernel);
7009	if (fd >= 0)
7010		return fd;
7011
7012	status = fd;
7013
7014	// something went wrong
7015	FS_MOUNT_CALL(mount, close_query, cookie);
7016	FS_MOUNT_CALL(mount, free_query_cookie, cookie);
7017
7018error:
7019	put_mount(mount);
7020	return status;
7021}
7022
7023
7024static status_t
7025query_close(struct file_descriptor* descriptor)
7026{
7027	struct fs_mount* mount = descriptor->u.mount;
7028
7029	FUNCTION(("query_close(descriptor = %p)\n", descriptor));
7030
7031	if (HAS_FS_MOUNT_CALL(mount, close_query))
7032		return FS_MOUNT_CALL(mount, close_query, descriptor->cookie);
7033
7034	return B_OK;
7035}
7036
7037
7038static void
7039query_free_fd(struct file_descriptor* descriptor)
7040{
7041	struct fs_mount* mount = descriptor->u.mount;
7042
7043	if (mount != NULL) {
7044		FS_MOUNT_CALL(mount, free_query_cookie, descriptor->cookie);
7045		put_mount(mount);
7046	}
7047}
7048
7049
7050static status_t
7051query_read(struct io_context* ioContext, struct file_descriptor* descriptor,
7052	struct dirent* buffer, size_t bufferSize, uint32* _count)
7053{
7054	struct fs_mount* mount = descriptor->u.mount;
7055
7056	if (HAS_FS_MOUNT_CALL(mount, read_query)) {
7057		return FS_MOUNT_CALL(mount, read_query, descriptor->cookie, buffer,
7058			bufferSize, _count);
7059	}
7060
7061	return B_UNSUPPORTED;
7062}
7063
7064
7065static status_t
7066query_rewind(struct file_descriptor* descriptor)
7067{
7068	struct fs_mount* mount = descriptor->u.mount;
7069
7070	if (HAS_FS_MOUNT_CALL(mount, rewind_query))
7071		return FS_MOUNT_CALL(mount, rewind_query, descriptor->cookie);
7072
7073	return B_UNSUPPORTED;
7074}
7075
7076
7077//	#pragma mark - General File System functions
7078
7079
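/*!	\brief Mounts the file system \a fsName at \a path.

	Resolves \a device to a disk device manager partition (creating a file
	device for plain image files, if necessary), builds the fs_volume chain
	for layered file systems, lets each layer mount, and finally links the
	new root vnode with the vnode covered at \a path.

	\return The ID of the new mount, or an error code.
*/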
7080static dev_t
7081fs_mount(char* path, const char* device, const char* fsName, uint32 flags,
7082	const char* args, bool kernel)
7083{
7084	struct ::fs_mount* mount;
7085	status_t status = B_OK;
7086	fs_volume* volume = NULL;
7087	int32 layer = 0;
7088	Vnode* coveredNode = NULL;
7089
7090	FUNCTION(("fs_mount: entry. path = '%s', fs_name = '%s'\n", path, fsName));
7091
	// The path is always safe, we just have to make sure that fsName is
	// at least superficially valid - we can't make any assumptions about
	// args, though. A NULL fsName is OK if a device was given and the FS
	// is not virtual; we'll get the name from the DDM later.
7096	if (fsName == NULL) {
7097		if (!device || flags & B_MOUNT_VIRTUAL_DEVICE)
7098			return B_BAD_VALUE;
7099	} else if (fsName[0] == '\0')
7100		return B_BAD_VALUE;
7101
7102	RecursiveLocker mountOpLocker(sMountOpLock);
7103
7104	// Helper to delete a newly created file device on failure.
7105	// Not exactly beautiful, but helps to keep the code below cleaner.
7106	struct FileDeviceDeleter {
7107		FileDeviceDeleter() : id(-1) {}
7108		~FileDeviceDeleter()
7109		{
7110			KDiskDeviceManager::Default()->DeleteFileDevice(id);
7111		}
7112
7113		partition_id id;
7114	} fileDeviceDeleter;
7115
7116	// If the file system is not a "virtual" one, the device argument should
7117	// point to a real file/device (if given at all).
7118	// get the partition
7119	KDiskDeviceManager* ddm = KDiskDeviceManager::Default();
7120	KPartition* partition = NULL;
7121	KPath normalizedDevice;
7122	bool newlyCreatedFileDevice = false;
7123
7124	if (!(flags & B_MOUNT_VIRTUAL_DEVICE) && device != NULL) {
7125		// normalize the device path
7126		status = normalizedDevice.SetTo(device, true);
7127		if (status != B_OK)
7128			return status;
7129
7130		// get a corresponding partition from the DDM
7131		partition = ddm->RegisterPartition(normalizedDevice.Path());
7132		if (partition == NULL) {
			// Partition not found: This either means that the user supplied
			// an invalid path, or that the path refers to an image file. We
			// try to let the DDM create a file device for the path.
7136			partition_id deviceID = ddm->CreateFileDevice(
7137				normalizedDevice.Path(), &newlyCreatedFileDevice);
7138			if (deviceID >= 0) {
7139				partition = ddm->RegisterPartition(deviceID);
7140				if (newlyCreatedFileDevice)
7141					fileDeviceDeleter.id = deviceID;
7142			}
7143		}
7144
7145		if (!partition) {
7146			TRACE(("fs_mount(): Partition `%s' not found.\n",
7147				normalizedDevice.Path()));
7148			return B_ENTRY_NOT_FOUND;
7149		}
7150
7151		device = normalizedDevice.Path();
7152			// correct path to file device
7153	}
7154	PartitionRegistrar partitionRegistrar(partition, true);
7155
	// Write lock the partition's device. For the time being, we keep the lock
	// until we're done mounting -- not nice, but it ensures that no one
	// interferes.
7159	// TODO: Just mark the partition busy while mounting!
7160	KDiskDevice* diskDevice = NULL;
7161	if (partition) {
7162		diskDevice = ddm->WriteLockDevice(partition->Device()->ID());
7163		if (!diskDevice) {
7164			TRACE(("fs_mount(): Failed to lock disk device!\n"));
7165			return B_ERROR;
7166		}
7167	}
7168
7169	DeviceWriteLocker writeLocker(diskDevice, true);
7170		// this takes over the write lock acquired before
7171
7172	if (partition != NULL) {
		// make sure that the partition is not busy
7174		if (partition->IsBusy()) {
7175			TRACE(("fs_mount(): Partition is busy.\n"));
7176			return B_BUSY;
7177		}
7178
7179		// if no FS name had been supplied, we get it from the partition
7180		if (fsName == NULL) {
7181			KDiskSystem* diskSystem = partition->DiskSystem();
7182			if (!diskSystem) {
7183				TRACE(("fs_mount(): No FS name was given, and the DDM didn't "
7184					"recognize it.\n"));
7185				return B_BAD_VALUE;
7186			}
7187
7188			if (!diskSystem->IsFileSystem()) {
7189				TRACE(("fs_mount(): No FS name was given, and the DDM found a "
7190					"partitioning system.\n"));
7191				return B_BAD_VALUE;
7192			}
7193
7194			// The disk system name will not change, and the KDiskSystem
7195			// object will not go away while the disk device is locked (and
7196			// the partition has a reference to it), so this is safe.
7197			fsName = diskSystem->Name();
7198		}
7199	}
7200
7201	mount = new(std::nothrow) (struct ::fs_mount);
7202	if (mount == NULL)
7203		return B_NO_MEMORY;
7204
7205	mount->device_name = strdup(device);
7206		// "device" can be NULL
7207
7208	status = mount->entry_cache.Init();
7209	if (status != B_OK)
7210		goto err1;
7211
7212	// initialize structure
7213	mount->id = sNextMountID++;
7214	mount->partition = NULL;
7215	mount->root_vnode = NULL;
7216	mount->covers_vnode = NULL;
7217	mount->unmounting = false;
7218	mount->owns_file_device = false;
7219	mount->volume = NULL;
7220
7221	// build up the volume(s)
7222	while (true) {
7223		char* layerFSName = get_file_system_name_for_layer(fsName, layer);
7224		if (layerFSName == NULL) {
7225			if (layer == 0) {
7226				status = B_NO_MEMORY;
7227				goto err1;
7228			}
7229
7230			break;
7231		}
7232
7233		volume = (fs_volume*)malloc(sizeof(fs_volume));
7234		if (volume == NULL) {
7235			status = B_NO_MEMORY;
7236			free(layerFSName);
7237			goto err1;
7238		}
7239
7240		volume->id = mount->id;
7241		volume->partition = partition != NULL ? partition->ID() : -1;
7242		volume->layer = layer++;
7243		volume->private_volume = NULL;
7244		volume->ops = NULL;
7245		volume->sub_volume = NULL;
7246		volume->super_volume = NULL;
7247		volume->file_system = NULL;
7248		volume->file_system_name = NULL;
7249
7250		volume->file_system_name = get_file_system_name(layerFSName);
7251		if (volume->file_system_name == NULL) {
7252			status = B_NO_MEMORY;
7253			free(layerFSName);
7254			free(volume);
7255			goto err1;
7256		}
7257
7258		volume->file_system = get_file_system(layerFSName);
7259		if (volume->file_system == NULL) {
7260			status = B_DEVICE_NOT_FOUND;
7261			free(layerFSName);
7262			free(volume->file_system_name);
7263			free(volume);
7264			goto err1;
7265		}
7266
7267		if (mount->volume == NULL)
7268			mount->volume = volume;
7269		else {
7270			volume->super_volume = mount->volume;
7271			mount->volume->sub_volume = volume;
7272			mount->volume = volume;
7273		}
7274	}
7275
7276	// insert mount struct into list before we call FS's mount() function
7277	// so that vnodes can be created for this mount
7278	mutex_lock(&sMountMutex);
7279	hash_insert(sMountsTable, mount);
7280	mutex_unlock(&sMountMutex);
7281
7282	ino_t rootID;
7283
7284	if (!sRoot) {
7285		// we haven't mounted anything yet
7286		if (strcmp(path, "/") != 0) {
7287			status = B_ERROR;
7288			goto err2;
7289		}
7290
7291		status = mount->volume->file_system->mount(mount->volume, device, flags,
7292			args, &rootID);
7293		if (status != 0)
7294			goto err2;
7295	} else {
7296		status = path_to_vnode(path, true, &coveredNode, NULL, kernel);
7297		if (status != B_OK)
7298			goto err2;
7299
7300		mount->covers_vnode = coveredNode;
7301
		// make sure coveredNode is a directory
7303		if (!S_ISDIR(coveredNode->Type())) {
7304			status = B_NOT_A_DIRECTORY;
7305			goto err3;
7306		}
7307
7308		if (coveredNode->IsCovered()) {
7309			// this is already a covered vnode
7310			status = B_BUSY;
7311			goto err3;
7312		}
7313
7314		// mount it/them
7315		fs_volume* volume = mount->volume;
7316		while (volume) {
7317			status = volume->file_system->mount(volume, device, flags, args,
7318				&rootID);
7319			if (status != B_OK) {
7320				if (volume->sub_volume)
7321					goto err4;
7322				goto err3;
7323			}
7324
7325			volume = volume->super_volume;
7326		}
7327
7328		volume = mount->volume;
7329		while (volume) {
7330			if (volume->ops->all_layers_mounted != NULL)
7331				volume->ops->all_layers_mounted(volume);
7332			volume = volume->super_volume;
7333		}
7334	}
7335
7336	// the root node is supposed to be owned by the file system - it must
7337	// exist at this point
7338	mount->root_vnode = lookup_vnode(mount->id, rootID);
7339	if (mount->root_vnode == NULL || mount->root_vnode->ref_count != 1) {
7340		panic("fs_mount: file system does not own its root node!\n");
7341		status = B_ERROR;
7342		goto err4;
7343	}
7344
7345	// set up the links between the root vnode and the vnode it covers
7346	rw_lock_write_lock(&sVnodeLock);
7347	if (coveredNode != NULL) {
7348		if (coveredNode->IsCovered()) {
7349			// the vnode is covered now
7350			status = B_BUSY;
7351			rw_lock_write_unlock(&sVnodeLock);
7352			goto err4;
7353		}
7354
7355		mount->root_vnode->covers = coveredNode;
7356		mount->root_vnode->SetCovering(true);
7357
7358		coveredNode->covered_by = mount->root_vnode;
7359		coveredNode->SetCovered(true);
7360	}
7361	rw_lock_write_unlock(&sVnodeLock);
7362
7363	if (!sRoot) {
7364		sRoot = mount->root_vnode;
7365		mutex_lock(&sIOContextRootLock);
7366		get_current_io_context(true)->root = sRoot;
7367		mutex_unlock(&sIOContextRootLock);
7368		inc_vnode_ref_count(sRoot);
7369	}
7370
7371	// supply the partition (if any) with the mount cookie and mark it mounted
7372	if (partition) {
7373		partition->SetMountCookie(mount->volume->private_volume);
7374		partition->SetVolumeID(mount->id);
7375
7376		// keep a partition reference as long as the partition is mounted
7377		partitionRegistrar.Detach();
7378		mount->partition = partition;
7379		mount->owns_file_device = newlyCreatedFileDevice;
7380		fileDeviceDeleter.id = -1;
7381	}
7382
7383	notify_mount(mount->id,
7384		coveredNode != NULL ? coveredNode->device : -1,
7385		coveredNode ? coveredNode->id : -1);
7386
7387	return mount->id;
7388
7389err4:
7390	FS_MOUNT_CALL_NO_PARAMS(mount, unmount);
7391err3:
7392	if (coveredNode != NULL)
7393		put_vnode(coveredNode);
7394err2:
7395	mutex_lock(&sMountMutex);
7396	hash_remove(sMountsTable, mount);
7397	mutex_unlock(&sMountMutex);
7398err1:
7399	delete mount;
7400
7401	return status;
7402}
7403
7404
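/*!	\brief Unmounts the volume given by \a path or \a mountID.

	As long as vnodes of the mount are still referenced, the call fails
	with \c B_BUSY -- unless B_FORCE_UNMOUNT is given, in which case all
	open descriptors on the mount are forcibly disconnected first. Covers/
	covered_by links to vnodes of other mounts are undone before the
	mount's vnodes are freed.
*/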
7405static status_t
7406fs_unmount(char* path, dev_t mountID, uint32 flags, bool kernel)
7407{
7408	struct fs_mount* mount;
7409	status_t err;
7410
	FUNCTION(("fs_unmount(path '%s', dev %ld, kernel %d)\n", path, mountID,
7412		kernel));
7413
7414	struct vnode* pathVnode = NULL;
7415	if (path != NULL) {
7416		err = path_to_vnode(path, true, &pathVnode, NULL, kernel);
7417		if (err != B_OK)
7418			return B_ENTRY_NOT_FOUND;
7419	}
7420
7421	RecursiveLocker mountOpLocker(sMountOpLock);
7422
	// This lock is not strictly necessary; it is only taken in the KDEBUG
	// case to keep the ASSERT in find_mount() working.
7425	KDEBUG_ONLY(mutex_lock(&sMountMutex));
7426	mount = find_mount(path != NULL ? pathVnode->device : mountID);
7427	KDEBUG_ONLY(mutex_unlock(&sMountMutex));
7428	if (mount == NULL) {
7429		panic("fs_unmount: find_mount() failed on root vnode @%p of mount\n",
7430			pathVnode);
7431	}
7432
7433	if (path != NULL) {
7434		put_vnode(pathVnode);
7435
7436		if (mount->root_vnode != pathVnode) {
7437			// not mountpoint
7438			return B_BAD_VALUE;
7439		}
7440	}
7441
7442	// if the volume is associated with a partition, lock the device of the
7443	// partition as long as we are unmounting
7444	KDiskDeviceManager* ddm = KDiskDeviceManager::Default();
7445	KPartition* partition = mount->partition;
7446	KDiskDevice* diskDevice = NULL;
7447	if (partition != NULL) {
7448		if (partition->Device() == NULL) {
7449			dprintf("fs_unmount(): There is no device!\n");
7450			return B_ERROR;
7451		}
7452		diskDevice = ddm->WriteLockDevice(partition->Device()->ID());
7453		if (!diskDevice) {
7454			TRACE(("fs_unmount(): Failed to lock disk device!\n"));
7455			return B_ERROR;
7456		}
7457	}
7458	DeviceWriteLocker writeLocker(diskDevice, true);
7459
	// make sure that the partition is not busy
7461	if (partition != NULL) {
7462		if ((flags & B_UNMOUNT_BUSY_PARTITION) == 0 && partition->IsBusy()) {
7463			TRACE(("fs_unmount(): Partition is busy.\n"));
7464			return B_BUSY;
7465		}
7466	}
7467
7468	// grab the vnode master mutex to keep someone from creating
7469	// a vnode while we're figuring out if we can continue
7470	WriteLocker vnodesWriteLocker(&sVnodeLock);
7471
7472	bool disconnectedDescriptors = false;
7473
7474	while (true) {
7475		bool busy = false;
7476
7477		// cycle through the list of vnodes associated with this mount and
7478		// make sure all of them are not busy or have refs on them
7479		VnodeList::Iterator iterator = mount->vnodes.GetIterator();
7480		while (struct vnode* vnode = iterator.Next()) {
7481			if (vnode->IsBusy()) {
7482				busy = true;
7483				break;
7484			}
7485
7486			// check the vnode's ref count -- subtract additional references for
7487			// covering
7488			int32 refCount = vnode->ref_count;
7489			if (vnode->covers != NULL)
7490				refCount--;
7491			if (vnode->covered_by != NULL)
7492				refCount--;
7493
7494			if (refCount != 0) {
7495				// there are still vnodes in use on this mount, so we cannot
7496				// unmount yet
7497				busy = true;
7498				break;
7499			}
7500		}
7501
7502		if (!busy)
7503			break;
7504
7505		if ((flags & B_FORCE_UNMOUNT) == 0)
7506			return B_BUSY;
7507
7508		if (disconnectedDescriptors) {
7509			// wait a bit until the last access is finished, and then try again
7510			vnodesWriteLocker.Unlock();
7511			snooze(100000);
7512			// TODO: if there is some kind of bug that prevents the ref counts
7513			// from getting back to zero, this will fall into an endless loop...
7514			vnodesWriteLocker.Lock();
7515			continue;
7516		}
7517
7518		// the file system is still busy - but we're forced to unmount it,
7519		// so let's disconnect all open file descriptors
7520
7521		mount->unmounting = true;
7522			// prevent new vnodes from being created
7523
7524		vnodesWriteLocker.Unlock();
7525
7526		disconnect_mount_or_vnode_fds(mount, NULL);
7527		disconnectedDescriptors = true;
7528
7529		vnodesWriteLocker.Lock();
7530	}
7531
7532	// We can safely continue. Mark all of the vnodes busy and this mount
7533	// structure in unmounting state. Also undo the vnode covers/covered_by
7534	// links.
7535	mount->unmounting = true;
7536
7537	VnodeList::Iterator iterator = mount->vnodes.GetIterator();
7538	while (struct vnode* vnode = iterator.Next()) {
7539		// Remove all covers/covered_by links from other mounts' nodes to this
7540		// vnode and adjust the node ref count accordingly. We will release the
7541		// references to the external vnodes below.
7542		if (Vnode* coveredNode = vnode->covers) {
7543			if (Vnode* coveringNode = vnode->covered_by) {
7544				// We have both covered and covering vnodes, so just remove us
7545				// from the chain.
7546				coveredNode->covered_by = coveringNode;
7547				coveringNode->covers = coveredNode;
7548				vnode->ref_count -= 2;
7549
7550				vnode->covered_by = NULL;
7551				vnode->covers = NULL;
7552				vnode->SetCovering(false);
7553				vnode->SetCovered(false);
7554			} else {
7555				// We only have a covered vnode. Remove its link to us.
7556				coveredNode->covered_by = NULL;
7557				coveredNode->SetCovered(false);
7558				vnode->ref_count--;
7559
				// If the other node is an external vnode, we keep its link
				// around so we can put the reference later on. Otherwise we
				// get rid of it right now.
7563				if (coveredNode->mount == mount) {
7564					vnode->covers = NULL;
7565					coveredNode->ref_count--;
7566				}
7567			}
7568		} else if (Vnode* coveringNode = vnode->covered_by) {
7569			// We only have a covering vnode. Remove its link to us.
7570			coveringNode->covers = NULL;
7571			coveringNode->SetCovering(false);
7572			vnode->ref_count--;
7573
			// If the other node is an external vnode, we keep its link
			// around so we can put the reference later on. Otherwise we
			// get rid of it right now.
7577			if (coveringNode->mount == mount) {
7578				vnode->covered_by = NULL;
7579				coveringNode->ref_count--;
7580			}
7581		}
7582
7583		vnode->SetBusy(true);
7584		vnode_to_be_freed(vnode);
7585	}
7586
7587	vnodesWriteLocker.Unlock();
7588
7589	// Free all vnodes associated with this mount.
7590	// They will be removed from the mount list by free_vnode(), so
	// we don't have to do that ourselves.
7592	while (struct vnode* vnode = mount->vnodes.Head()) {
7593		// Put the references to external covered/covering vnodes we kept above.
7594		if (Vnode* coveredNode = vnode->covers)
7595			put_vnode(coveredNode);
7596		if (Vnode* coveringNode = vnode->covered_by)
7597			put_vnode(coveringNode);
7598
7599		free_vnode(vnode, false);
7600	}
7601
7602	// remove the mount structure from the hash table
7603	mutex_lock(&sMountMutex);
7604	hash_remove(sMountsTable, mount);
7605	mutex_unlock(&sMountMutex);
7606
7607	mountOpLocker.Unlock();
7608
7609	FS_MOUNT_CALL_NO_PARAMS(mount, unmount);
7610	notify_unmount(mount->id);
7611
7612	// dereference the partition and mark it unmounted
7613	if (partition) {
7614		partition->SetVolumeID(-1);
7615		partition->SetMountCookie(NULL);
7616
7617		if (mount->owns_file_device)
7618			KDiskDeviceManager::Default()->DeleteFileDevice(partition->ID());
7619		partition->Unregister();
7620	}
7621
7622	delete mount;
7623	return B_OK;
7624}
7625
7626
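/*!	\brief Synchronizes the volume specified by \a device to disk.

	First writes back all modified file caches; a marker vnode keeps the
	iteration position stable while the vnode list has to be unlocked for
	the actual write-back. Afterwards the file system's own sync() hook is
	called.
*/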
7627static status_t
7628fs_sync(dev_t device)
7629{
7630	struct fs_mount* mount;
7631	status_t status = get_mount(device, &mount);
7632	if (status != B_OK)
7633		return status;
7634
7635	struct vnode marker;
7636	memset(&marker, 0, sizeof(marker));
7637	marker.SetBusy(true);
7638	marker.SetRemoved(true);
7639
7640	// First, synchronize all file caches
7641
7642	while (true) {
7643		WriteLocker locker(sVnodeLock);
			// Note: That's the easy way, which is probably OK for sync(),
			// since it's a relatively rare call and doesn't need to allow for
			// a lot of concurrency. Using a read lock would be possible, but
			// also more involved, since we would have to lock the individual
			// nodes and take care of the locking order, which we might not
			// want to do while holding fs_mount::rlock.
7650
7651		// synchronize access to vnode list
7652		recursive_lock_lock(&mount->rlock);
7653
7654		struct vnode* vnode;
7655		if (!marker.IsRemoved()) {
7656			vnode = mount->vnodes.GetNext(&marker);
7657			mount->vnodes.Remove(&marker);
7658			marker.SetRemoved(true);
7659		} else
7660			vnode = mount->vnodes.First();
7661
7662		while (vnode != NULL && (vnode->cache == NULL
7663			|| vnode->IsRemoved() || vnode->IsBusy())) {
7664			// TODO: we could track writes (and writable mapped vnodes)
7665			//	and have a simple flag that we could test for here
7666			vnode = mount->vnodes.GetNext(vnode);
7667		}
7668
7669		if (vnode != NULL) {
7670			// insert marker vnode again
7671			mount->vnodes.Insert(mount->vnodes.GetNext(vnode), &marker);
7672			marker.SetRemoved(false);
7673		}
7674
7675		recursive_lock_unlock(&mount->rlock);
7676
7677		if (vnode == NULL)
7678			break;
7679
7680		vnode = lookup_vnode(mount->id, vnode->id);
7681		if (vnode == NULL || vnode->IsBusy())
7682			continue;
7683
7684		if (vnode->ref_count == 0) {
7685			// this vnode has been unused before
7686			vnode_used(vnode);
7687		}
7688		inc_vnode_ref_count(vnode);
7689
7690		locker.Unlock();
7691
7692		if (vnode->cache != NULL && !vnode->IsRemoved())
7693			vnode->cache->WriteModified();
7694
7695		put_vnode(vnode);
7696	}
7697
7698	// And then, let the file systems do their synchronizing work
7699
7700	if (HAS_FS_MOUNT_CALL(mount, sync))
7701		status = FS_MOUNT_CALL_NO_PARAMS(mount, sync);
7702
7703	put_mount(mount);
7704	return status;
7705}
7706
7707
7708static status_t
7709fs_read_info(dev_t device, struct fs_info* info)
7710{
7711	struct fs_mount* mount;
7712	status_t status = get_mount(device, &mount);
7713	if (status != B_OK)
7714		return status;
7715
7716	memset(info, 0, sizeof(struct fs_info));
7717
7718	if (HAS_FS_MOUNT_CALL(mount, read_fs_info))
7719		status = FS_MOUNT_CALL(mount, read_fs_info, info);
7720
7721	// fill in info the file system doesn't (have to) know about
7722	if (status == B_OK) {
7723		info->dev = mount->id;
7724		info->root = mount->root_vnode->id;
7725
7726		fs_volume* volume = mount->volume;
7727		while (volume->super_volume != NULL)
7728			volume = volume->super_volume;
7729
7730		strlcpy(info->fsh_name, volume->file_system_name,
7731			sizeof(info->fsh_name));
7732		if (mount->device_name != NULL) {
7733			strlcpy(info->device_name, mount->device_name,
7734				sizeof(info->device_name));
7735		}
7736	}
7737
	// Even if the call is not supported by the file system, we still
	// return the parts that we filled out ourselves above.
7740
7741	put_mount(mount);
7742	return status;
7743}
7744
7745
7746static status_t
7747fs_write_info(dev_t device, const struct fs_info* info, int mask)
7748{
7749	struct fs_mount* mount;
7750	status_t status = get_mount(device, &mount);
7751	if (status != B_OK)
7752		return status;
7753
7754	if (HAS_FS_MOUNT_CALL(mount, write_fs_info))
7755		status = FS_MOUNT_CALL(mount, write_fs_info, info, mask);
7756	else
7757		status = B_READ_ONLY_DEVICE;
7758
7759	put_mount(mount);
7760	return status;
7761}
7762
7763
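/*!	\brief Iterates through the IDs of all mounted volumes.

	\a _cookie must be 0 on the first call and is advanced past the
	returned device, so already unmounted IDs are skipped without missing
	any volume that is still mounted.
*/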
7764static dev_t
7765fs_next_device(int32* _cookie)
7766{
7767	struct fs_mount* mount = NULL;
7768	dev_t device = *_cookie;
7769
7770	mutex_lock(&sMountMutex);
7771
	// Since device IDs are assigned sequentially, this algorithm
	// works well enough. It makes sure that the device list
	// returned is sorted, and that no device is skipped when an
	// already visited device got unmounted.
7776
7777	while (device < sNextMountID) {
7778		mount = find_mount(device++);
7779		if (mount != NULL && mount->volume->private_volume != NULL)
7780			break;
7781	}
7782
7783	*_cookie = device;
7784
7785	if (mount != NULL)
7786		device = mount->id;
7787	else
7788		device = B_BAD_VALUE;
7789
7790	mutex_unlock(&sMountMutex);
7791
7792	return device;
7793}
7794
7795
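/*!	\brief Kernel convenience function: reads up to \a readBytes bytes from
	the attribute \a attribute of the node \a fd refers to, starting at
	\a pos. The \a type argument is currently unused.

	A minimal usage sketch (the attribute name is just an example):
	\code
	char mimeType[B_MIME_TYPE_LENGTH];
	ssize_t bytesRead = fs_read_attr(fd, "BEOS:TYPE", B_MIME_STRING_TYPE, 0,
		mimeType, sizeof(mimeType));
	\endcode

	\return The number of bytes read, or an error code.
*/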
7796ssize_t
7797fs_read_attr(int fd, const char *attribute, uint32 type, off_t pos,
7798	void *buffer, size_t readBytes)
7799{
7800	int attrFD = attr_open(fd, NULL, attribute, O_RDONLY, true);
7801	if (attrFD < 0)
7802		return attrFD;
7803
7804	ssize_t bytesRead = _kern_read(attrFD, pos, buffer, readBytes);
7805
7806	_kern_close(attrFD);
7807
7808	return bytesRead;
7809}
7810
7811
7812static status_t
7813get_cwd(char* buffer, size_t size, bool kernel)
7814{
7815	// Get current working directory from io context
7816	struct io_context* context = get_current_io_context(kernel);
7817	status_t status;
7818
	FUNCTION(("get_cwd: buf %p, size %ld\n", buffer, size));
7820
7821	mutex_lock(&context->io_mutex);
7822
7823	struct vnode* vnode = context->cwd;
7824	if (vnode)
7825		inc_vnode_ref_count(vnode);
7826
7827	mutex_unlock(&context->io_mutex);
7828
7829	if (vnode) {
7830		status = dir_vnode_to_path(vnode, buffer, size, kernel);
7831		put_vnode(vnode);
7832	} else
7833		status = B_ERROR;
7834
7835	return status;
7836}
7837
7838
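/*!	\brief Sets the current working directory of the calling team's IO
	context to the directory specified by a FD + path pair, and puts the
	reference to the previous working directory.
*/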
7839static status_t
7840set_cwd(int fd, char* path, bool kernel)
7841{
7842	struct io_context* context;
7843	struct vnode* vnode = NULL;
7844	struct vnode* oldDirectory;
7845	status_t status;
7846
7847	FUNCTION(("set_cwd: path = \'%s\'\n", path));
7848
7849	// Get vnode for passed path, and bail if it failed
7850	status = fd_and_path_to_vnode(fd, path, true, &vnode, NULL, kernel);
7851	if (status < 0)
7852		return status;
7853
7854	if (!S_ISDIR(vnode->Type())) {
7855		// nope, can't cwd to here
7856		status = B_NOT_A_DIRECTORY;
7857		goto err;
7858	}
7859
7860	// Get current io context and lock
7861	context = get_current_io_context(kernel);
7862	mutex_lock(&context->io_mutex);
7863
7864	// save the old current working directory first
7865	oldDirectory = context->cwd;
7866	context->cwd = vnode;
7867
7868	mutex_unlock(&context->io_mutex);
7869
7870	if (oldDirectory)
7871		put_vnode(oldDirectory);
7872
7873	return B_NO_ERROR;
7874
7875err:
7876	put_vnode(vnode);
7877	return status;
7878}


//	#pragma mark - kernel mirrored syscalls


dev_t
_kern_mount(const char* path, const char* device, const char* fsName,
	uint32 flags, const char* args, size_t argsLength)
{
	KPath pathBuffer(path, false, B_PATH_NAME_LENGTH + 1);
	if (pathBuffer.InitCheck() != B_OK)
		return B_NO_MEMORY;

	return fs_mount(pathBuffer.LockBuffer(), device, fsName, flags, args, true);
}


status_t
_kern_unmount(const char* path, uint32 flags)
{
	KPath pathBuffer(path, false, B_PATH_NAME_LENGTH + 1);
	if (pathBuffer.InitCheck() != B_OK)
		return B_NO_MEMORY;

	return fs_unmount(pathBuffer.LockBuffer(), -1, flags, true);
}


status_t
_kern_read_fs_info(dev_t device, struct fs_info* info)
{
	if (info == NULL)
		return B_BAD_VALUE;

	return fs_read_info(device, info);
}


status_t
_kern_write_fs_info(dev_t device, const struct fs_info* info, int mask)
{
	if (info == NULL)
		return B_BAD_VALUE;

	return fs_write_info(device, info, mask);
}


status_t
_kern_sync(void)
{
	// Note: _kern_sync() is also called from _user_sync()
	int32 cookie = 0;
	dev_t device;
	while ((device = next_dev(&cookie)) >= 0) {
		status_t status = fs_sync(device);
		if (status != B_OK && status != B_BAD_VALUE) {
			dprintf("sync: device %" B_PRIdDEV " couldn't sync: %s\n", device,
				strerror(status));
		}
	}

	return B_OK;
}


dev_t
_kern_next_device(int32* _cookie)
{
	return fs_next_device(_cookie);
}


status_t
_kern_get_next_fd_info(team_id teamID, uint32* _cookie, fd_info* info,
	size_t infoSize)
{
	if (infoSize != sizeof(fd_info))
		return B_BAD_VALUE;

	// get the team
	Team* team = Team::Get(teamID);
	if (team == NULL)
		return B_BAD_TEAM_ID;
	BReference<Team> teamReference(team, true);

	// now that we have a team reference, its I/O context won't go away
	io_context* context = team->io_context;
	MutexLocker contextLocker(context->io_mutex);

	uint32 slot = *_cookie;

	struct file_descriptor* descriptor;
	while (slot < context->table_size
		&& (descriptor = context->fds[slot]) == NULL) {
		slot++;
	}

	if (slot >= context->table_size)
		return B_ENTRY_NOT_FOUND;

	info->number = slot;
	info->open_mode = descriptor->open_mode;

	struct vnode* vnode = fd_vnode(descriptor);
	if (vnode != NULL) {
		info->device = vnode->device;
		info->node = vnode->id;
	} else if (descriptor->u.mount != NULL) {
		info->device = descriptor->u.mount->id;
		info->node = -1;
	}

	*_cookie = slot + 1;
	return B_OK;
}


int
_kern_open_entry_ref(dev_t device, ino_t inode, const char* name, int openMode,
	int perms)
{
	if ((openMode & O_CREAT) != 0) {
		return file_create_entry_ref(device, inode, name, openMode, perms,
			true);
	}

	return file_open_entry_ref(device, inode, name, openMode, true);
}


/*!	\brief Opens a node specified by a FD + path pair.

	At least one of \a fd and \a path must be specified.
	If only \a fd is given, the function opens the node identified by this
	FD. If only a path is given, this path is opened. If both are given and
	the path is absolute, \a fd is ignored; a relative path is reckoned off
	of the directory (!) identified by \a fd.

	\param fd The FD. May be < 0.
	\param path The absolute or relative path. May be \c NULL.
	\param openMode The open mode.
	\param perms The access permissions a newly created file shall have
		   (only relevant if \c O_CREAT is set in \a openMode).
	\return A FD referring to the newly opened node, or an error code,
			if an error occurs.
*/
int
_kern_open(int fd, const char* path, int openMode, int perms)
{
	KPath pathBuffer(path, false, B_PATH_NAME_LENGTH + 1);
	if (pathBuffer.InitCheck() != B_OK)
		return B_NO_MEMORY;

	if (openMode & O_CREAT)
		return file_create(fd, pathBuffer.LockBuffer(), openMode, perms, true);

	return file_open(fd, pathBuffer.LockBuffer(), openMode, true);
}
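

/*!	A sketch of the FD + path semantics documented above, with hypothetical
	paths: a relative path is resolved against the directory FD, while an
	absolute path makes the FD irrelevant.

	\code
	int dirFD = _kern_open_dir(-1, "/boot/home");
	// opens /boot/home/mail/inbox -- dirFD is the base of the relative path
	int fd1 = _kern_open(dirFD, "mail/inbox", O_RDONLY, 0);
	// opens /boot/home/Desktop -- the absolute path causes dirFD to be ignored
	int fd2 = _kern_open(dirFD, "/boot/home/Desktop", O_RDONLY, 0);
	\endcode
*/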


/*!	\brief Opens a directory specified by entry_ref or node_ref.

	The supplied name may be \c NULL, in which case the directory identified
	by \a device and \a inode will be opened. Otherwise \a device and
	\a inode identify the parent directory of the directory to be opened
	and \a name its entry name.

	\param device If \a name is specified the ID of the device the parent
		   directory of the directory to be opened resides on, otherwise
		   the device of the directory itself.
	\param inode If \a name is specified the node ID of the parent
		   directory of the directory to be opened, otherwise the node ID of
		   the directory itself.
	\param name The entry name of the directory to be opened. If \c NULL,
		   the \a device + \a inode pair identify the node to be opened.
	\return The FD of the newly opened directory or an error code, if
			something went wrong.
*/
int
_kern_open_dir_entry_ref(dev_t device, ino_t inode, const char* name)
{
	return dir_open_entry_ref(device, inode, name, true);
}


/*!	\brief Opens a directory specified by a FD + path pair.

	At least one of \a fd and \a path must be specified.
	If only \a fd is given, the function opens the directory identified by this
	FD. If only a path is given, this path is opened. If both are given and
	the path is absolute, \a fd is ignored; a relative path is reckoned off
	of the directory (!) identified by \a fd.

	\param fd The FD. May be < 0.
	\param path The absolute or relative path. May be \c NULL.
	\return A FD referring to the newly opened directory, or an error code,
			if an error occurs.
*/
int
_kern_open_dir(int fd, const char* path)
{
	KPath pathBuffer(path, false, B_PATH_NAME_LENGTH + 1);
	if (pathBuffer.InitCheck() != B_OK)
		return B_NO_MEMORY;

	return dir_open(fd, pathBuffer.LockBuffer(), true);
}


status_t
_kern_fcntl(int fd, int op, size_t argument)
{
	return common_fcntl(fd, op, argument, true);
}


status_t
_kern_fsync(int fd)
{
	return common_sync(fd, true);
}


status_t
_kern_lock_node(int fd)
{
	return common_lock_node(fd, true);
}


status_t
_kern_unlock_node(int fd)
{
	return common_unlock_node(fd, true);
}


status_t
_kern_create_dir_entry_ref(dev_t device, ino_t inode, const char* name,
	int perms)
{
	return dir_create_entry_ref(device, inode, name, perms, true);
}


/*!	\brief Creates a directory specified by a FD + path pair.

	\a path must always be specified (it contains the name of the new directory
	at least). If only a path is given, this path identifies the location at
	which the directory shall be created. If both \a fd and \a path are given
	and the path is absolute, \a fd is ignored; a relative path is reckoned off
	of the directory (!) identified by \a fd.

	\param fd The FD. May be < 0.
	\param path The absolute or relative path. Must not be \c NULL.
	\param perms The access permissions the new directory shall have.
	\return \c B_OK, if the directory has been created successfully, another
			error code otherwise.
*/
status_t
_kern_create_dir(int fd, const char* path, int perms)
{
	KPath pathBuffer(path, false, B_PATH_NAME_LENGTH + 1);
	if (pathBuffer.InitCheck() != B_OK)
		return B_NO_MEMORY;

	return dir_create(fd, pathBuffer.LockBuffer(), perms, true);
}


status_t
_kern_remove_dir(int fd, const char* path)
{
	if (path) {
		KPath pathBuffer(path, false, B_PATH_NAME_LENGTH + 1);
		if (pathBuffer.InitCheck() != B_OK)
			return B_NO_MEMORY;

		return dir_remove(fd, pathBuffer.LockBuffer(), true);
	}

	return dir_remove(fd, NULL, true);
}


/*!	\brief Reads the contents of a symlink referred to by a FD + path pair.

	At least one of \a fd and \a path must be specified.
	If only \a fd is given, the symlink to be read is the node identified by
	this FD. If only a path is given, this path identifies the symlink to be
	read. If both are given and the path is absolute, \a fd is ignored; a
	relative path is reckoned off of the directory (!) identified by \a fd.
	If this function fails with B_BUFFER_OVERFLOW, the \a _bufferSize pointer
	will still be updated to reflect the required buffer size.

	\param fd The FD. May be < 0.
	\param path The absolute or relative path. May be \c NULL.
	\param buffer The buffer into which the contents of the symlink shall be
		   written.
	\param _bufferSize A pointer to the size of the supplied buffer.
	\return The length of the link on success or an appropriate error code.
*/
status_t
_kern_read_link(int fd, const char* path, char* buffer, size_t* _bufferSize)
{
	if (path) {
		KPath pathBuffer(path, false, B_PATH_NAME_LENGTH + 1);
		if (pathBuffer.InitCheck() != B_OK)
			return B_NO_MEMORY;

		return common_read_link(fd, pathBuffer.LockBuffer(),
			buffer, _bufferSize, true);
	}

	return common_read_link(fd, NULL, buffer, _bufferSize, true);
}
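

/*!	Sketch of the retry pattern that the B_BUFFER_OVERFLOW contract above
	enables: since \a _bufferSize is updated even on failure, a caller can
	size the second attempt from the first.

	\code
	char small[64];
	size_t size = sizeof(small);
	status_t status = _kern_read_link(fd, path, small, &size);
	if (status == B_BUFFER_OVERFLOW) {
		// 'size' now holds the required length -- retry with a larger buffer
		char* large = (char*)malloc(size);
		if (large != NULL) {
			status = _kern_read_link(fd, path, large, &size);
			// ... use the link contents, then free(large) ...
		}
	}
	\endcode
*/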


/*!	\brief Creates a symlink specified by a FD + path pair.

	\a path must always be specified (it contains the name of the new symlink
	at least). If only a path is given, this path identifies the location at
	which the symlink shall be created. If both \a fd and \a path are given and
	the path is absolute, \a fd is ignored; a relative path is reckoned off
	of the directory (!) identified by \a fd.

	\param fd The FD. May be < 0.
	\param path The absolute or relative path of the symlink to be created.
		   Must not be \c NULL.
	\param toPath The path the new symlink shall point to.
	\param mode The access permissions the new symlink shall have.
	\return \c B_OK, if the symlink has been created successfully, another
			error code otherwise.
*/
status_t
_kern_create_symlink(int fd, const char* path, const char* toPath, int mode)
{
	KPath pathBuffer(path, false, B_PATH_NAME_LENGTH + 1);
	if (pathBuffer.InitCheck() != B_OK)
		return B_NO_MEMORY;

	return common_create_symlink(fd, pathBuffer.LockBuffer(),
		toPath, mode, true);
}


status_t
_kern_create_link(int pathFD, const char* path, int toFD, const char* toPath,
	bool traverseLeafLink)
{
	KPath pathBuffer(path, false, B_PATH_NAME_LENGTH + 1);
	KPath toPathBuffer(toPath, false, B_PATH_NAME_LENGTH + 1);
	if (pathBuffer.InitCheck() != B_OK || toPathBuffer.InitCheck() != B_OK)
		return B_NO_MEMORY;

	return common_create_link(pathFD, pathBuffer.LockBuffer(), toFD,
		toPathBuffer.LockBuffer(), traverseLeafLink, true);
}


/*!	\brief Removes an entry specified by a FD + path pair from its directory.

	\a path must always be specified (it contains at least the name of the entry
	to be deleted). If only a path is given, this path identifies the entry
	directly. If both \a fd and \a path are given and the path is absolute,
	\a fd is ignored; a relative path is reckoned off of the directory (!)
	identified by \a fd.

	\param fd The FD. May be < 0.
	\param path The absolute or relative path. Must not be \c NULL.
	\return \c B_OK, if the entry has been removed successfully, another
			error code otherwise.
*/
status_t
_kern_unlink(int fd, const char* path)
{
	KPath pathBuffer(path, false, B_PATH_NAME_LENGTH + 1);
	if (pathBuffer.InitCheck() != B_OK)
		return B_NO_MEMORY;

	return common_unlink(fd, pathBuffer.LockBuffer(), true);
}


/*!	\brief Moves an entry specified by a FD + path pair to an entry specified
		   by another FD + path pair.

	\a oldPath and \a newPath must always be specified (they contain at least
	the name of the entry). If only a path is given, this path identifies the
	entry directly. If both a FD and a path are given and the path is absolute,
	the FD is ignored; a relative path is reckoned off of the directory (!)
	identified by the respective FD.

	\param oldFD The FD of the old location. May be < 0.
	\param oldPath The absolute or relative path of the old location. Must not
		   be \c NULL.
	\param newFD The FD of the new location. May be < 0.
	\param newPath The absolute or relative path of the new location. Must not
		   be \c NULL.
	\return \c B_OK, if the entry has been moved successfully, another
			error code otherwise.
*/
status_t
_kern_rename(int oldFD, const char* oldPath, int newFD, const char* newPath)
{
	KPath oldPathBuffer(oldPath, false, B_PATH_NAME_LENGTH + 1);
	KPath newPathBuffer(newPath, false, B_PATH_NAME_LENGTH + 1);
	if (oldPathBuffer.InitCheck() != B_OK || newPathBuffer.InitCheck() != B_OK)
		return B_NO_MEMORY;

	return common_rename(oldFD, oldPathBuffer.LockBuffer(),
		newFD, newPathBuffer.LockBuffer(), true);
}


status_t
_kern_access(int fd, const char* path, int mode, bool effectiveUserGroup)
{
	KPath pathBuffer(path, false, B_PATH_NAME_LENGTH + 1);
	if (pathBuffer.InitCheck() != B_OK)
		return B_NO_MEMORY;

	return common_access(fd, pathBuffer.LockBuffer(), mode, effectiveUserGroup,
		true);
}


/*!	\brief Reads stat data of an entity specified by a FD + path pair.

	If only \a fd is given, the stat operation associated with the type
	of the FD (node, attr, attr dir etc.) is performed. If only \a path is
	given, this path identifies the entry for whose node to retrieve the
	stat data. If both \a fd and \a path are given and the path is absolute,
	\a fd is ignored; a relative path is reckoned off of the directory (!)
	identified by \a fd and specifies the entry whose stat data shall be
	retrieved.

	\param fd The FD. May be < 0.
	\param path The absolute or relative path. May be \c NULL.
	\param traverseLeafLink If \a path is given, \c true specifies that the
		   function shall not stick to symlinks, but traverse them.
	\param stat The buffer the stat data shall be written into.
	\param statSize The size of the supplied stat buffer.
	\return \c B_OK, if the stat data have been read successfully, another
			error code otherwise.
*/
status_t
_kern_read_stat(int fd, const char* path, bool traverseLeafLink,
	struct stat* stat, size_t statSize)
{
	struct stat completeStat;
	struct stat* originalStat = NULL;
	status_t status;

	if (statSize > sizeof(struct stat))
		return B_BAD_VALUE;

	// this supports different stat extensions
	if (statSize < sizeof(struct stat)) {
		originalStat = stat;
		stat = &completeStat;
	}

	status = vfs_read_stat(fd, path, traverseLeafLink, stat, true);

	if (status == B_OK && originalStat != NULL)
		memcpy(originalStat, stat, statSize);

	return status;
}
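

/*!	The statSize mechanism above exists for binary compatibility: a caller
	compiled against an older, smaller struct stat passes its own size and
	receives only the fields it knows about. A hedged sketch -- the
	"legacy_stat" type is hypothetical and merely stands for such an older,
	prefix-compatible layout:

	\code
	struct legacy_stat oldStat;
		// assumed: a prefix of the current struct stat
	status_t status = _kern_read_stat(fd, NULL, false,
		(struct stat*)&oldStat, sizeof(oldStat));
		// the kernel fills in a complete struct stat internally and copies
		// back only the first sizeof(oldStat) bytes
	\endcode
*/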


/*!	\brief Writes stat data of an entity specified by a FD + path pair.

	If only \a fd is given, the stat operation associated with the type
	of the FD (node, attr, attr dir etc.) is performed. If only \a path is
	given, this path identifies the entry for whose node to write the
	stat data. If both \a fd and \a path are given and the path is absolute,
	\a fd is ignored; a relative path is reckoned off of the directory (!)
	identified by \a fd and specifies the entry whose stat data shall be
	written.

	\param fd The FD. May be < 0.
	\param path The absolute or relative path. May be \c NULL.
	\param traverseLeafLink If \a path is given, \c true specifies that the
		   function shall not stick to symlinks, but traverse them.
	\param stat The buffer containing the stat data to be written.
	\param statSize The size of the supplied stat buffer.
	\param statMask A mask specifying which parts of the stat data shall be
		   written.
	\return \c B_OK, if the stat data have been written successfully,
			another error code otherwise.
*/
status_t
_kern_write_stat(int fd, const char* path, bool traverseLeafLink,
	const struct stat* stat, size_t statSize, int statMask)
{
	struct stat completeStat;

	if (statSize > sizeof(struct stat))
		return B_BAD_VALUE;

	// this supports different stat extensions
	if (statSize < sizeof(struct stat)) {
		memset((uint8*)&completeStat + statSize, 0,
			sizeof(struct stat) - statSize);
		memcpy(&completeStat, stat, statSize);
		stat = &completeStat;
	}

	status_t status;

	if (path) {
		// path given: write the stat of the node referred to by (fd, path)
		KPath pathBuffer(path, false, B_PATH_NAME_LENGTH + 1);
		if (pathBuffer.InitCheck() != B_OK)
			return B_NO_MEMORY;

		status = common_path_write_stat(fd, pathBuffer.LockBuffer(),
			traverseLeafLink, stat, statMask, true);
	} else {
		// no path given: get the FD and use the FD operation
		struct file_descriptor* descriptor
			= get_fd(get_current_io_context(true), fd);
		if (descriptor == NULL)
			return B_FILE_ERROR;

		if (descriptor->ops->fd_write_stat)
			status = descriptor->ops->fd_write_stat(descriptor, stat, statMask);
		else
			status = B_UNSUPPORTED;

		put_fd(descriptor);
	}

	return status;
}


int
_kern_open_attr_dir(int fd, const char* path, bool traverseLeafLink)
{
	KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
	if (pathBuffer.InitCheck() != B_OK)
		return B_NO_MEMORY;

	if (path != NULL)
		pathBuffer.SetTo(path);

	return attr_dir_open(fd, path ? pathBuffer.LockBuffer() : NULL,
		traverseLeafLink, true);
}


int
_kern_open_attr(int fd, const char* path, const char* name, uint32 type,
	int openMode)
{
	KPath pathBuffer(path, false, B_PATH_NAME_LENGTH + 1);
	if (pathBuffer.InitCheck() != B_OK)
		return B_NO_MEMORY;

	if ((openMode & O_CREAT) != 0) {
		return attr_create(fd, pathBuffer.LockBuffer(), name, type, openMode,
			true);
	}

	return attr_open(fd, pathBuffer.LockBuffer(), name, openMode, true);
}


status_t
_kern_remove_attr(int fd, const char* name)
{
	return attr_remove(fd, name, true);
}


status_t
_kern_rename_attr(int fromFile, const char* fromName, int toFile,
	const char* toName)
{
	return attr_rename(fromFile, fromName, toFile, toName, true);
}


int
_kern_open_index_dir(dev_t device)
{
	return index_dir_open(device, true);
}


status_t
_kern_create_index(dev_t device, const char* name, uint32 type, uint32 flags)
{
	return index_create(device, name, type, flags, true);
}


status_t
_kern_read_index_stat(dev_t device, const char* name, struct stat* stat)
{
	return index_name_read_stat(device, name, stat, true);
}


status_t
_kern_remove_index(dev_t device, const char* name)
{
	return index_remove(device, name, true);
}


status_t
_kern_getcwd(char* buffer, size_t size)
{
	TRACE(("_kern_getcwd: buf %p, %ld\n", buffer, size));

	// Call vfs to get current working directory
	return get_cwd(buffer, size, true);
}


status_t
_kern_setcwd(int fd, const char* path)
{
	KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
	if (pathBuffer.InitCheck() != B_OK)
		return B_NO_MEMORY;

	if (path != NULL)
		pathBuffer.SetTo(path);

	return set_cwd(fd, path != NULL ? pathBuffer.LockBuffer() : NULL, true);
}


//	#pragma mark - userland syscalls


dev_t
_user_mount(const char* userPath, const char* userDevice,
	const char* userFileSystem, uint32 flags, const char* userArgs,
	size_t argsLength)
{
	char fileSystem[B_FILE_NAME_LENGTH];
	KPath path, device;
	char* args = NULL;
	status_t status;

	if (!IS_USER_ADDRESS(userPath)
		|| !IS_USER_ADDRESS(userFileSystem)
		|| !IS_USER_ADDRESS(userDevice))
		return B_BAD_ADDRESS;

	if (path.InitCheck() != B_OK || device.InitCheck() != B_OK)
		return B_NO_MEMORY;

	if (user_strlcpy(path.LockBuffer(), userPath, B_PATH_NAME_LENGTH) < B_OK)
		return B_BAD_ADDRESS;

	if (userFileSystem != NULL
		&& user_strlcpy(fileSystem, userFileSystem, sizeof(fileSystem)) < B_OK)
		return B_BAD_ADDRESS;

	if (userDevice != NULL
		&& user_strlcpy(device.LockBuffer(), userDevice, B_PATH_NAME_LENGTH)
			< B_OK)
		return B_BAD_ADDRESS;

	if (userArgs != NULL && argsLength > 0) {
		// this is a safety restriction
		if (argsLength >= 65536)
			return B_NAME_TOO_LONG;

		args = (char*)malloc(argsLength + 1);
		if (args == NULL)
			return B_NO_MEMORY;

		if (user_strlcpy(args, userArgs, argsLength + 1) < B_OK) {
			free(args);
			return B_BAD_ADDRESS;
		}
	}
	path.UnlockBuffer();
	device.UnlockBuffer();

	status = fs_mount(path.LockBuffer(),
		userDevice != NULL ? device.Path() : NULL,
		userFileSystem ? fileSystem : NULL, flags, args, false);

	free(args);
	return status;
}


status_t
_user_unmount(const char* userPath, uint32 flags)
{
	KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
	if (pathBuffer.InitCheck() != B_OK)
		return B_NO_MEMORY;

	char* path = pathBuffer.LockBuffer();

	if (user_strlcpy(path, userPath, B_PATH_NAME_LENGTH) < B_OK)
		return B_BAD_ADDRESS;

	return fs_unmount(path, -1, flags & ~B_UNMOUNT_BUSY_PARTITION, false);
}


status_t
_user_read_fs_info(dev_t device, struct fs_info* userInfo)
{
	struct fs_info info;
	status_t status;

	if (userInfo == NULL)
		return B_BAD_VALUE;

	if (!IS_USER_ADDRESS(userInfo))
		return B_BAD_ADDRESS;

	status = fs_read_info(device, &info);
	if (status != B_OK)
		return status;

	if (user_memcpy(userInfo, &info, sizeof(struct fs_info)) != B_OK)
		return B_BAD_ADDRESS;

	return B_OK;
}


status_t
_user_write_fs_info(dev_t device, const struct fs_info* userInfo, int mask)
{
	struct fs_info info;

	if (userInfo == NULL)
		return B_BAD_VALUE;

	if (!IS_USER_ADDRESS(userInfo)
		|| user_memcpy(&info, userInfo, sizeof(struct fs_info)) != B_OK)
		return B_BAD_ADDRESS;

	return fs_write_info(device, &info, mask);
}


dev_t
_user_next_device(int32* _userCookie)
{
	int32 cookie;
	dev_t device;

	if (!IS_USER_ADDRESS(_userCookie)
		|| user_memcpy(&cookie, _userCookie, sizeof(int32)) != B_OK)
		return B_BAD_ADDRESS;

	device = fs_next_device(&cookie);

	if (device >= B_OK) {
		// update user cookie
		if (user_memcpy(_userCookie, &cookie, sizeof(int32)) != B_OK)
			return B_BAD_ADDRESS;
	}

	return device;
}


status_t
_user_sync(void)
{
	return _kern_sync();
}


status_t
_user_get_next_fd_info(team_id team, uint32* userCookie, fd_info* userInfo,
	size_t infoSize)
{
	struct fd_info info;
	uint32 cookie;

	// only root can do this (or should root's group be enough?)
	if (geteuid() != 0)
		return B_NOT_ALLOWED;

	if (infoSize != sizeof(fd_info))
		return B_BAD_VALUE;

	if (!IS_USER_ADDRESS(userCookie) || !IS_USER_ADDRESS(userInfo)
		|| user_memcpy(&cookie, userCookie, sizeof(uint32)) != B_OK)
		return B_BAD_ADDRESS;

	status_t status = _kern_get_next_fd_info(team, &cookie, &info, infoSize);
	if (status != B_OK)
		return status;

	if (user_memcpy(userCookie, &cookie, sizeof(uint32)) != B_OK
		|| user_memcpy(userInfo, &info, infoSize) != B_OK)
		return B_BAD_ADDRESS;

	return status;
}


status_t
_user_entry_ref_to_path(dev_t device, ino_t inode, const char* leaf,
	char* userPath, size_t pathLength)
{
	if (!IS_USER_ADDRESS(userPath))
		return B_BAD_ADDRESS;

	KPath path(B_PATH_NAME_LENGTH + 1);
	if (path.InitCheck() != B_OK)
		return B_NO_MEMORY;

	// copy the leaf name onto the stack
	char stackLeaf[B_FILE_NAME_LENGTH];
	if (leaf) {
		if (!IS_USER_ADDRESS(leaf))
			return B_BAD_ADDRESS;

		int length = user_strlcpy(stackLeaf, leaf, B_FILE_NAME_LENGTH);
		if (length < 0)
			return length;
		if (length >= B_FILE_NAME_LENGTH)
			return B_NAME_TOO_LONG;

		leaf = stackLeaf;
	}

	status_t status = vfs_entry_ref_to_path(device, inode, leaf,
		path.LockBuffer(), path.BufferSize());
	if (status != B_OK)
		return status;

	path.UnlockBuffer();

	int length = user_strlcpy(userPath, path.Path(), pathLength);
	if (length < 0)
		return length;
	if (length >= (int)pathLength)
		return B_BUFFER_OVERFLOW;

	return B_OK;
}


status_t
_user_normalize_path(const char* userPath, bool traverseLink, char* buffer)
{
	if (userPath == NULL || buffer == NULL)
		return B_BAD_VALUE;
	if (!IS_USER_ADDRESS(userPath) || !IS_USER_ADDRESS(buffer))
		return B_BAD_ADDRESS;

	// copy path from userland
	KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
	if (pathBuffer.InitCheck() != B_OK)
		return B_NO_MEMORY;
	char* path = pathBuffer.LockBuffer();

	if (user_strlcpy(path, userPath, B_PATH_NAME_LENGTH) < B_OK)
		return B_BAD_ADDRESS;

	status_t error = normalize_path(path, pathBuffer.BufferSize(), traverseLink,
		false);
	if (error != B_OK)
		return error;

	// copy back to userland
	int len = user_strlcpy(buffer, path, B_PATH_NAME_LENGTH);
	if (len < 0)
		return len;
	if (len >= B_PATH_NAME_LENGTH)
		return B_BUFFER_OVERFLOW;

	return B_OK;
}
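

/*!	Illustrative only: given a hypothetical layout, the call above turns
	"/boot/home/../home/./Desktop" into "/boot/home/Desktop"; with
	\a traverseLink set, a trailing symlink is additionally resolved to the
	path of its target.
*/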


int
_user_open_entry_ref(dev_t device, ino_t inode, const char* userName,
	int openMode, int perms)
{
	char name[B_FILE_NAME_LENGTH];

	if (userName == NULL || device < 0 || inode < 0)
		return B_BAD_VALUE;
	if (!IS_USER_ADDRESS(userName)
		|| user_strlcpy(name, userName, sizeof(name)) < B_OK)
		return B_BAD_ADDRESS;

	if ((openMode & O_CREAT) != 0) {
		return file_create_entry_ref(device, inode, name, openMode, perms,
			false);
	}

	return file_open_entry_ref(device, inode, name, openMode, false);
}


int
_user_open(int fd, const char* userPath, int openMode, int perms)
{
	KPath path(B_PATH_NAME_LENGTH + 1);
	if (path.InitCheck() != B_OK)
		return B_NO_MEMORY;

	char* buffer = path.LockBuffer();

	if (!IS_USER_ADDRESS(userPath)
		|| user_strlcpy(buffer, userPath, B_PATH_NAME_LENGTH) < B_OK)
		return B_BAD_ADDRESS;

	if ((openMode & O_CREAT) != 0)
		return file_create(fd, buffer, openMode, perms, false);

	return file_open(fd, buffer, openMode, false);
}


int
_user_open_dir_entry_ref(dev_t device, ino_t inode, const char* userName)
{
	if (userName != NULL) {
		char name[B_FILE_NAME_LENGTH];

		if (!IS_USER_ADDRESS(userName)
			|| user_strlcpy(name, userName, sizeof(name)) < B_OK)
			return B_BAD_ADDRESS;

		return dir_open_entry_ref(device, inode, name, false);
	}
	return dir_open_entry_ref(device, inode, NULL, false);
}


int
_user_open_dir(int fd, const char* userPath)
{
	if (userPath == NULL)
		return dir_open(fd, NULL, false);

	KPath path(B_PATH_NAME_LENGTH + 1);
	if (path.InitCheck() != B_OK)
		return B_NO_MEMORY;

	char* buffer = path.LockBuffer();

	if (!IS_USER_ADDRESS(userPath)
		|| user_strlcpy(buffer, userPath, B_PATH_NAME_LENGTH) < B_OK)
		return B_BAD_ADDRESS;

	return dir_open(fd, buffer, false);
}


/*!	\brief Opens a directory's parent directory and returns the entry name
		   of the former.

	Aside from returning the directory's entry name, this method is
	equivalent to \code _user_open_dir(fd, "..") \endcode. It really is
	equivalent, if \a userName is \c NULL.

	If a name buffer is supplied and the name does not fit the buffer, the
	function fails. A buffer of size \c B_FILE_NAME_LENGTH should be safe.

	\param fd A FD referring to a directory.
	\param userName Buffer the directory's entry name shall be written into.
		   May be \c NULL.
	\param nameLength Size of the name buffer.
	\return The file descriptor of the opened parent directory, if everything
			went fine, an error code otherwise.
*/
int
_user_open_parent_dir(int fd, char* userName, size_t nameLength)
{
	bool kernel = false;

	if (userName && !IS_USER_ADDRESS(userName))
		return B_BAD_ADDRESS;

	// open the parent dir
	int parentFD = dir_open(fd, (char*)"..", kernel);
	if (parentFD < 0)
		return parentFD;
	FDCloser fdCloser(parentFD, kernel);

	if (userName) {
		// get the vnodes
		struct vnode* parentVNode = get_vnode_from_fd(parentFD, kernel);
		struct vnode* dirVNode = get_vnode_from_fd(fd, kernel);
		VNodePutter parentVNodePutter(parentVNode);
		VNodePutter dirVNodePutter(dirVNode);
		if (!parentVNode || !dirVNode)
			return B_FILE_ERROR;

		// get the vnode name
		char _buffer[sizeof(struct dirent) + B_FILE_NAME_LENGTH];
		struct dirent* buffer = (struct dirent*)_buffer;
		status_t status = get_vnode_name(dirVNode, parentVNode, buffer,
			sizeof(_buffer), get_current_io_context(false));
		if (status != B_OK)
			return status;

		// copy the name to the userland buffer
		int len = user_strlcpy(userName, buffer->d_name, nameLength);
		if (len < 0)
			return len;
		if (len >= (int)nameLength)
			return B_BUFFER_OVERFLOW;
	}

	return fdCloser.Detach();
}
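

/*!	A hedged caller-side sketch (userland reaches this through the
	corresponding _kern_open_parent_dir() syscall stub; \c dirFD is assumed
	to be an open directory FD):

	\code
	char name[B_FILE_NAME_LENGTH];
	int parentFD = _kern_open_parent_dir(dirFD, name, sizeof(name));
	if (parentFD >= 0) {
		// parentFD refers to dirFD's parent directory and 'name' holds
		// dirFD's entry name within it
	}
	\endcode
*/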


status_t
_user_fcntl(int fd, int op, size_t argument)
{
	status_t status = common_fcntl(fd, op, argument, false);
	if (op == F_SETLKW)
		syscall_restart_handle_post(status);

	return status;
}


status_t
_user_fsync(int fd)
{
	return common_sync(fd, false);
}


status_t
_user_flock(int fd, int operation)
{
	FUNCTION(("_user_flock(fd = %d, op = %d)\n", fd, operation));

	// Check if the operation is valid
	switch (operation & ~LOCK_NB) {
		case LOCK_UN:
		case LOCK_SH:
		case LOCK_EX:
			break;

		default:
			return B_BAD_VALUE;
	}

	struct file_descriptor* descriptor;
	struct vnode* vnode;
	descriptor = get_fd_and_vnode(fd, &vnode, false);
	if (descriptor == NULL)
		return B_FILE_ERROR;

	if (descriptor->type != FDTYPE_FILE) {
		put_fd(descriptor);
		return B_BAD_VALUE;
	}

	struct flock flock;
	flock.l_start = 0;
	flock.l_len = OFF_MAX;
	flock.l_whence = 0;
	flock.l_type = (operation & LOCK_SH) != 0 ? F_RDLCK : F_WRLCK;

	status_t status;
	if ((operation & LOCK_UN) != 0)
		status = release_advisory_lock(vnode, &flock);
	else {
		status = acquire_advisory_lock(vnode,
			thread_get_current_thread()->team->session_id, &flock,
			(operation & LOCK_NB) == 0);
	}

	syscall_restart_handle_post(status);

	put_fd(descriptor);
	return status;
}


status_t
_user_lock_node(int fd)
{
	return common_lock_node(fd, false);
}


status_t
_user_unlock_node(int fd)
{
	return common_unlock_node(fd, false);
}


status_t
_user_create_dir_entry_ref(dev_t device, ino_t inode, const char* userName,
	int perms)
{
	char name[B_FILE_NAME_LENGTH];
	status_t status;

	if (!IS_USER_ADDRESS(userName))
		return B_BAD_ADDRESS;

	status = user_strlcpy(name, userName, sizeof(name));
	if (status < 0)
		return status;

	return dir_create_entry_ref(device, inode, name, perms, false);
}


status_t
_user_create_dir(int fd, const char* userPath, int perms)
{
	KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
	if (pathBuffer.InitCheck() != B_OK)
		return B_NO_MEMORY;

	char* path = pathBuffer.LockBuffer();

	if (!IS_USER_ADDRESS(userPath)
		|| user_strlcpy(path, userPath, B_PATH_NAME_LENGTH) < B_OK)
		return B_BAD_ADDRESS;

	return dir_create(fd, path, perms, false);
}


status_t
_user_remove_dir(int fd, const char* userPath)
{
	KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
	if (pathBuffer.InitCheck() != B_OK)
		return B_NO_MEMORY;

	char* path = pathBuffer.LockBuffer();

	if (userPath != NULL) {
		if (!IS_USER_ADDRESS(userPath)
			|| user_strlcpy(path, userPath, B_PATH_NAME_LENGTH) < B_OK)
			return B_BAD_ADDRESS;
	}

	return dir_remove(fd, userPath ? path : NULL, false);
}


status_t
_user_read_link(int fd, const char* userPath, char* userBuffer,
	size_t* userBufferSize)
{
	KPath pathBuffer(B_PATH_NAME_LENGTH + 1), linkBuffer;
	if (pathBuffer.InitCheck() != B_OK || linkBuffer.InitCheck() != B_OK)
		return B_NO_MEMORY;

	size_t bufferSize;

	if (!IS_USER_ADDRESS(userBuffer) || !IS_USER_ADDRESS(userBufferSize)
		|| user_memcpy(&bufferSize, userBufferSize, sizeof(size_t)) != B_OK)
		return B_BAD_ADDRESS;

	char* path = pathBuffer.LockBuffer();
	char* buffer = linkBuffer.LockBuffer();

	if (userPath) {
		if (!IS_USER_ADDRESS(userPath)
			|| user_strlcpy(path, userPath, B_PATH_NAME_LENGTH) < B_OK)
			return B_BAD_ADDRESS;

		if (bufferSize > B_PATH_NAME_LENGTH)
			bufferSize = B_PATH_NAME_LENGTH;
	}

	status_t status = common_read_link(fd, userPath ? path : NULL, buffer,
		&bufferSize, false);

	// we also update the bufferSize in case of errors
	// (the real length will be returned in case of B_BUFFER_OVERFLOW)
	if (user_memcpy(userBufferSize, &bufferSize, sizeof(size_t)) != B_OK)
		return B_BAD_ADDRESS;

	if (status != B_OK)
		return status;

	if (user_memcpy(userBuffer, buffer, bufferSize) != B_OK)
		return B_BAD_ADDRESS;

	return B_OK;
}


status_t
_user_create_symlink(int fd, const char* userPath, const char* userToPath,
	int mode)
{
	KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
	KPath toPathBuffer(B_PATH_NAME_LENGTH + 1);
	if (pathBuffer.InitCheck() != B_OK || toPathBuffer.InitCheck() != B_OK)
		return B_NO_MEMORY;

	char* path = pathBuffer.LockBuffer();
	char* toPath = toPathBuffer.LockBuffer();

	if (!IS_USER_ADDRESS(userPath)
		|| !IS_USER_ADDRESS(userToPath)
		|| user_strlcpy(path, userPath, B_PATH_NAME_LENGTH) < B_OK
		|| user_strlcpy(toPath, userToPath, B_PATH_NAME_LENGTH) < B_OK)
		return B_BAD_ADDRESS;

	return common_create_symlink(fd, path, toPath, mode, false);
}


status_t
_user_create_link(int pathFD, const char* userPath, int toFD,
	const char* userToPath, bool traverseLeafLink)
{
	KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
	KPath toPathBuffer(B_PATH_NAME_LENGTH + 1);
	if (pathBuffer.InitCheck() != B_OK || toPathBuffer.InitCheck() != B_OK)
		return B_NO_MEMORY;

	char* path = pathBuffer.LockBuffer();
	char* toPath = toPathBuffer.LockBuffer();

	if (!IS_USER_ADDRESS(userPath)
		|| !IS_USER_ADDRESS(userToPath)
		|| user_strlcpy(path, userPath, B_PATH_NAME_LENGTH) < B_OK
		|| user_strlcpy(toPath, userToPath, B_PATH_NAME_LENGTH) < B_OK)
		return B_BAD_ADDRESS;

	status_t status = check_path(toPath);
	if (status != B_OK)
		return status;

	return common_create_link(pathFD, path, toFD, toPath, traverseLeafLink,
		false);
}


status_t
_user_unlink(int fd, const char* userPath)
{
	KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
	if (pathBuffer.InitCheck() != B_OK)
		return B_NO_MEMORY;

	char* path = pathBuffer.LockBuffer();

	if (!IS_USER_ADDRESS(userPath)
		|| user_strlcpy(path, userPath, B_PATH_NAME_LENGTH) < B_OK)
		return B_BAD_ADDRESS;

	return common_unlink(fd, path, false);
}


status_t
_user_rename(int oldFD, const char* userOldPath, int newFD,
	const char* userNewPath)
{
	KPath oldPathBuffer(B_PATH_NAME_LENGTH + 1);
	KPath newPathBuffer(B_PATH_NAME_LENGTH + 1);
	if (oldPathBuffer.InitCheck() != B_OK || newPathBuffer.InitCheck() != B_OK)
		return B_NO_MEMORY;

	char* oldPath = oldPathBuffer.LockBuffer();
	char* newPath = newPathBuffer.LockBuffer();

	if (!IS_USER_ADDRESS(userOldPath) || !IS_USER_ADDRESS(userNewPath)
		|| user_strlcpy(oldPath, userOldPath, B_PATH_NAME_LENGTH) < B_OK
		|| user_strlcpy(newPath, userNewPath, B_PATH_NAME_LENGTH) < B_OK)
		return B_BAD_ADDRESS;

	return common_rename(oldFD, oldPath, newFD, newPath, false);
}


status_t
_user_create_fifo(int fd, const char* userPath, mode_t perms)
{
	KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
	if (pathBuffer.InitCheck() != B_OK)
		return B_NO_MEMORY;

	char* path = pathBuffer.LockBuffer();

	if (!IS_USER_ADDRESS(userPath)
		|| user_strlcpy(path, userPath, B_PATH_NAME_LENGTH) < B_OK) {
		return B_BAD_ADDRESS;
	}

	// split into directory vnode and filename path
	char filename[B_FILE_NAME_LENGTH];
	struct vnode* dir;
	status_t status = fd_and_path_to_dir_vnode(fd, path, &dir, filename, false);
	if (status != B_OK)
		return status;

	VNodePutter _(dir);

	// the underlying FS needs to support creating FIFOs
	if (!HAS_FS_CALL(dir, create_special_node))
		return B_UNSUPPORTED;

	// create the entry -- the FIFO sub node is set up automatically
	fs_vnode superVnode;
	ino_t nodeID;
	status = FS_CALL(dir, create_special_node, filename, NULL,
		S_IFIFO | (perms & S_IUMSK), 0, &superVnode, &nodeID);

	// create_special_node() acquired a reference for us that we don't need.
	if (status == B_OK)
		put_vnode(dir->mount->volume, nodeID);

	return status;
}


status_t
_user_create_pipe(int* userFDs)
{
	// rootfs should support creating FIFOs, but let's be sure
	if (!HAS_FS_CALL(sRoot, create_special_node))
		return B_UNSUPPORTED;

	// create the node -- the FIFO sub node is set up automatically
	fs_vnode superVnode;
	ino_t nodeID;
	status_t status = FS_CALL(sRoot, create_special_node, NULL, NULL,
		S_IFIFO | S_IRUSR | S_IWUSR, 0, &superVnode, &nodeID);
	if (status != B_OK)
		return status;

	// We've got one reference to the node and need another one.
	struct vnode* vnode;
	status = get_vnode(sRoot->mount->id, nodeID, &vnode, true, false);
	if (status != B_OK) {
		// that should not happen
		dprintf("_user_create_pipe(): Failed to lookup vnode (%" B_PRIdDEV ", "
			"%" B_PRIdINO ")\n", sRoot->mount->id, nodeID);
		return status;
	}

	// Everything looks good so far. Open two FDs for reading and writing,
	// respectively.
	int fds[2];
	fds[0] = open_vnode(vnode, O_RDONLY, false);
	fds[1] = open_vnode(vnode, O_WRONLY, false);

	FDCloser closer0(fds[0], false);
	FDCloser closer1(fds[1], false);

	status = (fds[0] >= 0 ? (fds[1] >= 0 ? B_OK : fds[1]) : fds[0]);

	// copy FDs to userland
	if (status == B_OK) {
		if (!IS_USER_ADDRESS(userFDs)
			|| user_memcpy(userFDs, fds, sizeof(fds)) != B_OK) {
			status = B_BAD_ADDRESS;
		}
	}

	// keep FDs, if everything went fine
	if (status == B_OK) {
		closer0.Detach();
		closer1.Detach();
	}

	return status;
}
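

/*!	This syscall is what backs the POSIX pipe() function in userland: fds[0]
	is opened O_RDONLY and fds[1] O_WRONLY, matching the usual contract. A
	caller-side sketch:

	\code
	int fds[2];
	if (pipe(fds) == 0) {
		write(fds[1], "hello", 5);
		char buffer[5];
		read(fds[0], buffer, sizeof(buffer));
	}
	\endcode
*/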


status_t
_user_access(int fd, const char* userPath, int mode, bool effectiveUserGroup)
{
	KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
	if (pathBuffer.InitCheck() != B_OK)
		return B_NO_MEMORY;

	char* path = pathBuffer.LockBuffer();

	if (!IS_USER_ADDRESS(userPath)
		|| user_strlcpy(path, userPath, B_PATH_NAME_LENGTH) < B_OK)
		return B_BAD_ADDRESS;

	return common_access(fd, path, mode, effectiveUserGroup, false);
}


status_t
_user_read_stat(int fd, const char* userPath, bool traverseLink,
	struct stat* userStat, size_t statSize)
{
	struct stat stat;
	status_t status;

	if (statSize > sizeof(struct stat))
		return B_BAD_VALUE;

	if (!IS_USER_ADDRESS(userStat))
		return B_BAD_ADDRESS;

	if (userPath) {
		// path given: get the stat of the node referred to by (fd, path)
		if (!IS_USER_ADDRESS(userPath))
			return B_BAD_ADDRESS;

		KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
		if (pathBuffer.InitCheck() != B_OK)
			return B_NO_MEMORY;

		char* path = pathBuffer.LockBuffer();

		ssize_t length = user_strlcpy(path, userPath, B_PATH_NAME_LENGTH);
		if (length < B_OK)
			return length;
		if (length >= B_PATH_NAME_LENGTH)
			return B_NAME_TOO_LONG;

		status = common_path_read_stat(fd, path, traverseLink, &stat, false);
	} else {
		// no path given: get the FD and use the FD operation
		struct file_descriptor* descriptor
			= get_fd(get_current_io_context(false), fd);
		if (descriptor == NULL)
			return B_FILE_ERROR;

		if (descriptor->ops->fd_read_stat)
			status = descriptor->ops->fd_read_stat(descriptor, &stat);
		else
			status = B_UNSUPPORTED;

		put_fd(descriptor);
	}

	if (status != B_OK)
		return status;

	return user_memcpy(userStat, &stat, statSize);
}


status_t
_user_write_stat(int fd, const char* userPath, bool traverseLeafLink,
	const struct stat* userStat, size_t statSize, int statMask)
{
	if (statSize > sizeof(struct stat))
		return B_BAD_VALUE;

	struct stat stat;

	if (!IS_USER_ADDRESS(userStat)
		|| user_memcpy(&stat, userStat, statSize) < B_OK)
		return B_BAD_ADDRESS;

	// clear additional stat fields
	if (statSize < sizeof(struct stat))
		memset((uint8*)&stat + statSize, 0, sizeof(struct stat) - statSize);

	status_t status;

	if (userPath) {
		// path given: write the stat of the node referred to by (fd, path)
		if (!IS_USER_ADDRESS(userPath))
			return B_BAD_ADDRESS;

		KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
		if (pathBuffer.InitCheck() != B_OK)
			return B_NO_MEMORY;

		char* path = pathBuffer.LockBuffer();

		ssize_t length = user_strlcpy(path, userPath, B_PATH_NAME_LENGTH);
		if (length < B_OK)
			return length;
		if (length >= B_PATH_NAME_LENGTH)
			return B_NAME_TOO_LONG;

		status = common_path_write_stat(fd, path, traverseLeafLink, &stat,
			statMask, false);
	} else {
		// no path given: get the FD and use the FD operation
		struct file_descriptor* descriptor
			= get_fd(get_current_io_context(false), fd);
		if (descriptor == NULL)
			return B_FILE_ERROR;

		if (descriptor->ops->fd_write_stat) {
			status = descriptor->ops->fd_write_stat(descriptor, &stat,
				statMask);
		} else
			status = B_UNSUPPORTED;

		put_fd(descriptor);
	}

	return status;
}


int
_user_open_attr_dir(int fd, const char* userPath, bool traverseLeafLink)
{
	KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
	if (pathBuffer.InitCheck() != B_OK)
		return B_NO_MEMORY;

	char* path = pathBuffer.LockBuffer();

	if (userPath != NULL) {
		if (!IS_USER_ADDRESS(userPath)
			|| user_strlcpy(path, userPath, B_PATH_NAME_LENGTH) < B_OK)
			return B_BAD_ADDRESS;
	}

	return attr_dir_open(fd, userPath ? path : NULL, traverseLeafLink, false);
}


ssize_t
_user_read_attr(int fd, const char* attribute, off_t pos, void* userBuffer,
	size_t readBytes)
{
	int attr = attr_open(fd, NULL, attribute, O_RDONLY, false);
	if (attr < 0)
		return attr;

	ssize_t bytes = _user_read(attr, pos, userBuffer, readBytes);
	_user_close(attr);

	return bytes;
}


ssize_t
_user_write_attr(int fd, const char* attribute, uint32 type, off_t pos,
	const void* buffer, size_t writeBytes)
{
	// Try to support the BeOS-typical truncation semantics as well as the
	// position argument: a write to position 0 truncates the attribute first.
	int attr = attr_create(fd, NULL, attribute, type,
		O_CREAT | O_WRONLY | (pos != 0 ? 0 : O_TRUNC), false);
	if (attr < 0)
		return attr;

	ssize_t bytes = _user_write(attr, pos, buffer, writeBytes);
	_user_close(attr);

	return bytes;
}
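

/*!	Caller-side sketch of the truncation rule implemented above, through the
	public fs_write_attr() API: a write at position 0 replaces the whole
	attribute, while a write at a non-zero position patches it in place.

	\code
	const char* mime = "text/plain";
	// pos == 0, so any previous contents are truncated away first
	fs_write_attr(fd, "BEOS:TYPE", B_MIME_STRING_TYPE, 0, mime,
		strlen(mime) + 1);
	\endcode
*/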


status_t
_user_stat_attr(int fd, const char* attribute, struct attr_info* userAttrInfo)
{
	int attr = attr_open(fd, NULL, attribute, O_RDONLY, false);
	if (attr < 0)
		return attr;

	struct file_descriptor* descriptor
		= get_fd(get_current_io_context(false), attr);
	if (descriptor == NULL) {
		_user_close(attr);
		return B_FILE_ERROR;
	}

	struct stat stat;
	status_t status;
	if (descriptor->ops->fd_read_stat)
		status = descriptor->ops->fd_read_stat(descriptor, &stat);
	else
		status = B_UNSUPPORTED;

	put_fd(descriptor);
	_user_close(attr);

	if (status == B_OK) {
		attr_info info;
		info.type = stat.st_type;
		info.size = stat.st_size;

		if (user_memcpy(userAttrInfo, &info, sizeof(struct attr_info)) != B_OK)
			return B_BAD_ADDRESS;
	}

	return status;
}


int
_user_open_attr(int fd, const char* userPath, const char* userName,
	uint32 type, int openMode)
{
	char name[B_FILE_NAME_LENGTH];

	if (!IS_USER_ADDRESS(userName)
		|| user_strlcpy(name, userName, B_FILE_NAME_LENGTH) < B_OK)
		return B_BAD_ADDRESS;

	KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
	if (pathBuffer.InitCheck() != B_OK)
		return B_NO_MEMORY;

	char* path = pathBuffer.LockBuffer();

	if (userPath != NULL) {
		if (!IS_USER_ADDRESS(userPath)
			|| user_strlcpy(path, userPath, B_PATH_NAME_LENGTH) < B_OK)
			return B_BAD_ADDRESS;
	}

	if ((openMode & O_CREAT) != 0) {
		return attr_create(fd, userPath ? path : NULL, name, type, openMode,
			false);
	}

	return attr_open(fd, userPath ? path : NULL, name, openMode, false);
}


status_t
_user_remove_attr(int fd, const char* userName)
{
	char name[B_FILE_NAME_LENGTH];

	if (!IS_USER_ADDRESS(userName)
		|| user_strlcpy(name, userName, B_FILE_NAME_LENGTH) < B_OK)
		return B_BAD_ADDRESS;

	return attr_remove(fd, name, false);
}


status_t
_user_rename_attr(int fromFile, const char* userFromName, int toFile,
	const char* userToName)
{
	if (!IS_USER_ADDRESS(userFromName)
		|| !IS_USER_ADDRESS(userToName))
		return B_BAD_ADDRESS;

	KPath fromNameBuffer(B_FILE_NAME_LENGTH);
	KPath toNameBuffer(B_FILE_NAME_LENGTH);
	if (fromNameBuffer.InitCheck() != B_OK || toNameBuffer.InitCheck() != B_OK)
		return B_NO_MEMORY;

	char* fromName = fromNameBuffer.LockBuffer();
	char* toName = toNameBuffer.LockBuffer();

	if (user_strlcpy(fromName, userFromName, B_FILE_NAME_LENGTH) < B_OK
		|| user_strlcpy(toName, userToName, B_FILE_NAME_LENGTH) < B_OK)
		return B_BAD_ADDRESS;

	return attr_rename(fromFile, fromName, toFile, toName, false);
}


int
_user_open_index_dir(dev_t device)
{
	return index_dir_open(device, false);
}


status_t
_user_create_index(dev_t device, const char* userName, uint32 type,
	uint32 flags)
{
	char name[B_FILE_NAME_LENGTH];

	if (!IS_USER_ADDRESS(userName)
		|| user_strlcpy(name, userName, B_FILE_NAME_LENGTH) < B_OK)
		return B_BAD_ADDRESS;

	return index_create(device, name, type, flags, false);
}


status_t
_user_read_index_stat(dev_t device, const char* userName, struct stat* userStat)
{
	char name[B_FILE_NAME_LENGTH];
	struct stat stat;
	status_t status;

	if (!IS_USER_ADDRESS(userName)
		|| !IS_USER_ADDRESS(userStat)
		|| user_strlcpy(name, userName, B_FILE_NAME_LENGTH) < B_OK)
		return B_BAD_ADDRESS;

	status = index_name_read_stat(device, name, &stat, false);
	if (status == B_OK) {
		if (user_memcpy(userStat, &stat, sizeof(stat)) != B_OK)
			return B_BAD_ADDRESS;
	}

	return status;
}


status_t
_user_remove_index(dev_t device, const char* userName)
{
	char name[B_FILE_NAME_LENGTH];

	if (!IS_USER_ADDRESS(userName)
		|| user_strlcpy(name, userName, B_FILE_NAME_LENGTH) < B_OK)
		return B_BAD_ADDRESS;

	return index_remove(device, name, false);
}


status_t
_user_getcwd(char* userBuffer, size_t size)
{
	if (size == 0)
		return B_BAD_VALUE;
	if (!IS_USER_ADDRESS(userBuffer))
		return B_BAD_ADDRESS;

	if (size > kMaxPathLength)
		size = kMaxPathLength;

	KPath pathBuffer(size);
	if (pathBuffer.InitCheck() != B_OK)
		return B_NO_MEMORY;

	TRACE(("user_getcwd: buf %p, %ld\n", userBuffer, size));

	char* path = pathBuffer.LockBuffer();

	status_t status = get_cwd(path, size, false);
	if (status != B_OK)
		return status;

	// Copy back the result
	if (user_strlcpy(userBuffer, path, size) < B_OK)
		return B_BAD_ADDRESS;

	return status;
}


status_t
_user_setcwd(int fd, const char* userPath)
{
	TRACE(("user_setcwd: path = %p\n", userPath));

	KPath pathBuffer(B_PATH_NAME_LENGTH);
	if (pathBuffer.InitCheck() != B_OK)
		return B_NO_MEMORY;

	char* path = pathBuffer.LockBuffer();

	if (userPath != NULL) {
		if (!IS_USER_ADDRESS(userPath)
			|| user_strlcpy(path, userPath, B_PATH_NAME_LENGTH) < B_OK)
			return B_BAD_ADDRESS;
	}

	return set_cwd(fd, userPath != NULL ? path : NULL, false);
}


status_t
_user_change_root(const char* userPath)
{
	// only root is allowed to chroot()
	if (geteuid() != 0)
		return B_NOT_ALLOWED;

	// alloc path buffer
	KPath pathBuffer(B_PATH_NAME_LENGTH);
	if (pathBuffer.InitCheck() != B_OK)
		return B_NO_MEMORY;

	// copy userland path to kernel
	char* path = pathBuffer.LockBuffer();
	if (userPath != NULL) {
		if (!IS_USER_ADDRESS(userPath)
			|| user_strlcpy(path, userPath, B_PATH_NAME_LENGTH) < B_OK)
			return B_BAD_ADDRESS;
	}

	// get the vnode
	struct vnode* vnode;
	status_t status = path_to_vnode(path, true, &vnode, NULL, false);
	if (status != B_OK)
		return status;

	// set the new root
	struct io_context* context = get_current_io_context(false);
	mutex_lock(&sIOContextRootLock);
	struct vnode* oldRoot = context->root;
	context->root = vnode;
	mutex_unlock(&sIOContextRootLock);

	put_vnode(oldRoot);

	return B_OK;
}


int
_user_open_query(dev_t device, const char* userQuery, size_t queryLength,
	uint32 flags, port_id port, int32 token)
{
	char* query;

	if (device < 0 || userQuery == NULL || queryLength == 0)
		return B_BAD_VALUE;

	// this is a safety restriction
	if (queryLength >= 65536)
		return B_NAME_TOO_LONG;

	query = (char*)malloc(queryLength + 1);
	if (query == NULL)
		return B_NO_MEMORY;
	if (user_strlcpy(query, userQuery, queryLength + 1) < B_OK) {
		free(query);
		return B_BAD_ADDRESS;
	}

	int fd = query_open(device, query, flags, port, token, false);

	free(query);
	return fd;
}
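

/*!	Caller-side sketch of a live query through the public
	fs_open_live_query() API, which funnels into the syscall above; \c device,
	\c port, and \c token are assumed to be set up by the caller, and the
	queried attribute must be indexed on the volume:

	\code
	DIR* query = fs_open_live_query(device, "name == \"*.cpp\"",
		B_LIVE_QUERY, port, token);
	if (query != NULL) {
		struct dirent* entry;
		while ((entry = fs_read_query(query)) != NULL) {
			// ... initial matches; subsequent updates arrive on 'port' ...
		}
		fs_close_query(query);
	}
	\endcode
*/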


#include "vfs_request_io.cpp"
