1/*
2 * Copyright 2001-2020, Axel D��rfler, axeld@pinc-software.de.
3 * This file may be used under the terms of the MIT License.
4 */
5
6
7//! Inode access functions
8
9
10#include "Debug.h"
11#include "Inode.h"
12#include "BPlusTree.h"
13#include "Index.h"
14
15
16#if BFS_TRACING && !defined(FS_SHELL) && !defined(_BOOT_MODE)
17namespace BFSInodeTracing {
18
19class Create : public AbstractTraceEntry {
20public:
21	Create(Inode* inode, Inode* parent, const char* name, int32 mode,
22			int openMode, uint32 type)
23		:
24		fInode(inode),
25		fID(inode->ID()),
26		fParent(parent),
27		fParentID(parent != NULL ? parent->ID() : 0),
28		fMode(mode),
29		fOpenMode(openMode),
30		fType(type)
31	{
32		if (name != NULL)
33			strlcpy(fName, name, sizeof(fName));
34		else
35			fName[0] = '\0';
36
37		Initialized();
38	}
39
40	virtual void AddDump(TraceOutput& out)
41	{
42		out.Print("bfs:Create %lld (%p), parent %lld (%p), \"%s\", "
43			"mode %lx, omode %x, type %lx", fID, fInode, fParentID,
44			fParent, fName, fMode, fOpenMode, fType);
45	}
46
47private:
48	Inode*	fInode;
49	ino_t	fID;
50	Inode*	fParent;
51	ino_t	fParentID;
52	char	fName[32];
53	int32	fMode;
54	int		fOpenMode;
55	uint32	fType;
56};
57
58class Remove : public AbstractTraceEntry {
59public:
60	Remove(Inode* inode, const char* name)
61		:
62		fInode(inode),
63		fID(inode->ID())
64	{
65		strlcpy(fName, name, sizeof(fName));
66		Initialized();
67	}
68
69	virtual void AddDump(TraceOutput& out)
70	{
71		out.Print("bfs:Remove %lld (%p), \"%s\"", fID, fInode, fName);
72	}
73
74private:
75	Inode*	fInode;
76	ino_t	fID;
77	char	fName[32];
78};
79
80class Action : public AbstractTraceEntry {
81public:
82	Action(const char* action, Inode* inode)
83		:
84		fInode(inode),
85		fID(inode->ID())
86	{
87		strlcpy(fAction, action, sizeof(fAction));
88		Initialized();
89	}
90
91	virtual void AddDump(TraceOutput& out)
92	{
93		out.Print("bfs:%s %lld (%p)\n", fAction, fID, fInode);
94	}
95
96private:
97	Inode*	fInode;
98	ino_t	fID;
99	char	fAction[16];
100};
101
102class Resize : public AbstractTraceEntry {
103public:
104	Resize(Inode* inode, off_t oldSize, off_t newSize, bool trim)
105		:
106		fInode(inode),
107		fID(inode->ID()),
108		fOldSize(oldSize),
109		fNewSize(newSize),
110		fTrim(trim)
111	{
112		Initialized();
113	}
114
115	virtual void AddDump(TraceOutput& out)
116	{
117		out.Print("bfs:%s %lld (%p), %lld -> %lld", fTrim ? "Trim" : "Resize",
118			fID, fInode, fOldSize, fNewSize);
119	}
120
121private:
122	Inode*	fInode;
123	ino_t	fID;
124	off_t	fOldSize;
125	off_t	fNewSize;
126	bool	fTrim;
127};
128
129}	// namespace BFSInodeTracing
130
131#	define T(x) new(std::nothrow) BFSInodeTracing::x;
132#else
133#	define T(x) ;
134#endif
135
136
137/*!	A helper class used by Inode::Create() to keep track of the belongings
138	of an inode creation in progress.
139	This class will make sure everything is cleaned up properly.
140*/
141class InodeAllocator {
142public:
143							InodeAllocator(Transaction& transaction);
144							~InodeAllocator();
145
146			status_t		New(block_run* parentRun, mode_t mode, uint32 flags,
147								block_run& run, fs_vnode_ops* vnodeOps,
148								Inode** _inode);
149			status_t		CreateTree();
150			status_t		Keep(fs_vnode_ops* vnodeOps, uint32 publishFlags);
151
152private:
153	static	void			_TransactionListener(int32 id, int32 event,
154								void* _inode);
155
156			Transaction*	fTransaction;
157			block_run		fRun;
158			Inode*			fInode;
159};
160
161
162InodeAllocator::InodeAllocator(Transaction& transaction)
163	:
164	fTransaction(&transaction),
165	fInode(NULL)
166{
167}
168
169
170InodeAllocator::~InodeAllocator()
171{
172	if (fTransaction != NULL) {
173		Volume* volume = fTransaction->GetVolume();
174
175		if (fInode != NULL) {
176			fInode->Node().flags &= ~HOST_ENDIAN_TO_BFS_INT32(INODE_IN_USE);
177				// this unblocks any pending bfs_read_vnode() calls
178			fInode->Free(*fTransaction);
179
180			if (fInode->fTree != NULL)
181				fTransaction->RemoveListener(fInode->fTree);
182			fTransaction->RemoveListener(fInode);
183
184			remove_vnode(volume->FSVolume(), fInode->ID());
185		} else
186			volume->Free(*fTransaction, fRun);
187	}
188
189	delete fInode;
190}
191
192
193status_t
194InodeAllocator::New(block_run* parentRun, mode_t mode, uint32 publishFlags,
195	block_run& run, fs_vnode_ops* vnodeOps, Inode** _inode)
196{
197	Volume* volume = fTransaction->GetVolume();
198
199	status_t status = volume->AllocateForInode(*fTransaction, parentRun, mode,
200		fRun);
201	if (status < B_OK) {
202		// don't free the space in the destructor, because
203		// the allocation failed
204		fTransaction = NULL;
205		RETURN_ERROR(status);
206	}
207
208	run = fRun;
209	fInode = new(std::nothrow) Inode(volume, *fTransaction,
210		volume->ToVnode(run), mode, run);
211	if (fInode == NULL)
212		RETURN_ERROR(B_NO_MEMORY);
213
214	if (!volume->IsInitializing()
215		&& (publishFlags & BFS_DO_NOT_PUBLISH_VNODE) == 0) {
216		status = new_vnode(volume->FSVolume(), fInode->ID(), fInode,
217			vnodeOps != NULL ? vnodeOps : &gBFSVnodeOps);
218		if (status < B_OK) {
219			delete fInode;
220			fInode = NULL;
221			RETURN_ERROR(status);
222		}
223	}
224
225	fInode->WriteLockInTransaction(*fTransaction);
226	*_inode = fInode;
227	return B_OK;
228}
229
230
231status_t
232InodeAllocator::CreateTree()
233{
234	Volume* volume = fTransaction->GetVolume();
235
236	// force S_STR_INDEX to be set, if no type is set
237	if ((fInode->Mode() & S_INDEX_TYPES) == 0)
238		fInode->Node().mode |= HOST_ENDIAN_TO_BFS_INT32(S_STR_INDEX);
239
240	BPlusTree* tree = new(std::nothrow) BPlusTree(*fTransaction, fInode);
241	if (tree == NULL)
242		return B_ERROR;
243
244	status_t status = tree->InitCheck();
245	if (status != B_OK) {
246		delete tree;
247		return status;
248	}
249
250	fInode->fTree = tree;
251
252	if (fInode->IsRegularNode()) {
253		if (tree->Insert(*fTransaction, ".", fInode->ID()) < B_OK
254			|| tree->Insert(*fTransaction, "..",
255					volume->ToVnode(fInode->Parent())) < B_OK)
256			return B_ERROR;
257	}
258	return B_OK;
259}
260
261
262status_t
263InodeAllocator::Keep(fs_vnode_ops* vnodeOps, uint32 publishFlags)
264{
265	ASSERT(fInode != NULL && fTransaction != NULL);
266	Volume* volume = fTransaction->GetVolume();
267
268	status_t status = fInode->WriteBack(*fTransaction);
269	if (status < B_OK) {
270		FATAL(("writing new inode %" B_PRIdINO " failed!\n", fInode->ID()));
271		return status;
272	}
273
274	// Symbolic links are not published -- the caller needs to do this once
275	// the contents have been written.
276	if (!fInode->IsSymLink() && !volume->IsInitializing()
277		&& (publishFlags & BFS_DO_NOT_PUBLISH_VNODE) == 0) {
278		status = publish_vnode(volume->FSVolume(), fInode->ID(), fInode,
279			vnodeOps != NULL ? vnodeOps : &gBFSVnodeOps, fInode->Mode(),
280			publishFlags);
281	}
282
283	if (status == B_OK) {
284		cache_add_transaction_listener(volume->BlockCache(), fTransaction->ID(),
285			TRANSACTION_ABORTED, &_TransactionListener, fInode);
286	}
287
288	fTransaction = NULL;
289	fInode = NULL;
290
291	return status;
292}
293
294
295/*static*/ void
296InodeAllocator::_TransactionListener(int32 id, int32 event, void* _inode)
297{
298	Inode* inode = (Inode*)_inode;
299
300	if (event == TRANSACTION_ABORTED)
301		panic("transaction %d aborted, inode %p still around!\n", (int)id, inode);
302}
303
304
305//	#pragma mark -
306
307
308status_t
309bfs_inode::InitCheck(Volume* volume) const
310{
311	if (Magic1() != INODE_MAGIC1
312		|| !(Flags() & INODE_IN_USE)
313		|| inode_num.Length() != 1
314		// matches inode size?
315		|| (uint32)InodeSize() != volume->InodeSize()
316		// parent resides on disk?
317		|| parent.AllocationGroup() > int32(volume->AllocationGroups())
318		|| parent.AllocationGroup() < 0
319		|| parent.Start() > (1L << volume->AllocationGroupShift())
320		|| parent.Length() != 1
321		// attributes, too?
322		|| attributes.AllocationGroup() > int32(volume->AllocationGroups())
323		|| attributes.AllocationGroup() < 0
324		|| attributes.Start() > (1L << volume->AllocationGroupShift()))
325		RETURN_ERROR(B_BAD_DATA);
326
327	if (Flags() & INODE_DELETED)
328		return B_NOT_ALLOWED;
329
330	// TODO: Add some tests to check the integrity of the other stuff here,
331	// especially for the data_stream!
332
333	return B_OK;
334}
335
336
337//	#pragma mark - Inode
338
339
340Inode::Inode(Volume* volume, ino_t id)
341	:
342	fVolume(volume),
343	fID(id),
344	fTree(NULL),
345	fAttributes(NULL),
346	fCache(NULL),
347	fMap(NULL)
348{
349	PRINT(("Inode::Inode(volume = %p, id = %" B_PRIdINO ") @ %p\n",
350		volume, id, this));
351
352	rw_lock_init(&fLock, "bfs inode");
353	recursive_lock_init(&fSmallDataLock, "bfs inode small data");
354
355	if (UpdateNodeFromDisk() != B_OK) {
356		// TODO: the error code gets eaten
357		return;
358	}
359
360	// these two will help to maintain the indices
361	fOldSize = Size();
362	fOldLastModified = LastModified();
363
364	if (IsContainer())
365		fTree = new(std::nothrow) BPlusTree(this);
366	if (NeedsFileCache()) {
367		SetFileCache(file_cache_create(fVolume->ID(), ID(), Size()));
368		SetMap(file_map_create(volume->ID(), ID(), Size()));
369	}
370}
371
372
373Inode::Inode(Volume* volume, Transaction& transaction, ino_t id, mode_t mode,
374		block_run& run)
375	:
376	fVolume(volume),
377	fID(id),
378	fTree(NULL),
379	fAttributes(NULL),
380	fCache(NULL),
381	fMap(NULL)
382{
383	PRINT(("Inode::Inode(volume = %p, transaction = %p, id = %" B_PRIdINO
384		") @ %p\n", volume, &transaction, id, this));
385
386	rw_lock_init(&fLock, "bfs inode");
387	recursive_lock_init(&fSmallDataLock, "bfs inode small data");
388
389	NodeGetter node(volume);
390	status_t status = node.SetToWritable(transaction, this, true);
391	if (status != B_OK) {
392		FATAL(("Could not read inode block %" B_PRId64 ": %s!\n", BlockNumber(),
393			strerror(status)));
394		return;
395	}
396
397	memset(&fNode, 0, sizeof(bfs_inode));
398
399	// Initialize the bfs_inode structure -- it's not written back to disk
400	// here, because it will be done so already when the inode could be
401	// created completely.
402
403	Node().magic1 = HOST_ENDIAN_TO_BFS_INT32(INODE_MAGIC1);
404	Node().inode_num = run;
405	Node().mode = HOST_ENDIAN_TO_BFS_INT32(mode);
406	Node().flags = HOST_ENDIAN_TO_BFS_INT32(INODE_IN_USE);
407
408	Node().create_time = Node().last_modified_time = Node().status_change_time
409		= HOST_ENDIAN_TO_BFS_INT64(bfs_inode::ToInode(real_time_clock_usecs()));
410
411	Node().inode_size = HOST_ENDIAN_TO_BFS_INT32(volume->InodeSize());
412
413	// these two will help to maintain the indices
414	fOldSize = Size();
415	fOldLastModified = LastModified();
416}
417
418
419Inode::~Inode()
420{
421	PRINT(("Inode::~Inode() @ %p\n", this));
422
423	file_cache_delete(FileCache());
424	file_map_delete(Map());
425	delete fTree;
426
427	rw_lock_destroy(&fLock);
428	recursive_lock_destroy(&fSmallDataLock);
429}
430
431
432status_t
433Inode::InitCheck(bool checkNode) const
434{
435	// test inode magic and flags
436	if (checkNode) {
437		status_t status = Node().InitCheck(fVolume);
438		if (status == B_BUSY)
439			return B_BUSY;
440
441		if (status != B_OK) {
442			FATAL(("inode at block %" B_PRIdOFF " corrupt!\n", BlockNumber()));
443			RETURN_ERROR(B_BAD_DATA);
444		}
445	}
446
447	if (IsContainer()) {
448		// inodes that have a B+tree
449		if (fTree == NULL)
450			RETURN_ERROR(B_NO_MEMORY);
451
452		status_t status = fTree->InitCheck();
453		if (status != B_OK) {
454			FATAL(("inode tree at block %" B_PRIdOFF " corrupt!\n",
455				BlockNumber()));
456			RETURN_ERROR(B_BAD_DATA);
457		}
458	}
459
460	if (NeedsFileCache() && (fCache == NULL || fMap == NULL))
461		return B_NO_MEMORY;
462
463	return B_OK;
464}
465
466
467/*!	Adds this inode to the specified transaction. This means that the inode will
468	be write locked until the transaction ended.
469	To ensure that the inode will stay valid until that point, an extra reference
470	is acquired to it as long as this transaction stays active.
471*/
472void
473Inode::WriteLockInTransaction(Transaction& transaction)
474{
475	// These flags can only change while holding the transaction lock
476	if ((Flags() & INODE_IN_TRANSACTION) != 0)
477		return;
478
479	// We share the same list link with the removed list, so we have to remove
480	// the inode from that list here (and add it back when we no longer need it)
481	if ((Flags() & INODE_DELETED) != 0)
482		fVolume->RemovedInodes().Remove(this);
483
484	if (!fVolume->IsInitializing())
485		acquire_vnode(fVolume->FSVolume(), ID());
486
487	rw_lock_write_lock(&Lock());
488	Node().flags |= HOST_ENDIAN_TO_BFS_INT32(INODE_IN_TRANSACTION);
489
490	transaction.AddListener(this);
491}
492
493
494status_t
495Inode::WriteBack(Transaction& transaction)
496{
497	NodeGetter node(fVolume);
498	status_t status = node.SetToWritable(transaction, this);
499	if (status != B_OK)
500		return status;
501
502	memcpy(node.WritableNode(), &Node(), sizeof(bfs_inode));
503	return B_OK;
504}
505
506
507status_t
508Inode::UpdateNodeFromDisk()
509{
510	NodeGetter node(fVolume);
511	status_t status = node.SetTo(this);
512	if (status != B_OK) {
513		FATAL(("Failed to read block %" B_PRId64 " from disk: %s!\n",
514			BlockNumber(), strerror(status)));
515		return status;
516	}
517
518	memcpy(&fNode, node.Node(), sizeof(bfs_inode));
519	fNode.flags &= HOST_ENDIAN_TO_BFS_INT32(INODE_PERMANENT_FLAGS);
520	return B_OK;
521}
522
523
524status_t
525Inode::CheckPermissions(int accessMode) const
526{
527	// you never have write access to a read-only volume
528	if ((accessMode & W_OK) != 0 && fVolume->IsReadOnly())
529		return B_READ_ONLY_DEVICE;
530
531	return check_access_permissions(accessMode, Mode(), (gid_t)fNode.GroupID(),
532		(uid_t)fNode.UserID());
533}
534
535
536//	#pragma mark - attributes
537
538
539void
540Inode::_AddIterator(AttributeIterator* iterator)
541{
542	RecursiveLocker _(fSmallDataLock);
543	fIterators.Add(iterator);
544}
545
546
547void
548Inode::_RemoveIterator(AttributeIterator* iterator)
549{
550	RecursiveLocker _(fSmallDataLock);
551	fIterators.Remove(iterator);
552}
553
554
555/*!	Tries to free up "bytes" space in the small_data section by moving
556	attributes to real files. Used for system attributes like the name.
557	You need to hold the fSmallDataLock when you call this method
558*/
559status_t
560Inode::_MakeSpaceForSmallData(Transaction& transaction, bfs_inode* node,
561	const char* name, int32 bytes)
562{
563	ASSERT_LOCKED_RECURSIVE(&fSmallDataLock);
564
565	while (bytes > 0) {
566		small_data* item = node->SmallDataStart();
567		small_data* max = NULL;
568		int32 index = 0, maxIndex = 0;
569		for (; !item->IsLast(node); item = item->Next(), index++) {
570			// should not remove those
571			if (*item->Name() == FILE_NAME_NAME || !strcmp(name, item->Name()))
572				continue;
573
574			if (max == NULL || max->Size() < item->Size()) {
575				maxIndex = index;
576				max = item;
577			}
578
579			// Remove the first one large enough to free the needed amount of
580			// bytes
581			if (bytes < (int32)item->Size())
582				break;
583		}
584
585		if (item->IsLast(node) || (int32)item->Size() < bytes || max == NULL)
586			return B_ERROR;
587
588		bytes -= max->Size();
589
590		// Move the attribute to a real attribute file
591		// Luckily, this doesn't cause any index updates
592
593		Inode* attribute;
594		status_t status = CreateAttribute(transaction, item->Name(),
595			item->Type(), &attribute);
596		if (status != B_OK)
597			RETURN_ERROR(status);
598
599		size_t length = item->DataSize();
600		status = attribute->WriteAt(transaction, 0, item->Data(), &length);
601
602		ReleaseAttribute(attribute);
603
604		if (status != B_OK) {
605			Vnode vnode(fVolume, Attributes());
606			Inode* attributes;
607			if (vnode.Get(&attributes) < B_OK
608				|| attributes->Remove(transaction, name) < B_OK) {
609				FATAL(("Could not remove newly created attribute!\n"));
610			}
611
612			RETURN_ERROR(status);
613		}
614
615		_RemoveSmallData(node, max, maxIndex);
616	}
617	return B_OK;
618}
619
620
621/*!	Private function which removes the given attribute from the small_data
622	section.
623	You need to hold the fSmallDataLock when you call this method
624*/
625status_t
626Inode::_RemoveSmallData(bfs_inode* node, small_data* item, int32 index)
627{
628	ASSERT_LOCKED_RECURSIVE(&fSmallDataLock);
629
630	small_data* next = item->Next();
631	if (!next->IsLast(node)) {
632		// find the last attribute
633		small_data* last = next;
634		while (!last->IsLast(node))
635			last = last->Next();
636
637		int32 size = (uint8*)last - (uint8*)next;
638		if (size < 0
639			|| size > (uint8*)node + fVolume->BlockSize() - (uint8*)next)
640			return B_BAD_DATA;
641
642		memmove(item, next, size);
643
644		// Move the "last" one to its new location and
645		// correctly terminate the small_data section
646		last = (small_data*)((uint8*)last - ((uint8*)next - (uint8*)item));
647		memset(last, 0, (uint8*)node + fVolume->BlockSize() - (uint8*)last);
648	} else
649		memset(item, 0, item->Size());
650
651	// update all current iterators
652	SinglyLinkedList<AttributeIterator>::Iterator iterator
653		= fIterators.GetIterator();
654	while (iterator.HasNext()) {
655		iterator.Next()->Update(index, -1);
656	}
657
658	return B_OK;
659}
660
661
662//!	Removes the given attribute from the small_data section.
663status_t
664Inode::_RemoveSmallData(Transaction& transaction, NodeGetter& nodeGetter,
665	const char* name)
666{
667	if (name == NULL)
668		return B_BAD_VALUE;
669
670	bfs_inode* node = nodeGetter.WritableNode();
671	RecursiveLocker locker(fSmallDataLock);
672
673	// search for the small_data item
674
675	small_data* item = node->SmallDataStart();
676	int32 index = 0;
677	while (!item->IsLast(node) && strcmp(item->Name(), name)) {
678		item = item->Next();
679		index++;
680	}
681
682	if (item->IsLast(node))
683		return B_ENTRY_NOT_FOUND;
684
685	status_t status = nodeGetter.MakeWritable(transaction);
686	if (status != B_OK)
687		return status;
688
689	status = _RemoveSmallData(node, item, index);
690	if (status == B_OK) {
691		Node().status_change_time = HOST_ENDIAN_TO_BFS_INT64(
692			bfs_inode::ToInode(real_time_clock_usecs()));
693
694		status = WriteBack(transaction);
695	}
696
697	return status;
698}
699
700
701/*!	Try to place the given attribute in the small_data section - if the
702	new attribute is too big to fit in that section, it returns B_DEVICE_FULL.
703	In that case, the attribute should be written to a real attribute file;
704	it's the caller's responsibility to remove any existing attributes in the
705	small data section if that's the case.
706
707	Note that you need to write back the inode yourself after having called that
708	method - it's a bad API decision that it needs a transaction but enforces
709	you to write back the inode all by yourself, but it's just more efficient
710	in most cases...
711*/
712status_t
713Inode::_AddSmallData(Transaction& transaction, NodeGetter& nodeGetter,
714	const char* name, uint32 type, off_t pos, const uint8* data, size_t length,
715	bool force)
716{
717	bfs_inode* node = nodeGetter.WritableNode();
718
719	if (node == NULL || name == NULL || data == NULL)
720		return B_BAD_VALUE;
721
722	// reject any requests that can't fit into the small_data section
723	uint32 nameLength = strlen(name);
724	uint32 spaceNeeded = sizeof(small_data) + nameLength + 3 + pos + length + 1;
725	if (spaceNeeded > fVolume->InodeSize() - sizeof(bfs_inode))
726		return B_DEVICE_FULL;
727
728	status_t status = nodeGetter.MakeWritable(transaction);
729	if (status != B_OK)
730		return status;
731
732	RecursiveLocker locker(fSmallDataLock);
733
734	// Find the last item or one with the same name we have to add
735	small_data* item = node->SmallDataStart();
736	int32 index = 0;
737	while (!item->IsLast(node) && strcmp(item->Name(), name)) {
738		item = item->Next();
739		index++;
740	}
741
742	// is the attribute already in the small_data section?
743	// then just replace the data part of that one
744	if (!item->IsLast(node)) {
745		// find last attribute
746		small_data* last = item;
747		while (!last->IsLast(node))
748			last = last->Next();
749
750		// try to change the attributes value
751		if (item->data_size > pos + length
752			|| force
753			|| ((uint8*)last + pos + length - item->DataSize())
754					<= ((uint8*)node + fVolume->InodeSize())) {
755			// Make room for the new attribute if needed (and we are forced
756			// to do so)
757			if (force && ((uint8*)last + pos + length - item->DataSize())
758					> ((uint8*)node + fVolume->InodeSize())) {
759				// We also take the free space at the end of the small_data
760				// section into account, and request only what's really needed
761				uint32 needed = pos + length - item->DataSize() -
762					(uint32)((uint8*)node + fVolume->InodeSize()
763						- (uint8*)last);
764
765				if (_MakeSpaceForSmallData(transaction, node, name, needed)
766						!= B_OK)
767					return B_ERROR;
768
769				// reset our pointers
770				item = node->SmallDataStart();
771				index = 0;
772				while (!item->IsLast(node) && strcmp(item->Name(), name)) {
773					item = item->Next();
774					index++;
775				}
776
777				last = item;
778				while (!last->IsLast(node))
779					last = last->Next();
780			}
781
782			size_t oldDataSize = item->DataSize();
783
784			// Normally, we can just overwrite the attribute data as the size
785			// is specified by the type and does not change that often
786			if (pos + length != item->DataSize()) {
787				// move the attributes after the current one
788				small_data* next = item->Next();
789				if (!next->IsLast(node)) {
790					memmove((uint8*)item + spaceNeeded, next,
791						(uint8*)last - (uint8*)next);
792				}
793
794				// Move the "last" one to its new location and
795				// correctly terminate the small_data section
796				last = (small_data*)((uint8*)last
797					- ((uint8*)next - ((uint8*)item + spaceNeeded)));
798				if ((uint8*)last < (uint8*)node + fVolume->BlockSize()) {
799					memset(last, 0, (uint8*)node + fVolume->BlockSize()
800						- (uint8*)last);
801				}
802
803				item->data_size = HOST_ENDIAN_TO_BFS_INT16(pos + length);
804			}
805
806			item->type = HOST_ENDIAN_TO_BFS_INT32(type);
807
808			if ((uint64)oldDataSize < (uint64)pos) {
809				// Fill gap with zeros
810				memset(item->Data() + oldDataSize, 0, pos - oldDataSize);
811			}
812			if (user_memcpy(item->Data() + pos, data, length) < B_OK)
813				return B_BAD_ADDRESS;
814			item->Data()[pos + length] = '\0';
815
816			return B_OK;
817		}
818
819		return B_DEVICE_FULL;
820	}
821
822	// try to add the new attribute!
823
824	if ((uint8*)item + spaceNeeded > (uint8*)node + fVolume->InodeSize()) {
825		// there is not enough space for it!
826		if (!force)
827			return B_DEVICE_FULL;
828
829		// make room for the new attribute
830		if (_MakeSpaceForSmallData(transaction, node, name, spaceNeeded) < B_OK)
831			return B_ERROR;
832
833		// get new last item!
834		item = node->SmallDataStart();
835		index = 0;
836		while (!item->IsLast(node)) {
837			item = item->Next();
838			index++;
839		}
840	}
841
842	memset(item, 0, spaceNeeded);
843	item->type = HOST_ENDIAN_TO_BFS_INT32(type);
844	item->name_size = HOST_ENDIAN_TO_BFS_INT16(nameLength);
845	item->data_size = HOST_ENDIAN_TO_BFS_INT16(length);
846	strcpy(item->Name(), name);
847	if (user_memcpy(item->Data() + pos, data, length) < B_OK)
848		return B_BAD_ADDRESS;
849
850	// correctly terminate the small_data section
851	item = item->Next();
852	if (!item->IsLast(node))
853		memset(item, 0, (uint8*)node + fVolume->InodeSize() - (uint8*)item);
854
855	// update all current iterators
856	SinglyLinkedList<AttributeIterator>::Iterator iterator
857		= fIterators.GetIterator();
858	while (iterator.HasNext()) {
859		iterator.Next()->Update(index, 1);
860	}
861
862	return B_OK;
863}
864
865
866/*!	Iterates through the small_data section of an inode.
867	To start at the beginning of this section, you let smallData
868	point to NULL, like:
869		small_data* data = NULL;
870		while (inode->GetNextSmallData(&data) { ... }
871
872	This function is reentrant and doesn't allocate any memory;
873	you can safely stop calling it at any point (you don't need
874	to iterate through the whole list).
875	You need to hold the fSmallDataLock when you call this method
876*/
877status_t
878Inode::_GetNextSmallData(bfs_inode* node, small_data** _smallData) const
879{
880	if (node == NULL)
881		RETURN_ERROR(B_BAD_VALUE);
882
883	ASSERT_LOCKED_RECURSIVE(&fSmallDataLock);
884
885	small_data* data = *_smallData;
886
887	// begin from the start?
888	if (data == NULL)
889		data = node->SmallDataStart();
890	else
891		data = data->Next();
892
893	// is already last item?
894	if (data->IsLast(node))
895		return B_ENTRY_NOT_FOUND;
896
897	*_smallData = data;
898
899	return B_OK;
900}
901
902
903/*!	Finds the attribute "name" in the small data section, and
904	returns a pointer to it (or NULL if it doesn't exist).
905	You need to hold the fSmallDataLock when you call this method
906*/
907small_data*
908Inode::FindSmallData(const bfs_inode* node, const char* name) const
909{
910	ASSERT_LOCKED_RECURSIVE(&fSmallDataLock);
911
912	small_data* smallData = NULL;
913	while (_GetNextSmallData(const_cast<bfs_inode*>(node), &smallData)
914			== B_OK) {
915		if (!strcmp(smallData->Name(), name))
916			return smallData;
917	}
918	return NULL;
919}
920
921
922/*!	Returns a pointer to the node's name if present in the small data
923	section, NULL otherwise.
924	You need to hold the fSmallDataLock when you call this method
925*/
926const char*
927Inode::Name(const bfs_inode* node) const
928{
929	ASSERT_LOCKED_RECURSIVE(&fSmallDataLock);
930
931	small_data* smallData = NULL;
932	while (_GetNextSmallData((bfs_inode*)node, &smallData) == B_OK) {
933		if (*smallData->Name() == FILE_NAME_NAME
934			&& smallData->NameSize() == FILE_NAME_NAME_LENGTH)
935			return (const char*)smallData->Data();
936	}
937	return NULL;
938}
939
940
941/*!	Copies the node's name into the provided buffer.
942	The buffer should be B_FILE_NAME_LENGTH bytes large.
943*/
944status_t
945Inode::GetName(char* buffer, size_t size) const
946{
947	NodeGetter node(fVolume);
948	status_t status = node.SetTo(this);
949	if (status != B_OK)
950		return status;
951
952	RecursiveLocker locker(fSmallDataLock);
953
954	const char* name = Name(node.Node());
955	if (name == NULL)
956		return B_ENTRY_NOT_FOUND;
957
958	strlcpy(buffer, name, size);
959	return B_OK;
960}
961
962
963/*!	Changes or set the name of a file: in the inode small_data section only, it
964	doesn't change it in the parent directory's b+tree.
965	Note that you need to write back the inode yourself after having called
966	that method. It suffers from the same API decision as AddSmallData() does
967	(and for the same reason).
968*/
969status_t
970Inode::SetName(Transaction& transaction, const char* name)
971{
972	if (name == NULL || *name == '\0')
973		return B_BAD_VALUE;
974
975	NodeGetter node(fVolume);
976	status_t status = node.SetToWritable(transaction, this);
977	if (status != B_OK)
978		return status;
979
980	const char nameTag[2] = {FILE_NAME_NAME, 0};
981
982	return _AddSmallData(transaction, node, nameTag, FILE_NAME_TYPE, 0,
983		(uint8*)name, strlen(name), true);
984}
985
986
987status_t
988Inode::_RemoveAttribute(Transaction& transaction, const char* name,
989	bool hasIndex, Index* index)
990{
991	// remove the attribute file if it exists
992	Vnode vnode(fVolume, Attributes());
993	Inode* attributes;
994	status_t status = vnode.Get(&attributes);
995	if (status < B_OK)
996		return status;
997
998	// update index
999	if (index != NULL) {
1000		Inode* attribute;
1001		if ((hasIndex || fVolume->CheckForLiveQuery(name))
1002			&& GetAttribute(name, &attribute) == B_OK) {
1003			uint8 data[MAX_INDEX_KEY_LENGTH];
1004			size_t length = MAX_INDEX_KEY_LENGTH;
1005			if (attribute->ReadAt(0, data, &length) == B_OK) {
1006				index->Update(transaction, name, attribute->Type(), data,
1007					length, NULL, 0, this);
1008			}
1009
1010			ReleaseAttribute(attribute);
1011		}
1012	}
1013
1014	if ((status = attributes->Remove(transaction, name)) < B_OK)
1015		return status;
1016
1017	if (attributes->IsEmpty()) {
1018		attributes->WriteLockInTransaction(transaction);
1019
1020		// remove attribute directory (don't fail if that can't be done)
1021		if (remove_vnode(fVolume->FSVolume(), attributes->ID()) == B_OK) {
1022			// update the inode, so that no one will ever doubt it's deleted :-)
1023			attributes->Node().flags |= HOST_ENDIAN_TO_BFS_INT32(INODE_DELETED);
1024			if (attributes->WriteBack(transaction) == B_OK) {
1025				Attributes().SetTo(0, 0, 0);
1026				WriteBack(transaction);
1027			} else {
1028				unremove_vnode(fVolume->FSVolume(), attributes->ID());
1029				attributes->Node().flags
1030					&= ~HOST_ENDIAN_TO_BFS_INT32(INODE_DELETED);
1031			}
1032		}
1033	}
1034
1035	return status;
1036}
1037
1038
1039/*!	Reads data from the specified attribute.
1040	This is a high-level attribute function that understands attributes
1041	in the small_data section as well as real attribute files.
1042*/
1043status_t
1044Inode::ReadAttribute(const char* name, int32 type, off_t pos, uint8* buffer,
1045	size_t* _length)
1046{
1047	if (pos < 0)
1048		pos = 0;
1049
1050	// search in the small_data section (which has to be locked first)
1051	{
1052		NodeGetter node(fVolume);
1053		status_t status = node.SetTo(this);
1054		if (status != B_OK)
1055			return status;
1056
1057		RecursiveLocker locker(fSmallDataLock);
1058
1059		small_data* smallData = FindSmallData(node.Node(), name);
1060		if (smallData != NULL) {
1061			size_t length = *_length;
1062			if (pos >= smallData->data_size) {
1063				*_length = 0;
1064				return B_OK;
1065			}
1066			if (length + pos > smallData->DataSize())
1067				length = smallData->DataSize() - pos;
1068
1069			status_t error = user_memcpy(buffer, smallData->Data() + pos,
1070				length);
1071			*_length = length;
1072			return error;
1073		}
1074	}
1075
1076	// search in the attribute directory
1077	Inode* attribute;
1078	status_t status = GetAttribute(name, &attribute);
1079	if (status == B_OK) {
1080		status = attribute->ReadAt(pos, (uint8*)buffer, _length);
1081
1082		ReleaseAttribute(attribute);
1083	}
1084
1085	RETURN_ERROR(status);
1086}
1087
1088
1089/*!	Writes data to the specified attribute.
1090	This is a high-level attribute function that understands attributes
1091	in the small_data section as well as real attribute files.
1092*/
1093status_t
1094Inode::WriteAttribute(Transaction& transaction, const char* name, int32 type,
1095	off_t pos, const uint8* buffer, size_t* _length, bool* _created)
1096{
1097	if (pos < 0)
1098		return B_BAD_VALUE;
1099
1100	// needed to maintain the index
1101	uint8 oldBuffer[MAX_INDEX_KEY_LENGTH];
1102	uint8* oldData = NULL;
1103	size_t oldLength = 0;
1104	bool created = false;
1105
1106	// TODO: we actually depend on that the contents of "buffer" are constant.
1107	// If they get changed during the write (hey, user programs), we may mess
1108	// up our index trees!
1109	// TODO: for attribute files, we need to log the first
1110	// MAX_INDEX_KEY_LENGTH bytes of the data stream, or the same as above
1111	// might happen.
1112
1113	Index index(fVolume);
1114	bool hasIndex = index.SetTo(name) == B_OK;
1115
1116	Inode* attribute = NULL;
1117	status_t status = B_OK;
1118
1119	if (GetAttribute(name, &attribute) != B_OK) {
1120		// No attribute inode exists yet
1121
1122		// save the old attribute data
1123		NodeGetter node(fVolume);
1124		status = node.SetToWritable(transaction, this);
1125		if (status != B_OK)
1126			return status;
1127
1128		recursive_lock_lock(&fSmallDataLock);
1129
1130		small_data* smallData = FindSmallData(node.Node(), name);
1131		if (smallData != NULL) {
1132			oldLength = smallData->DataSize();
1133			if (oldLength > 0) {
1134				if (oldLength > MAX_INDEX_KEY_LENGTH)
1135					oldLength = MAX_INDEX_KEY_LENGTH;
1136				memcpy(oldData = oldBuffer, smallData->Data(), oldLength);
1137			}
1138		} else
1139			created = true;
1140
1141		recursive_lock_unlock(&fSmallDataLock);
1142
1143		// if the attribute doesn't exist yet (as a file), try to put it in the
1144		// small_data section first - if that fails (due to insufficent space),
1145		// create a real attribute file
1146		status = _AddSmallData(transaction, node, name, type, pos, buffer,
1147			*_length);
1148		if (status == B_DEVICE_FULL) {
1149			if (smallData != NULL) {
1150				// remove the old attribute from the small data section - there
1151				// is no space left for the new data
1152				status = _RemoveSmallData(transaction, node, name);
1153			} else
1154				status = B_OK;
1155
1156			if (status == B_OK)
1157				status = CreateAttribute(transaction, name, type, &attribute);
1158			if (status != B_OK)
1159				RETURN_ERROR(status);
1160
1161			created = true;
1162		} else if (status == B_OK) {
1163			// Update status time on attribute write
1164			Node().status_change_time = HOST_ENDIAN_TO_BFS_INT64(
1165				bfs_inode::ToInode(real_time_clock_usecs()));
1166
1167			status = WriteBack(transaction);
1168		}
1169	}
1170
1171	if (attribute != NULL) {
1172		WriteLocker writeLocker(attribute->fLock);
1173
1174		if (hasIndex || fVolume->CheckForLiveQuery(name)) {
1175			// Save the old attribute data (if this fails, oldLength will
1176			// reflect it)
1177			while (attribute->Size() > 0) {
1178				bigtime_t oldModified = attribute->LastModified();
1179				writeLocker.Unlock();
1180
1181				oldLength = MAX_INDEX_KEY_LENGTH;
1182				if (attribute->ReadAt(0, oldBuffer, &oldLength) == B_OK)
1183					oldData = oldBuffer;
1184
1185				writeLocker.Lock();
1186
1187				// Read until the data hasn't changed in between
1188				if (oldModified == attribute->LastModified())
1189					break;
1190
1191				oldLength = 0;
1192			}
1193		}
1194
1195		// check if the data fits into the small_data section again
1196		NodeGetter node(fVolume);
1197		status = node.SetToWritable(transaction, this);
1198		if (status != B_OK)
1199			return status;
1200
1201		status = _AddSmallData(transaction, node, name, type, pos, buffer,
1202			*_length);
1203
1204		if (status == B_OK) {
1205			// it does - remove its file
1206			writeLocker.Unlock();
1207			status = _RemoveAttribute(transaction, name, false, NULL);
1208		} else {
1209			// The attribute type might have been changed - we need to
1210			// adopt the new one
1211			attribute->Node().type = HOST_ENDIAN_TO_BFS_INT32(type);
1212			status = attribute->WriteBack(transaction);
1213			writeLocker.Unlock();
1214
1215			if (status == B_OK) {
1216				status = attribute->WriteAt(transaction, pos, buffer,
1217					_length);
1218			}
1219		}
1220
1221		if (status == B_OK) {
1222			// Update status time on attribute write
1223			Node().status_change_time = HOST_ENDIAN_TO_BFS_INT64(
1224				bfs_inode::ToInode(real_time_clock_usecs()));
1225
1226			status = WriteBack(transaction);
1227		}
1228
1229		attribute->WriteLockInTransaction(transaction);
1230		ReleaseAttribute(attribute);
1231	}
1232
1233	// TODO: find a better way than this "pos" thing (the begin of the old key
1234	//	must be copied to the start of the new one for a comparison)
1235	if (status == B_OK && pos == 0) {
1236		// Index only the first MAX_INDEX_KEY_LENGTH bytes
1237		uint16 length = *_length;
1238		if (length > MAX_INDEX_KEY_LENGTH)
1239			length = MAX_INDEX_KEY_LENGTH;
1240
1241		uint8 indexBuffer[MAX_INDEX_KEY_LENGTH];
1242		// _AddSmallData() already read the buffer
1243		user_memcpy(indexBuffer, buffer, length);
1244
1245		// Update index. Note, Index::Update() may be called even if
1246		// initializing the index failed - it will just update the live
1247		// queries in this case
1248		if (pos < length || (uint64)pos < (uint64)oldLength) {
1249			index.Update(transaction, name, type, oldData, oldLength,
1250				indexBuffer, length, this);
1251		}
1252	}
1253
1254	if (_created != NULL)
1255		*_created = created;
1256
1257	return status;
1258}
1259
1260
1261/*!	Removes the specified attribute from the inode.
1262	This is a high-level attribute function that understands attributes
1263	in the small_data section as well as real attribute files.
1264*/
1265status_t
1266Inode::RemoveAttribute(Transaction& transaction, const char* name)
1267{
1268	Index index(fVolume);
1269	bool hasIndex = index.SetTo(name) == B_OK;
1270	NodeGetter node(fVolume);
1271	status_t status = node.SetTo(this);
1272	if (status != B_OK)
1273		return status;
1274
1275	// update index for attributes in the small_data section
1276	{
1277		RecursiveLocker _(fSmallDataLock);
1278
1279		small_data* smallData = FindSmallData(node.Node(), name);
1280		if (smallData != NULL) {
1281			uint32 length = smallData->DataSize();
1282			if (length > MAX_INDEX_KEY_LENGTH)
1283				length = MAX_INDEX_KEY_LENGTH;
1284			index.Update(transaction, name, smallData->Type(),
1285				smallData->Data(), length, NULL, 0, this);
1286		}
1287	}
1288
1289	status = _RemoveSmallData(transaction, node, name);
1290	if (status == B_ENTRY_NOT_FOUND && !Attributes().IsZero()) {
1291		// remove the attribute file if it exists
1292		status = _RemoveAttribute(transaction, name, hasIndex, &index);
1293		if (status == B_OK) {
1294			Node().status_change_time = HOST_ENDIAN_TO_BFS_INT64(
1295				bfs_inode::ToInode(real_time_clock_usecs()));
1296			WriteBack(transaction);
1297		}
1298	}
1299
1300	return status;
1301}
1302
1303
1304/*!	Returns the attribute inode with the specified \a name, in case it exists.
1305	This method can only return real attribute files; the attributes in the
1306	small data section are ignored.
1307*/
1308status_t
1309Inode::GetAttribute(const char* name, Inode** _attribute)
1310{
1311	// does this inode even have attributes?
1312	if (Attributes().IsZero())
1313		return B_ENTRY_NOT_FOUND;
1314
1315	Vnode vnode(fVolume, Attributes());
1316	Inode* attributes;
1317	if (vnode.Get(&attributes) < B_OK) {
1318		FATAL(("get_vnode() failed in Inode::GetAttribute(name = \"%s\")\n",
1319			name));
1320		return B_ERROR;
1321	}
1322
1323	BPlusTree* tree = attributes->Tree();
1324	if (tree == NULL)
1325		return B_BAD_VALUE;
1326
1327	InodeReadLocker locker(attributes);
1328
1329	ino_t id;
1330	status_t status = tree->Find((uint8*)name, (uint16)strlen(name), &id);
1331	if (status == B_OK) {
1332		Vnode vnode(fVolume, id);
1333		Inode* inode;
1334		// Check if the attribute is really an attribute
1335		if (vnode.Get(&inode) != B_OK || !inode->IsAttribute())
1336			return B_ERROR;
1337
1338		*_attribute = inode;
1339		vnode.Keep();
1340		return B_OK;
1341	}
1342
1343	return status;
1344}
1345
1346
1347void
1348Inode::ReleaseAttribute(Inode* attribute)
1349{
1350	if (attribute == NULL)
1351		return;
1352
1353	put_vnode(fVolume->FSVolume(), attribute->ID());
1354}
1355
1356
1357status_t
1358Inode::CreateAttribute(Transaction& transaction, const char* name, uint32 type,
1359	Inode** attribute)
1360{
1361	// do we need to create the attribute directory first?
1362	if (Attributes().IsZero()) {
1363		status_t status = Inode::Create(transaction, this, NULL,
1364			S_ATTR_DIR | S_DIRECTORY | 0666, 0, 0, NULL);
1365		if (status < B_OK)
1366			RETURN_ERROR(status);
1367	}
1368	Vnode vnode(fVolume, Attributes());
1369	Inode* attributes;
1370	if (vnode.Get(&attributes) < B_OK)
1371		return B_ERROR;
1372
1373	// Inode::Create() locks the inode for us
1374	return Inode::Create(transaction, attributes, name,
1375		S_ATTR | S_FILE | 0666, 0, type, NULL, NULL, attribute);
1376}
1377
1378
1379//	#pragma mark - directory tree
1380
1381
1382bool
1383Inode::IsEmpty()
1384{
1385	TreeIterator iterator(fTree);
1386
1387	uint32 count = 0;
1388	char name[MAX_INDEX_KEY_LENGTH + 1];
1389	uint16 length;
1390	ino_t id;
1391	while (iterator.GetNextEntry(name, &length, MAX_INDEX_KEY_LENGTH + 1,
1392			&id) == B_OK) {
1393		if ((Mode() & (S_ATTR_DIR | S_INDEX_DIR)) != 0)
1394			return false;
1395
1396		// Unlike index and attribute directories, directories
1397		// for standard files always contain ".", and "..", so
1398		// we need to ignore those two
1399		if (++count > 2 || (strcmp(".", name) != 0 && strcmp("..", name) != 0))
1400			return false;
1401	}
1402	return true;
1403}
1404
1405
1406status_t
1407Inode::ContainerContentsChanged(Transaction& transaction)
1408{
1409	ASSERT(!InLastModifiedIndex());
1410
1411	Node().last_modified_time = Node().status_change_time
1412		= HOST_ENDIAN_TO_BFS_INT64(bfs_inode::ToInode(real_time_clock_usecs()));
1413
1414	return WriteBack(transaction);
1415}
1416
1417
1418//	#pragma mark - data stream
1419
1420
1421/*!	Computes the number of bytes this inode occupies in the file system.
1422	This includes the file meta blocks used for maintaining its data stream.
1423
1424	TODO: However, the attributes in extra files are not really accounted for;
1425	depending on the speed penalty, this should be changed, though (the value
1426	could be cached in the inode structure or Inode object, though).
1427*/
1428off_t
1429Inode::AllocatedSize() const
1430{
1431	if (IsSymLink() && (Flags() & INODE_LONG_SYMLINK) == 0) {
1432		// This symlink does not have a data stream
1433		return Node().InodeSize();
1434	}
1435
1436	const data_stream& data = Node().data;
1437	uint32 blockSize = fVolume->BlockSize();
1438	off_t size = blockSize;
1439
1440	if (data.MaxDoubleIndirectRange() != 0) {
1441		off_t doubleIndirectSize = data.MaxDoubleIndirectRange()
1442			- data.MaxIndirectRange();
1443		int32 indirectSize = double_indirect_max_indirect_size(
1444			data.double_indirect.Length(), fVolume->BlockSize());
1445
1446		size += (2 * data.double_indirect.Length()
1447				+ doubleIndirectSize / indirectSize)
1448			* blockSize + data.MaxDoubleIndirectRange();
1449	} else if (data.MaxIndirectRange() != 0)
1450		size += data.indirect.Length() + data.MaxIndirectRange();
1451	else
1452		size += data.MaxDirectRange();
1453
1454	if (!Node().attributes.IsZero()) {
1455		// TODO: to make this exact, we'd had to count all attributes
1456		size += 2 * blockSize;
1457			// 2 blocks, one for the attributes inode, one for its B+tree
1458	}
1459
1460	return size;
1461}
1462
1463
1464/*!	Finds the block_run where "pos" is located in the data_stream of
1465	the inode.
1466	If successful, "offset" will then be set to the file offset
1467	of the block_run returned; so "pos - offset" is for the block_run
1468	what "pos" is for the whole stream.
1469	The caller has to make sure that "pos" is inside the stream.
1470*/
1471status_t
1472Inode::FindBlockRun(off_t pos, block_run& run, off_t& offset)
1473{
1474	data_stream* data = &Node().data;
1475
1476	// find matching block run
1477
1478	if (data->MaxIndirectRange() > 0 && pos >= data->MaxDirectRange()) {
1479		if (data->MaxDoubleIndirectRange() > 0
1480			&& pos >= data->MaxIndirectRange()) {
1481			// access to double indirect blocks
1482
1483			CachedBlock cached(fVolume);
1484
1485			int32 runsPerBlock;
1486			int32 directSize;
1487			int32 indirectSize;
1488			get_double_indirect_sizes(data->double_indirect.Length(),
1489				fVolume->BlockSize(), runsPerBlock, directSize, indirectSize);
1490			if (directSize <= 0 || indirectSize <= 0)
1491				RETURN_ERROR(B_BAD_DATA);
1492
1493			off_t start = pos - data->MaxIndirectRange();
1494			int32 index = start / indirectSize;
1495
1496			status_t status = cached.SetTo(fVolume->ToBlock(
1497				data->double_indirect) + index / runsPerBlock);
1498			if (status != B_OK)
1499				RETURN_ERROR(status);
1500
1501			block_run* indirect = (block_run*)cached.Block();
1502			int32 current = (start % indirectSize) / directSize;
1503
1504			status = cached.SetTo(fVolume->ToBlock(indirect[
1505				index % runsPerBlock]) + current / runsPerBlock);
1506			if (status != B_OK)
1507				RETURN_ERROR(status);
1508
1509			indirect = (block_run*)cached.Block();
1510			run = indirect[current % runsPerBlock];
1511			if (run.Length() != data->double_indirect.Length())
1512				RETURN_ERROR(B_BAD_DATA);
1513
1514			offset = data->MaxIndirectRange() + (index * indirectSize)
1515				+ (current * directSize);
1516		} else {
1517			// access to indirect blocks
1518
1519			int32 runsPerBlock = fVolume->BlockSize() / sizeof(block_run);
1520			off_t runBlockEnd = data->MaxDirectRange();
1521
1522			CachedBlock cached(fVolume);
1523			off_t block = fVolume->ToBlock(data->indirect);
1524
1525			for (int32 i = 0; i < data->indirect.Length(); i++) {
1526				status_t status = cached.SetTo(block + i);
1527				if (status != B_OK)
1528					RETURN_ERROR(status);
1529
1530				block_run* indirect = (block_run*)cached.Block();
1531				int32 current = -1;
1532				while (++current < runsPerBlock) {
1533					if (indirect[current].IsZero())
1534						break;
1535
1536					runBlockEnd += (uint32)indirect[current].Length()
1537						<< cached.BlockShift();
1538					if (runBlockEnd > pos) {
1539						run = indirect[current];
1540						offset = runBlockEnd - ((uint32)run.Length()
1541							<< cached.BlockShift());
1542						return fVolume->ValidateBlockRun(run);
1543					}
1544				}
1545			}
1546			RETURN_ERROR(B_ERROR);
1547		}
1548	} else {
1549		// access from direct blocks
1550
1551		off_t runBlockEnd = 0LL;
1552		int32 current = -1;
1553
1554		while (++current < NUM_DIRECT_BLOCKS) {
1555			if (data->direct[current].IsZero())
1556				break;
1557
1558			runBlockEnd += (uint32)data->direct[current].Length()
1559				<< fVolume->BlockShift();
1560			if (runBlockEnd > pos) {
1561				run = data->direct[current];
1562				offset = runBlockEnd
1563					- ((uint32)run.Length() << fVolume->BlockShift());
1564				return fVolume->ValidateBlockRun(run);
1565			}
1566		}
1567
1568		return B_ENTRY_NOT_FOUND;
1569	}
1570	return fVolume->ValidateBlockRun(run);
1571}
1572
1573
1574status_t
1575Inode::ReadAt(off_t pos, uint8* buffer, size_t* _length)
1576{
1577	return file_cache_read(FileCache(), NULL, pos, buffer, _length);
1578}
1579
1580
1581status_t
1582Inode::WriteAt(Transaction& transaction, off_t pos, const uint8* buffer,
1583	size_t* _length)
1584{
1585	InodeReadLocker locker(this);
1586
1587	// update the last modification time in memory, it will be written
1588	// back to the inode, and the index when the file is closed
1589	// TODO: should update the internal last modified time only at this point!
1590	Node().last_modified_time = Node().status_change_time
1591		= HOST_ENDIAN_TO_BFS_INT64(bfs_inode::ToInode(real_time_clock_usecs()));
1592
1593	// TODO: support INODE_LOGGED!
1594
1595	size_t length = *_length;
1596	bool changeSize = (uint64)pos + (uint64)length > (uint64)Size();
1597
1598	// set/check boundaries for pos/length
1599	if (pos < 0)
1600		return B_BAD_VALUE;
1601
1602	locker.Unlock();
1603
1604	// the transaction doesn't have to be started already
1605	if (changeSize && !transaction.IsStarted())
1606		transaction.Start(fVolume, BlockNumber());
1607
1608	WriteLocker writeLocker(fLock);
1609
1610	// Work around possible race condition: Someone might have shrunken the file
1611	// while we had no lock.
1612	if (!transaction.IsStarted()
1613		&& (uint64)pos + (uint64)length > (uint64)Size()) {
1614		writeLocker.Unlock();
1615		transaction.Start(fVolume, BlockNumber());
1616		writeLocker.Lock();
1617	}
1618
1619	off_t oldSize = Size();
1620
1621	if ((uint64)pos + (uint64)length > (uint64)oldSize) {
1622		// let's grow the data stream to the size needed
1623		status_t status = SetFileSize(transaction, pos + length);
1624		if (status != B_OK) {
1625			*_length = 0;
1626			WriteLockInTransaction(transaction);
1627			RETURN_ERROR(status);
1628		}
1629		// TODO: In theory we would need to update the file size
1630		// index here as part of the current transaction - this might just
1631		// be a bit too expensive, but worth a try.
1632
1633		// we need to write back the inode here because it has to
1634		// go into this transaction (we cannot wait until the file
1635		// is closed)
1636		status = WriteBack(transaction);
1637		if (status != B_OK) {
1638			WriteLockInTransaction(transaction);
1639			return status;
1640		}
1641	}
1642
1643	writeLocker.Unlock();
1644
1645	if (oldSize < pos)
1646		FillGapWithZeros(oldSize, pos);
1647
1648	// If we don't want to write anything, we can now return (we may
1649	// just have changed the file size using the position parameter)
1650	if (length == 0)
1651		return B_OK;
1652
1653	status_t status = file_cache_write(FileCache(), NULL, pos, buffer, _length);
1654
1655	if (transaction.IsStarted())
1656		WriteLockInTransaction(transaction);
1657
1658	return status;
1659}
1660
1661
1662/*!	Fills the gap between the old file size and the new file size
1663	with zeros.
1664	It's more or less a copy of Inode::WriteAt() but it can handle
1665	length differences of more than just 4 GB, and it never uses
1666	the log, even if the INODE_LOGGED flag is set.
1667*/
1668status_t
1669Inode::FillGapWithZeros(off_t pos, off_t newSize)
1670{
1671	while (pos < newSize) {
1672		size_t size;
1673		if (newSize > pos + 1024 * 1024 * 1024)
1674			size = 1024 * 1024 * 1024;
1675		else
1676			size = newSize - pos;
1677
1678		status_t status = file_cache_write(FileCache(), NULL, pos, NULL, &size);
1679		if (status < B_OK)
1680			return status;
1681
1682		pos += size;
1683	}
1684
1685	return B_OK;
1686}
1687
1688
1689/*!	Allocates \a length blocks, and clears their contents. Growing
1690	the indirect and double indirect range uses this method.
1691	The allocated block_run is saved in "run"
1692*/
1693status_t
1694Inode::_AllocateBlockArray(Transaction& transaction, block_run& run,
1695	size_t length, bool variableSize)
1696{
1697	if (!run.IsZero())
1698		return B_BAD_VALUE;
1699
1700	status_t status = fVolume->Allocate(transaction, this, length, run,
1701		variableSize ? 1 : length);
1702	if (status != B_OK)
1703		return status;
1704
1705	// make sure those blocks are empty
1706	CachedBlock cached(fVolume);
1707	off_t block = fVolume->ToBlock(run);
1708
1709	for (int32 i = 0; i < run.Length(); i++) {
1710		status = cached.SetToWritable(transaction, block + i, true);
1711		if (status != B_OK)
1712			return status;
1713	}
1714	return B_OK;
1715}
1716
1717
1718/*!	Grows the stream to \a size, and fills the direct/indirect/double indirect
1719	ranges with the runs.
1720	This method will also determine the size of the preallocation, if any.
1721*/
1722status_t
1723Inode::_GrowStream(Transaction& transaction, off_t size)
1724{
1725	data_stream* data = &Node().data;
1726
1727	// is the data stream already large enough to hold the new size?
1728	// (can be the case with preallocated blocks)
1729	if (size < data->MaxDirectRange()
1730		|| size < data->MaxIndirectRange()
1731		|| size < data->MaxDoubleIndirectRange()) {
1732		data->size = HOST_ENDIAN_TO_BFS_INT64(size);
1733		return B_OK;
1734	}
1735
1736	// how many bytes are still needed? (unused ranges are always zero)
1737	uint16 minimum = 1;
1738	off_t bytes;
1739	if (data->Size() < data->MaxDoubleIndirectRange()) {
1740		bytes = size - data->MaxDoubleIndirectRange();
1741		// The double indirect range can only handle multiples of
1742		// its base length
1743		minimum = data->double_indirect.Length();
1744	} else if (data->Size() < data->MaxIndirectRange())
1745		bytes = size - data->MaxIndirectRange();
1746	else if (data->Size() < data->MaxDirectRange())
1747		bytes = size - data->MaxDirectRange();
1748	else {
1749		// no preallocation left to be used
1750		bytes = size - data->Size();
1751		if (data->MaxDoubleIndirectRange() > 0)
1752			minimum = data->double_indirect.Length();
1753	}
1754
1755	// do we have enough free blocks on the disk?
1756	off_t blocksNeeded = (bytes + fVolume->BlockSize() - 1)
1757		>> fVolume->BlockShift();
1758	if (blocksNeeded > fVolume->FreeBlocks())
1759		return B_DEVICE_FULL;
1760
1761	off_t blocksRequested = blocksNeeded;
1762		// because of preallocations and partial allocations, the number of
1763		// blocks we need to allocate may be different from the one we request
1764		// from the block allocator
1765
1766	// Should we preallocate some blocks?
1767	// Attributes, attribute directories, and long symlinks usually won't get
1768	// that big, and should stay close to the inode - preallocating could be
1769	// counterproductive.
1770	// Also, if free disk space is tight, don't preallocate.
1771	if (!IsAttribute() && !IsAttributeDirectory() && !IsSymLink()
1772		&& fVolume->FreeBlocks() > 128) {
1773		off_t roundTo = 0;
1774		if (IsFile()) {
1775			// Request preallocated blocks depending on the file size and growth
1776			if (size < 1 * 1024 * 1024 && bytes < 512 * 1024) {
1777				// Preallocate 64 KB for file sizes <1 MB and grow rates <512 KB
1778				roundTo = 65536 >> fVolume->BlockShift();
1779			} else if (size < 32 * 1024 * 1024 && bytes <= 1 * 1024 * 1024) {
1780				// Preallocate 512 KB for file sizes between 1 MB and 32 MB, and
1781				// grow rates smaller than 1 MB
1782				roundTo = (512 * 1024) >> fVolume->BlockShift();
1783			} else {
1784				// Preallocate 1/16 of the file size (ie. 4 MB for 64 MB,
1785				// 64 MB for 1 GB)
1786				roundTo = size >> (fVolume->BlockShift() + 4);
1787			}
1788		} else if (IsIndex()) {
1789			// Always preallocate 64 KB for index directories
1790			roundTo = 65536 >> fVolume->BlockShift();
1791		} else {
1792			// Preallocate only 4 KB - directories only get trimmed when their
1793			// vnode is flushed, which might not happen very often.
1794			roundTo = 4096 >> fVolume->BlockShift();
1795		}
1796		if (roundTo > 1) {
1797			// Round to next "roundTo" block count
1798			blocksRequested = ((blocksNeeded + roundTo) / roundTo) * roundTo;
1799		}
1800	}
1801
1802	while (blocksNeeded > 0) {
1803		// the requested blocks do not need to be returned with a
1804		// single allocation, so we need to iterate until we have
1805		// enough blocks allocated
1806		if (minimum > 1) {
1807			// make sure that "blocks" is a multiple of minimum
1808			blocksRequested = round_up(blocksRequested, minimum);
1809		}
1810
1811		block_run run;
1812		status_t status = fVolume->Allocate(transaction, this, blocksRequested,
1813			run, minimum);
1814		if (status != B_OK)
1815			return status;
1816
1817		// okay, we have the needed blocks, so just distribute them to the
1818		// different ranges of the stream (direct, indirect & double indirect)
1819
1820		blocksNeeded -= run.Length();
1821		// don't preallocate if the first allocation was already too small
1822		blocksRequested = blocksNeeded;
1823
1824		// Direct block range
1825
1826		if (data->Size() <= data->MaxDirectRange()) {
1827			// let's try to put them into the direct block range
1828			int32 free = 0;
1829			for (; free < NUM_DIRECT_BLOCKS; free++) {
1830				if (data->direct[free].IsZero())
1831					break;
1832			}
1833
1834			if (free < NUM_DIRECT_BLOCKS) {
1835				// can we merge the last allocated run with the new one?
1836				int32 last = free - 1;
1837				if (free > 0 && data->direct[last].MergeableWith(run)) {
1838					data->direct[last].length = HOST_ENDIAN_TO_BFS_INT16(
1839						data->direct[last].Length() + run.Length());
1840				} else
1841					data->direct[free] = run;
1842
1843				data->max_direct_range = HOST_ENDIAN_TO_BFS_INT64(
1844					data->MaxDirectRange()
1845					+ run.Length() * fVolume->BlockSize());
1846				data->size = HOST_ENDIAN_TO_BFS_INT64(blocksNeeded > 0
1847					? data->max_direct_range : size);
1848				continue;
1849			}
1850		}
1851
1852		// Indirect block range
1853
1854		if (data->Size() <= data->MaxIndirectRange()
1855			|| !data->MaxIndirectRange()) {
1856			CachedBlock cached(fVolume);
1857			block_run* runs = NULL;
1858			uint32 free = 0;
1859			off_t block;
1860
1861			// if there is no indirect block yet, create one
1862			if (data->indirect.IsZero()) {
1863				status = _AllocateBlockArray(transaction, data->indirect,
1864					NUM_ARRAY_BLOCKS, true);
1865				if (status != B_OK)
1866					return status;
1867
1868				data->max_indirect_range = HOST_ENDIAN_TO_BFS_INT64(
1869					data->MaxDirectRange());
1870				// insert the block_run in the first block
1871				status = cached.SetTo(data->indirect);
1872				if (status != B_OK)
1873					return status;
1874
1875				runs = (block_run*)cached.Block();
1876			} else {
1877				uint32 numberOfRuns = fVolume->BlockSize() / sizeof(block_run);
1878				block = fVolume->ToBlock(data->indirect);
1879
1880				// search first empty entry
1881				int32 i = 0;
1882				for (; i < data->indirect.Length(); i++) {
1883					status = cached.SetTo(block + i);
1884					if (status != B_OK)
1885						return status;
1886
1887					runs = (block_run*)cached.Block();
1888					for (free = 0; free < numberOfRuns; free++)
1889						if (runs[free].IsZero())
1890							break;
1891
1892					if (free < numberOfRuns)
1893						break;
1894				}
1895				if (i == data->indirect.Length())
1896					runs = NULL;
1897			}
1898
1899			if (runs != NULL) {
1900				// try to insert the run to the last one - note that this
1901				// doesn't take block borders into account, so it could be
1902				// further optimized
1903				cached.MakeWritable(transaction);
1904
1905				int32 last = free - 1;
1906				if (free > 0 && runs[last].MergeableWith(run)) {
1907					runs[last].length = HOST_ENDIAN_TO_BFS_INT16(
1908						runs[last].Length() + run.Length());
1909				} else
1910					runs[free] = run;
1911
1912				data->max_indirect_range = HOST_ENDIAN_TO_BFS_INT64(
1913					data->MaxIndirectRange()
1914					+ ((uint32)run.Length() << fVolume->BlockShift()));
1915				data->size = HOST_ENDIAN_TO_BFS_INT64(blocksNeeded > 0
1916					? data->MaxIndirectRange() : size);
1917				continue;
1918			}
1919		}
1920
1921		// Double indirect block range
1922
1923		if (data->Size() <= data->MaxDoubleIndirectRange()
1924			|| !data->max_double_indirect_range) {
1925			// We make sure here that we have this minimum allocated, so if
1926			// the allocation succeeds, we don't run into an endless loop.
1927			if (!data->max_double_indirect_range)
1928				minimum = _DoubleIndirectBlockLength();
1929			else
1930				minimum = data->double_indirect.Length();
1931
1932			if ((run.Length() % minimum) != 0) {
1933				// The number of allocated blocks isn't a multiple of 'minimum',
1934				// so we have to change this. This can happen the first time the
1935				// stream grows into the double indirect range.
1936				// First, free the remaining blocks that don't fit into this
1937				// multiple.
1938				int32 rest = run.Length() % minimum;
1939				run.length = HOST_ENDIAN_TO_BFS_INT16(run.Length() - rest);
1940
1941				status = fVolume->Free(transaction,
1942					block_run::Run(run.AllocationGroup(),
1943					run.Start() + run.Length(), rest));
1944				if (status != B_OK)
1945					return status;
1946
1947				blocksNeeded += rest;
1948				blocksRequested = round_up(blocksNeeded, minimum);
1949
1950				// Are there any blocks left in the run? If not, allocate
1951				// a new one
1952				if (run.length == 0)
1953					continue;
1954			}
1955
1956			// if there is no double indirect block yet, create one
1957			if (data->double_indirect.IsZero()) {
1958				status = _AllocateBlockArray(transaction,
1959					data->double_indirect, _DoubleIndirectBlockLength());
1960				if (status != B_OK)
1961					return status;
1962
1963				data->max_double_indirect_range = data->max_indirect_range;
1964			}
1965
1966			// calculate the index where to insert the new blocks
1967
1968			int32 runsPerBlock;
1969			int32 directSize;
1970			int32 indirectSize;
1971			get_double_indirect_sizes(data->double_indirect.Length(),
1972				fVolume->BlockSize(), runsPerBlock, directSize, indirectSize);
1973			if (directSize <= 0 || indirectSize <= 0)
1974				return B_BAD_DATA;
1975
1976			off_t start = data->MaxDoubleIndirectRange()
1977				- data->MaxIndirectRange();
1978			int32 indirectIndex = start / indirectSize;
1979			int32 index = (start % indirectSize) / directSize;
1980			int32 runsPerArray = runsPerBlock * minimum;
1981
1982			// distribute the blocks to the array and allocate
1983			// new array blocks when needed
1984
1985			CachedBlock cached(fVolume);
1986			CachedBlock cachedDirect(fVolume);
1987			block_run* array = NULL;
1988			uint32 runLength = run.Length();
1989
1990			while (run.length != 0) {
1991				// get the indirect array block
1992				if (array == NULL) {
1993					uint32 block = indirectIndex / runsPerBlock;
1994					if (block >= minimum)
1995						return EFBIG;
1996
1997					status = cached.SetTo(fVolume->ToBlock(
1998						data->double_indirect) + block);
1999					if (status != B_OK)
2000						return status;
2001
2002					array = (block_run*)cached.Block();
2003				}
2004
2005				do {
2006					// do we need a new array block?
2007					if (array[indirectIndex % runsPerBlock].IsZero()) {
2008						cached.MakeWritable(transaction);
2009
2010						status = _AllocateBlockArray(transaction,
2011							array[indirectIndex % runsPerBlock],
2012							data->double_indirect.Length());
2013						if (status != B_OK)
2014							return status;
2015					}
2016
2017					status = cachedDirect.SetToWritable(transaction,
2018						fVolume->ToBlock(array[indirectIndex
2019							% runsPerBlock]) + index / runsPerBlock);
2020					if (status != B_OK)
2021						return status;
2022
2023					block_run* runs = (block_run*)cachedDirect.Block();
2024
2025					do {
2026						// insert the block_run into the array
2027						runs[index % runsPerBlock] = run;
2028						runs[index % runsPerBlock].length
2029							= HOST_ENDIAN_TO_BFS_INT16(minimum);
2030
2031						// alter the remaining block_run
2032						run.start = HOST_ENDIAN_TO_BFS_INT16(run.Start()
2033							+ minimum);
2034						run.length = HOST_ENDIAN_TO_BFS_INT16(run.Length()
2035							- minimum);
2036					} while ((++index % runsPerBlock) != 0 && run.length);
2037				} while ((index % runsPerArray) != 0 && run.length);
2038
2039				if (index == runsPerArray)
2040					index = 0;
2041				if (++indirectIndex % runsPerBlock == 0) {
2042					array = NULL;
2043					index = 0;
2044				}
2045			}
2046
2047			data->max_double_indirect_range = HOST_ENDIAN_TO_BFS_INT64(
2048				data->MaxDoubleIndirectRange()
2049				+ (runLength << fVolume->BlockShift()));
2050			data->size = blocksNeeded > 0 ? HOST_ENDIAN_TO_BFS_INT64(
2051				data->max_double_indirect_range) : size;
2052
2053			continue;
2054		}
2055
2056		RETURN_ERROR(EFBIG);
2057	}
2058	// update the size of the data stream
2059	data->size = HOST_ENDIAN_TO_BFS_INT64(size);
2060
2061	return B_OK;
2062}
2063
2064
2065size_t
2066Inode::_DoubleIndirectBlockLength() const
2067{
2068	if (fVolume->BlockSize() > DOUBLE_INDIRECT_ARRAY_SIZE)
2069		return 1;
2070
2071	return DOUBLE_INDIRECT_ARRAY_SIZE / fVolume->BlockSize();
2072}
2073
2074
2075/*!	Frees the statically sized array of the double indirect part of a data
2076	stream.
2077*/
2078status_t
2079Inode::_FreeStaticStreamArray(Transaction& transaction, int32 level,
2080	block_run run, off_t size, off_t offset, off_t& max)
2081{
2082	int32 indirectSize;
2083	if (level == 0) {
2084		indirectSize = double_indirect_max_indirect_size(run.Length(),
2085			fVolume->BlockSize());
2086	} else {
2087		indirectSize = double_indirect_max_direct_size(run.Length(),
2088			fVolume->BlockSize());
2089	}
2090	if (indirectSize <= 0)
2091		return B_BAD_DATA;
2092
2093	off_t start;
2094	if (size > offset)
2095		start = size - offset;
2096	else
2097		start = 0;
2098
2099	int32 index = start / indirectSize;
2100	int32 runsPerBlock = fVolume->BlockSize() / sizeof(block_run);
2101
2102	CachedBlock cached(fVolume);
2103	off_t blockNumber = fVolume->ToBlock(run);
2104
2105	// set the file offset to the current block run
2106	offset += (off_t)index * indirectSize;
2107
2108	for (int32 i = index / runsPerBlock; i < run.Length(); i++) {
2109		status_t status = cached.SetToWritable(transaction, blockNumber + i);
2110		if (status != B_OK)
2111			RETURN_ERROR(status);
2112
2113		block_run* array = (block_run*)cached.WritableBlock();
2114
2115		for (index = index % runsPerBlock; index < runsPerBlock; index++) {
2116			if (array[index].IsZero()) {
2117				// we also want to break out of the outer loop
2118				i = run.Length();
2119				break;
2120			}
2121
2122			status_t status = B_OK;
2123			if (level == 0) {
2124				status = _FreeStaticStreamArray(transaction, 1, array[index],
2125					size, offset, max);
2126			} else if (offset >= size)
2127				status = fVolume->Free(transaction, array[index]);
2128			else
2129				max = HOST_ENDIAN_TO_BFS_INT64(offset + indirectSize);
2130
2131			if (status < B_OK)
2132				RETURN_ERROR(status);
2133
2134			if (offset >= size)
2135				array[index].SetTo(0, 0, 0);
2136
2137			offset += indirectSize;
2138		}
2139		index = 0;
2140	}
2141	return B_OK;
2142}
2143
2144
2145/*!	Frees all block_runs in the array which come after the specified size.
2146	It also trims the last block_run that contain the size.
2147	"offset" and "max" are maintained until the last block_run that doesn't
2148	have to be freed - after this, the values won't be correct anymore, but
2149	will still assure correct function for all subsequent calls.
2150	"max" is considered to be in file system byte order.
2151*/
2152status_t
2153Inode::_FreeStreamArray(Transaction& transaction, block_run* array,
2154	uint32 arrayLength, off_t size, off_t& offset, off_t& max)
2155{
2156	PRINT(("FreeStreamArray: arrayLength %" B_PRId32 ", size %" B_PRIdOFF
2157		", offset %" B_PRIdOFF ", max %" B_PRIdOFF "\n", arrayLength, size,
2158		offset, max));
2159
2160	off_t newOffset = offset;
2161	uint32 i = 0;
2162	for (; i < arrayLength; i++, offset = newOffset) {
2163		if (array[i].IsZero())
2164			break;
2165
2166		newOffset += (off_t)array[i].Length() << fVolume->BlockShift();
2167		if (newOffset <= size)
2168			continue;
2169
2170		block_run run = array[i];
2171
2172		// determine the block_run to be freed
2173		if (newOffset > size && offset < size) {
2174			// free partial block_run (and update the original block_run)
2175			run.start = HOST_ENDIAN_TO_BFS_INT16(array[i].Start()
2176				+ ((size + fVolume->BlockSize() - 1 - offset)
2177					>> fVolume->BlockShift()));
2178				// round to next block
2179			array[i].length = HOST_ENDIAN_TO_BFS_INT16(run.Start()
2180				- array[i].Start());
2181			run.length = HOST_ENDIAN_TO_BFS_INT16(run.Length()
2182				- array[i].Length());
2183
2184			if (run.length == 0)
2185				continue;
2186
2187			// update maximum range
2188			max = HOST_ENDIAN_TO_BFS_INT64(offset + ((off_t)array[i].Length()
2189				<< fVolume->BlockShift()));
2190		} else {
2191			// free the whole block_run
2192			array[i].SetTo(0, 0, 0);
2193
2194			if ((off_t)BFS_ENDIAN_TO_HOST_INT64(max) > offset)
2195				max = HOST_ENDIAN_TO_BFS_INT64(offset);
2196		}
2197
2198		if (fVolume->Free(transaction, run) < B_OK)
2199			return B_IO_ERROR;
2200	}
2201	return B_OK;
2202}
2203
2204
2205status_t
2206Inode::_ShrinkStream(Transaction& transaction, off_t size)
2207{
2208	data_stream* data = &Node().data;
2209	status_t status;
2210
2211	if (data->MaxDoubleIndirectRange() > size) {
2212		off_t* maxDoubleIndirect = &data->max_double_indirect_range;
2213			// gcc 4 work-around: "error: cannot bind packed field
2214			// 'data->data_stream::max_double_indirect_range' to 'off_t&'"
2215		status = _FreeStaticStreamArray(transaction, 0, data->double_indirect,
2216			size, data->MaxIndirectRange(), *maxDoubleIndirect);
2217		if (status != B_OK)
2218			return status;
2219
2220		if (size <= data->MaxIndirectRange()) {
2221			fVolume->Free(transaction, data->double_indirect);
2222			data->double_indirect.SetTo(0, 0, 0);
2223			data->max_double_indirect_range = 0;
2224		}
2225	}
2226
2227	if (data->MaxIndirectRange() > size) {
2228		CachedBlock cached(fVolume);
2229		off_t block = fVolume->ToBlock(data->indirect);
2230		off_t offset = data->MaxDirectRange();
2231
2232		for (int32 i = 0; i < data->indirect.Length(); i++) {
2233			status = cached.SetToWritable(transaction, block + i);
2234			if (status != B_OK)
2235				return status;
2236
2237			block_run* array = (block_run*)cached.WritableBlock();
2238
2239			off_t* maxIndirect = &data->max_indirect_range;
2240				// gcc 4 work-around: "error: cannot bind packed field
2241				// 'data->data_stream::max_indirect_range' to 'off_t&'"
2242			if (_FreeStreamArray(transaction, array, fVolume->BlockSize()
2243					/ sizeof(block_run), size, offset, *maxIndirect) != B_OK)
2244				return B_IO_ERROR;
2245		}
2246		if (data->max_direct_range == data->max_indirect_range) {
2247			fVolume->Free(transaction, data->indirect);
2248			data->indirect.SetTo(0, 0, 0);
2249			data->max_indirect_range = 0;
2250		}
2251	}
2252
2253	if (data->MaxDirectRange() > size) {
2254		off_t offset = 0;
2255		off_t *maxDirect = &data->max_direct_range;
2256			// gcc 4 work-around: "error: cannot bind packed field
2257			// 'data->data_stream::max_direct_range' to 'off_t&'"
2258		status = _FreeStreamArray(transaction, data->direct, NUM_DIRECT_BLOCKS,
2259			size, offset, *maxDirect);
2260		if (status != B_OK)
2261			return status;
2262	}
2263
2264	data->size = HOST_ENDIAN_TO_BFS_INT64(size);
2265	return B_OK;
2266}
2267
2268
2269status_t
2270Inode::SetFileSize(Transaction& transaction, off_t size)
2271{
2272	if (size < 0)
2273		return B_BAD_VALUE;
2274
2275	off_t oldSize = Size();
2276
2277	if (size == oldSize)
2278		return B_OK;
2279
2280	T(Resize(this, oldSize, size, false));
2281
2282	// should the data stream grow or shrink?
2283	status_t status;
2284	if (size > oldSize) {
2285		status = _GrowStream(transaction, size);
2286		if (status < B_OK) {
2287			// if the growing of the stream fails, the whole operation
2288			// fails, so we should shrink the stream to its former size
2289			_ShrinkStream(transaction, oldSize);
2290		}
2291	} else
2292		status = _ShrinkStream(transaction, size);
2293
2294	if (status < B_OK)
2295		return status;
2296
2297	file_cache_set_size(FileCache(), size);
2298	file_map_set_size(Map(), size);
2299
2300	return WriteBack(transaction);
2301}
2302
2303
2304status_t
2305Inode::Append(Transaction& transaction, off_t bytes)
2306{
2307	return SetFileSize(transaction, Size() + bytes);
2308}
2309
2310
2311/*!	Checks whether or not this inode's data stream needs to be trimmed
2312	because of an earlier preallocation.
2313	Returns true if there are any blocks to be trimmed.
2314*/
2315bool
2316Inode::NeedsTrimming() const
2317{
2318	// We never trim preallocated index blocks to make them grow as smooth as
2319	// possible. There are only few indices anyway, so this doesn't hurt.
2320	// Also, if an inode is already in deleted state, we don't bother trimming
2321	// it.
2322	if (IsIndex() || IsDeleted()
2323		|| (IsSymLink() && (Flags() & INODE_LONG_SYMLINK) == 0))
2324		return false;
2325
2326	off_t roundedSize = round_up(Size(), fVolume->BlockSize());
2327
2328	return Node().data.MaxDirectRange() > roundedSize
2329		|| Node().data.MaxIndirectRange() > roundedSize
2330		|| Node().data.MaxDoubleIndirectRange() > roundedSize;
2331}
2332
2333
2334status_t
2335Inode::TrimPreallocation(Transaction& transaction)
2336{
2337	T(Resize(this, max_c(Node().data.MaxDirectRange(),
2338		Node().data.MaxIndirectRange()), Size(), true));
2339
2340	status_t status = _ShrinkStream(transaction, Size());
2341	if (status < B_OK)
2342		return status;
2343
2344	return WriteBack(transaction);
2345}
2346
2347
2348//!	Frees the file's data stream and removes all attributes
2349status_t
2350Inode::Free(Transaction& transaction)
2351{
2352	FUNCTION();
2353
2354	// Perhaps there should be an implementation of Inode::ShrinkStream() that
2355	// just frees the data_stream, but doesn't change the inode (since it is
2356	// freed anyway) - that would make an undelete command possible
2357	if (!IsSymLink() || (Flags() & INODE_LONG_SYMLINK) != 0) {
2358		status_t status = SetFileSize(transaction, 0);
2359		if (status < B_OK)
2360			return status;
2361	}
2362
2363	// Free all attributes, and remove their indices
2364	{
2365		// We have to limit the scope of AttributeIterator, so that its
2366		// destructor is not called after the inode is deleted
2367		AttributeIterator iterator(this);
2368
2369		char name[B_FILE_NAME_LENGTH];
2370		uint32 type;
2371		size_t length;
2372		ino_t id;
2373		while (iterator.GetNext(name, &length, &type, &id) == B_OK) {
2374			RemoveAttribute(transaction, name);
2375		}
2376	}
2377
2378	if (WriteBack(transaction) < B_OK)
2379		return B_IO_ERROR;
2380
2381	return fVolume->Free(transaction, BlockRun());
2382}
2383
2384
2385//!	Synchronizes (writes back to disk) the file stream of the inode.
2386status_t
2387Inode::Sync()
2388{
2389	if (FileCache())
2390		return file_cache_sync(FileCache());
2391
2392	// We may also want to flush the attribute's data stream to
2393	// disk here... (do we?)
2394
2395	if (IsSymLink() && (Flags() & INODE_LONG_SYMLINK) == 0)
2396		return B_OK;
2397
2398	InodeReadLocker locker(this);
2399
2400	data_stream* data = &Node().data;
2401	status_t status = B_OK;
2402
2403	// flush direct range
2404
2405	for (int32 i = 0; i < NUM_DIRECT_BLOCKS; i++) {
2406		if (data->direct[i].IsZero())
2407			return B_OK;
2408
2409		status = block_cache_sync_etc(fVolume->BlockCache(),
2410			fVolume->ToBlock(data->direct[i]), data->direct[i].Length());
2411		if (status != B_OK)
2412			return status;
2413	}
2414
2415	// flush indirect range
2416
2417	if (data->max_indirect_range == 0)
2418		return B_OK;
2419
2420	CachedBlock cached(fVolume);
2421	off_t block = fVolume->ToBlock(data->indirect);
2422	int32 count = fVolume->BlockSize() / sizeof(block_run);
2423
2424	for (int32 j = 0; j < data->indirect.Length(); j++) {
2425		status = cached.SetTo(block + j);
2426		if (status != B_OK)
2427			return status;
2428
2429		block_run* runs = (block_run*)cached.Block();
2430
2431		for (int32 i = 0; i < count; i++) {
2432			if (runs[i].IsZero())
2433				return B_OK;
2434
2435			status = block_cache_sync_etc(fVolume->BlockCache(),
2436				fVolume->ToBlock(runs[i]), runs[i].Length());
2437			if (status != B_OK)
2438				return status;
2439		}
2440	}
2441
2442	// flush double indirect range
2443
2444	if (data->max_double_indirect_range == 0)
2445		return B_OK;
2446
2447	off_t indirectBlock = fVolume->ToBlock(data->double_indirect);
2448
2449	for (int32 l = 0; l < data->double_indirect.Length(); l++) {
2450		status = cached.SetTo(indirectBlock + l);
2451		if (status != B_OK)
2452			return status;
2453
2454		block_run* indirectRuns = (block_run*)cached.Block();
2455		CachedBlock directCached(fVolume);
2456
2457		for (int32 k = 0; k < count; k++) {
2458			if (indirectRuns[k].IsZero())
2459				return B_OK;
2460
2461			block = fVolume->ToBlock(indirectRuns[k]);
2462			for (int32 j = 0; j < indirectRuns[k].Length(); j++) {
2463				status = directCached.SetTo(block + j);
2464				if (status != B_OK)
2465					return status;
2466
2467				block_run* runs = (block_run*)directCached.Block();
2468
2469				for (int32 i = 0; i < count; i++) {
2470					if (runs[i].IsZero())
2471						return B_OK;
2472
2473					// TODO: combine single block_runs to bigger ones when
2474					// they are adjacent
2475					status = block_cache_sync_etc(fVolume->BlockCache(),
2476						fVolume->ToBlock(runs[i]), runs[i].Length());
2477					if (status != B_OK)
2478						return status;
2479				}
2480			}
2481		}
2482	}
2483	return B_OK;
2484}
2485
2486
2487//	#pragma mark - TransactionListener implementation
2488
2489
2490void
2491Inode::TransactionDone(bool success)
2492{
2493	if (!success) {
2494		// Revert any changes made to the cached bfs_inode
2495		// TODO: return code gets eaten
2496		UpdateNodeFromDisk();
2497	}
2498}
2499
2500
2501void
2502Inode::RemovedFromTransaction()
2503{
2504	Node().flags &= ~HOST_ENDIAN_TO_BFS_INT32(INODE_IN_TRANSACTION);
2505
2506	// See AddInode() why we do this here
2507	if ((Flags() & INODE_DELETED) != 0)
2508		fVolume->RemovedInodes().Add(this);
2509
2510	rw_lock_write_unlock(&Lock());
2511
2512	if (!fVolume->IsInitializing())
2513		put_vnode(fVolume->FSVolume(), ID());
2514}
2515
2516
2517//	#pragma mark - creation/deletion
2518
2519
2520status_t
2521Inode::Remove(Transaction& transaction, const char* name, ino_t* _id,
2522	bool isDirectory, bool force)
2523{
2524	if (fTree == NULL)
2525		RETURN_ERROR(B_BAD_VALUE);
2526
2527	WriteLockInTransaction(transaction);
2528
2529	// does the file even exist?
2530	off_t id;
2531	if (fTree->Find((uint8*)name, (uint16)strlen(name), &id) < B_OK)
2532		return B_ENTRY_NOT_FOUND;
2533
2534	if (_id)
2535		*_id = id;
2536
2537	Vnode vnode(fVolume, id);
2538	Inode* inode;
2539	status_t status = vnode.Get(&inode);
2540	if (status < B_OK) {
2541		REPORT_ERROR(status);
2542		return fTree->Remove(transaction, name, id);
2543	}
2544
2545	T(Remove(inode, name));
2546	inode->WriteLockInTransaction(transaction);
2547
2548	// Inode::IsContainer() is true also for indices (furthermore, the S_IFDIR
2549	// bit is set for indices in BFS, not for attribute directories) - but you
2550	// should really be able to do whatever you want with your indices
2551	// without having to remove all files first :)
2552	if (!inode->IsIndex() && !force) {
2553		// if it's not of the correct type, don't delete it!
2554		if (inode->IsContainer() != isDirectory)
2555			return isDirectory ? B_NOT_A_DIRECTORY : B_IS_A_DIRECTORY;
2556
2557		// only delete empty directories
2558		if (isDirectory && !inode->IsEmpty())
2559			return B_DIRECTORY_NOT_EMPTY;
2560	}
2561
2562	// remove_vnode() allows the inode to be accessed until the last put_vnode()
2563	status = remove_vnode(fVolume->FSVolume(), id);
2564	if (status != B_OK)
2565		return status;
2566
2567	if (fTree->Remove(transaction, name, id) != B_OK && !force) {
2568		unremove_vnode(fVolume->FSVolume(), id);
2569		RETURN_ERROR(B_ERROR);
2570	}
2571
2572#ifdef DEBUG
2573	if (fTree->Find((uint8*)name, (uint16)strlen(name), &id) == B_OK) {
2574		DIE(("deleted entry still there"));
2575	}
2576#endif
2577
2578	ContainerContentsChanged(transaction);
2579
2580	// update the inode, so that no one will ever doubt it's deleted :-)
2581	inode->Node().flags |= HOST_ENDIAN_TO_BFS_INT32(INODE_DELETED);
2582	inode->Node().flags &= ~HOST_ENDIAN_TO_BFS_INT32(INODE_IN_USE);
2583
2584	// In balance to the Inode::Create() method, the main indices
2585	// are updated here (name, size, & last_modified)
2586
2587	Index index(fVolume);
2588	if (inode->InNameIndex()) {
2589		index.RemoveName(transaction, name, inode);
2590			// If removing from the index fails, it is not regarded as a
2591			// fatal error and will not be reported back!
2592			// Deleted inodes won't be visible in queries anyway.
2593	}
2594
2595	if (inode->InSizeIndex())
2596		index.RemoveSize(transaction, inode);
2597	if (inode->InLastModifiedIndex())
2598		index.RemoveLastModified(transaction, inode);
2599
2600	return inode->WriteBack(transaction);
2601}
2602
2603
2604/*!	Creates the inode with the specified \a parent directory, and automatically
2605	adds the created inode to that parent directory. If an attribute directory
2606	is created, it will also automatically  be added to the \a parent inode as
2607	such. However, the indices root node, and the regular root node won't be
2608	added to the superblock.
2609	It will also create the initial B+tree for the inode if it's a directory
2610	of any kind.
2611	\a name may be \c NULL, but only if no \a parent is given.
2612	If the "_id" or "_inode" variable is given and non-NULL to store the
2613	inode's ID, the inode stays locked - you have to call put_vnode() if you
2614	don't use it anymore.
2615	If the node already exists, this method will fail if \c O_EXCL is set, or
2616	it's a directory or a symlink. Otherwise, it will just be returned.
2617	If \c O_TRUNC has been specified, the file will also be truncated.
2618*/
2619status_t
2620Inode::Create(Transaction& transaction, Inode* parent, const char* name,
2621	int32 mode, int openMode, uint32 type, bool* _created, ino_t* _id,
2622	Inode** _inode, fs_vnode_ops* vnodeOps, uint32 publishFlags)
2623{
2624	FUNCTION_START(("name = %s, mode = %" B_PRId32 "\n", name, mode));
2625
2626	block_run parentRun = parent ? parent->BlockRun() : block_run::Run(0, 0, 0);
2627	Volume* volume = transaction.GetVolume();
2628	BPlusTree* tree = NULL;
2629
2630	if (parent != NULL && (mode & S_ATTR_DIR) == 0 && parent->IsContainer()) {
2631		// check if the file already exists in the directory
2632		tree = parent->Tree();
2633	}
2634
2635	if (parent != NULL) {
2636		// the parent directory is locked during the whole inode creation
2637		parent->WriteLockInTransaction(transaction);
2638	}
2639
2640	if (parent != NULL && !volume->IsInitializing() && parent->IsContainer()) {
2641		// don't create anything in removed directories
2642		bool removed;
2643		if (get_vnode_removed(volume->FSVolume(), parent->ID(), &removed)
2644				== B_OK && removed) {
2645			RETURN_ERROR(B_ENTRY_NOT_FOUND);
2646		}
2647	}
2648
2649	if (tree != NULL) {
2650		// Does the file already exist?
2651		off_t offset;
2652		if (tree->Find((uint8*)name, (uint16)strlen(name), &offset) == B_OK) {
2653			// Return if the file should be a directory/link or opened in
2654			// exclusive mode
2655			if (S_ISDIR(mode) || S_ISLNK(mode) || (openMode & O_EXCL) != 0)
2656				return B_FILE_EXISTS;
2657
2658			Vnode vnode(volume, offset);
2659			Inode* inode;
2660			status_t status = vnode.Get(&inode);
2661			if (status != B_OK) {
2662				REPORT_ERROR(status);
2663				return B_ENTRY_NOT_FOUND;
2664			}
2665
2666			if (inode->IsDirectory() && (openMode & O_RWMASK) != O_RDONLY)
2667				return B_IS_A_DIRECTORY;
2668			if ((openMode & O_DIRECTORY) != 0 && !inode->IsDirectory())
2669				return B_NOT_A_DIRECTORY;
2670
2671			// we want to open the file, so we should have the rights to do so
2672			if (inode->CheckPermissions(open_mode_to_access(openMode)
2673					| ((openMode & O_TRUNC) != 0 ? W_OK : 0)) != B_OK)
2674				return B_NOT_ALLOWED;
2675
2676			if ((openMode & O_TRUNC) != 0) {
2677				// truncate the existing file
2678				inode->WriteLockInTransaction(transaction);
2679
2680				status_t status = inode->SetFileSize(transaction, 0);
2681				if (status == B_OK)
2682					status = inode->WriteBack(transaction);
2683
2684				if (status != B_OK)
2685					return status;
2686			}
2687
2688			if (_created)
2689				*_created = false;
2690			if (_id)
2691				*_id = inode->ID();
2692			if (_inode)
2693				*_inode = inode;
2694
2695			// Only keep the vnode in memory if the _id or _inode pointer is
2696			// provided
2697			if (_id != NULL || _inode != NULL)
2698				vnode.Keep();
2699
2700			return B_OK;
2701		}
2702	} else if (parent != NULL && (mode & S_ATTR_DIR) == 0) {
2703		return B_BAD_VALUE;
2704	} else if ((openMode & O_DIRECTORY) != 0) {
2705		// TODO: we might need to return B_NOT_A_DIRECTORY here
2706		return B_ENTRY_NOT_FOUND;
2707	}
2708
2709	status_t status;
2710
2711	// do we have the power to create new files at all?
2712	if (parent != NULL && (status = parent->CheckPermissions(W_OK)) != B_OK)
2713		RETURN_ERROR(status);
2714
2715	// allocate space for the new inode
2716	InodeAllocator allocator(transaction);
2717	block_run run;
2718	Inode* inode;
2719	status = allocator.New(&parentRun, mode, publishFlags, run, vnodeOps,
2720		&inode);
2721	if (status < B_OK)
2722		return status;
2723
2724	T(Create(inode, parent, name, mode, openMode, type));
2725
2726	// Initialize the parts of the bfs_inode structure that
2727	// InodeAllocator::New() hasn't touched yet
2728
2729	bfs_inode* node = &inode->Node();
2730
2731	if (parent == NULL) {
2732		// we set the parent to itself in this case
2733		// (only happens for the root and indices node)
2734		node->parent = run;
2735	} else
2736		node->parent = parentRun;
2737
2738	node->uid = HOST_ENDIAN_TO_BFS_INT32(geteuid());
2739	node->gid = HOST_ENDIAN_TO_BFS_INT32(parent
2740		? parent->Node().GroupID() : getegid());
2741		// the group ID is inherited from the parent, if available
2742
2743	node->type = HOST_ENDIAN_TO_BFS_INT32(type);
2744
2745	inode->WriteBack(transaction);
2746		// make sure the initialized node is available to others
2747
2748	// only add the name to regular files, directories, or symlinks
2749	// don't add it to attributes, or indices
2750	if (tree && inode->IsRegularNode()
2751		&& inode->SetName(transaction, name) != B_OK)
2752		return B_ERROR;
2753
2754	// Initialize b+tree if it's a directory (and add "." & ".." if it's
2755	// a standard directory for files - not for attributes or indices)
2756	if (inode->IsContainer()) {
2757		status = allocator.CreateTree();
2758		if (status != B_OK)
2759			return status;
2760	}
2761
2762	// Add a link to the inode from the parent, depending on its type
2763	// (the vnode is not published yet, so it is safe to make the inode
2764	// accessable to the file system here)
2765	if (tree != NULL) {
2766		status = tree->Insert(transaction, name, inode->ID());
2767	} else if (parent != NULL && (mode & S_ATTR_DIR) != 0) {
2768		parent->Attributes() = run;
2769		status = parent->WriteBack(transaction);
2770	}
2771
2772	// Note, we only care if the inode could be made accessable for the
2773	// two cases above; the root node or the indices root node must
2774	// handle this case on their own (or other cases where "parent" is
2775	// NULL)
2776	if (status != B_OK)
2777		RETURN_ERROR(status);
2778
2779	// Update the main indices (name, size & last_modified)
2780	// (live queries might want to access us after this)
2781
2782	Index index(volume);
2783	if (inode->InNameIndex() && name != NULL) {
2784		// the name index only contains regular files
2785		// (but not the root node where name == NULL)
2786		status = index.InsertName(transaction, name, inode);
2787		if (status != B_OK && status != B_BAD_INDEX) {
2788			// We have to remove the node from the parent at this point,
2789			// because the InodeAllocator destructor can't handle this
2790			// case (and if it fails, we can't do anything about it...)
2791			if (tree)
2792				tree->Remove(transaction, name, inode->ID());
2793			else if (parent != NULL && (mode & S_ATTR_DIR) != 0)
2794				parent->Node().attributes.SetTo(0, 0, 0);
2795
2796			RETURN_ERROR(status);
2797		}
2798	}
2799
2800	if (parent != NULL && parent->IsContainer())
2801		parent->ContainerContentsChanged(transaction);
2802
2803	inode->UpdateOldLastModified();
2804
2805	// The "size" & "last_modified" indices don't contain directories.
2806	// If adding to these indices fails, the inode creation will not be
2807	// harmed; they are considered less important than the "name" index.
2808	if (inode->InSizeIndex())
2809		index.InsertSize(transaction, inode);
2810	if (inode->InLastModifiedIndex())
2811		index.InsertLastModified(transaction, inode);
2812
2813	if (inode->NeedsFileCache()) {
2814		inode->SetFileCache(file_cache_create(volume->ID(), inode->ID(),
2815			inode->Size()));
2816		inode->SetMap(file_map_create(volume->ID(), inode->ID(),
2817			inode->Size()));
2818
2819		if (inode->FileCache() == NULL || inode->Map() == NULL)
2820			return B_NO_MEMORY;
2821	}
2822
2823	// Everything worked well until this point, we have a fully
2824	// initialized inode, and we want to keep it
2825	allocator.Keep(vnodeOps, publishFlags);
2826
2827	if (_created)
2828		*_created = true;
2829	if (_id != NULL)
2830		*_id = inode->ID();
2831	if (_inode != NULL)
2832		*_inode = inode;
2833
2834	// if either _id or _inode is passed, we will keep the inode locked
2835	if (_id == NULL && _inode == NULL)
2836		put_vnode(volume->FSVolume(), inode->ID());
2837
2838	return B_OK;
2839}
2840
2841
2842//	#pragma mark - AttributeIterator
2843
2844
2845AttributeIterator::AttributeIterator(Inode* inode)
2846	:
2847	fCurrentSmallData(0),
2848	fInode(inode),
2849	fAttributes(NULL),
2850	fIterator(NULL),
2851	fBuffer(NULL)
2852{
2853	inode->_AddIterator(this);
2854}
2855
2856
2857AttributeIterator::~AttributeIterator()
2858{
2859	if (fAttributes)
2860		put_vnode(fAttributes->GetVolume()->FSVolume(), fAttributes->ID());
2861
2862	delete fIterator;
2863	fInode->_RemoveIterator(this);
2864}
2865
2866
2867status_t
2868AttributeIterator::Rewind()
2869{
2870	fCurrentSmallData = 0;
2871
2872	if (fIterator != NULL)
2873		fIterator->Rewind();
2874
2875	return B_OK;
2876}
2877
2878
2879status_t
2880AttributeIterator::GetNext(char* name, size_t* _length, uint32* _type,
2881	ino_t* _id)
2882{
2883	// read attributes out of the small data section
2884
2885	if (fCurrentSmallData >= 0) {
2886		NodeGetter nodeGetter(fInode->GetVolume());
2887		status_t status = nodeGetter.SetTo(fInode);
2888		if (status != B_OK)
2889			return status;
2890
2891		const bfs_inode* node = nodeGetter.Node();
2892		const small_data* item = ((bfs_inode*)node)->SmallDataStart();
2893
2894		RecursiveLocker _(&fInode->SmallDataLock());
2895
2896		int32 index = 0;
2897		for (; !item->IsLast(node); item = item->Next(), index++) {
2898			if (item->NameSize() == FILE_NAME_NAME_LENGTH
2899				&& *item->Name() == FILE_NAME_NAME)
2900				continue;
2901
2902			if (index >= fCurrentSmallData)
2903				break;
2904		}
2905
2906		if (!item->IsLast(node)) {
2907			strncpy(name, item->Name(), B_FILE_NAME_LENGTH);
2908			*_type = item->Type();
2909			*_length = item->NameSize();
2910			*_id = (ino_t)index;
2911
2912			fCurrentSmallData = index + 1;
2913			return B_OK;
2914		}
2915
2916		// stop traversing the small_data section
2917		fCurrentSmallData = -1;
2918	}
2919
2920	// read attributes out of the attribute directory
2921
2922	if (fInode->Attributes().IsZero())
2923		return B_ENTRY_NOT_FOUND;
2924
2925	Volume* volume = fInode->GetVolume();
2926
2927	// if you haven't yet access to the attributes directory, get it
2928	if (fAttributes == NULL) {
2929		if (get_vnode(volume->FSVolume(), volume->ToVnode(fInode->Attributes()),
2930				(void**)&fAttributes) != B_OK) {
2931			FATAL(("get_vnode() failed in AttributeIterator::GetNext(ino_t"
2932				" = %" B_PRIdINO ",name = \"%s\")\n", fInode->ID(), name));
2933			return B_ENTRY_NOT_FOUND;
2934		}
2935
2936		BPlusTree* tree = fAttributes->Tree();
2937		if (tree == NULL
2938			|| (fIterator = new(std::nothrow) TreeIterator(tree)) == NULL) {
2939			FATAL(("could not get tree in AttributeIterator::GetNext(ino_t"
2940				" = %" B_PRIdINO ",name = \"%s\")\n", fInode->ID(), name));
2941			return B_ENTRY_NOT_FOUND;
2942		}
2943	}
2944
2945	uint16 length;
2946	ino_t id;
2947	status_t status = fIterator->GetNextEntry(name, &length,
2948		B_FILE_NAME_LENGTH, &id);
2949	if (status != B_OK)
2950		return status;
2951
2952	Vnode vnode(volume, id);
2953	Inode* attribute;
2954	if ((status = vnode.Get(&attribute)) == B_OK) {
2955		*_type = attribute->Type();
2956		*_length = length;
2957		*_id = id;
2958	}
2959
2960	return status;
2961}
2962
2963
2964void
2965AttributeIterator::Update(uint16 index, int8 change)
2966{
2967	// fCurrentSmallData points already to the next item
2968	if (index < fCurrentSmallData)
2969		fCurrentSmallData += change;
2970}
2971
2972