/* BlockAllocator - block bitmap handling and allocation policies
**
** Initial version by Axel Dörfler, axeld@pinc-software.de
** This file may be used under the terms of the OpenBeOS License.
*/


#include "Debug.h"
#include "BlockAllocator.h"
#include "Volume.h"
#include "Inode.h"
#include "BPlusTree.h"
#include "Stack.h"
#include "bfs_control.h"

#include <util/kernel_cpp.h>
#include <stdlib.h>

#ifdef USER
#	define spawn_kernel_thread spawn_thread
#endif


// Things the BlockAllocator should do:

// - find a range of blocks of a certain size near a specific position
// - allocate an "unsharp" (approximate) range of blocks for pre-allocation
// - free blocks
// - know how to deal with each allocation, special handling for directories,
//   files, symlinks, etc. (type sensitive allocation policies)

// What makes the code complicated is the fact that we are not just reading
// in the whole bitmap and operating on it in memory - e.g. a 13 GB partition
// with a block size of 2048 bytes already has an 800 kB bitmap, and partition
// sizes will only keep growing - so that's not an option.
// Instead, we read in each bitmap block only when it is needed - since an
// allocation group can span several blocks in the block bitmap, the
// AllocationBlock class is there to make handling those easier.

// The current implementation is only slightly optimized and could probably
// be improved a lot. Furthermore, the allocation policies used here should
// be validated against real-world workloads.
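
// As a rough sketch of the arithmetic involved (illustrative numbers, not
// mandated by the disk format): with 2048 byte blocks, one bitmap block
// holds 2048 * 8 = 16384 bits and thus maps 16384 blocks. A block number
// is located in the bitmap roughly like this:
//
//	int32 group = blockNumber >> volume->AllocationGroupShift();
//	uint32 offsetInGroup = ...;	// the block's index within its group
//	uint32 bitmapBlock = offsetInGroup / (volume->BlockSize() << 3);
//	uint32 bit = offsetInGroup % (volume->BlockSize() << 3);
//
// AllocationGroup::Allocate()/Free() below perform exactly this split.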


struct check_cookie {
	check_cookie() {}

	block_run			current;
	Inode				*parent;
	mode_t				parent_mode;
	Stack<block_run>	stack;
	TreeIterator		*iterator;
};


class AllocationBlock : public CachedBlock {
	public:
		AllocationBlock(Volume *volume);

		void Allocate(uint16 start, uint16 numBlocks);
		void Free(uint16 start, uint16 numBlocks);
		inline bool IsUsed(uint16 block);

		status_t SetTo(AllocationGroup &group, uint16 block);

		uint32 NumBlockBits() const { return fNumBits; }
		uint32 &Block(int32 index) { return ((uint32 *)fBlock)[index]; }

	private:
		uint32 fNumBits;
};


class AllocationGroup {
	public:
		AllocationGroup();

		void AddFreeRange(int32 start, int32 blocks);
		bool IsFull() const { return fFreeBits == 0; }

		status_t Allocate(Transaction *transaction, uint16 start, int32 length);
		status_t Free(Transaction *transaction, uint16 start, int32 length);

		uint32 fNumBits;
		int32 fStart;
		int32 fFirstFree, fLargest, fLargestFirst;
			// ToDo: fLargest & fLargestFirst are not maintained (and therefore not used) yet!
		int32 fFreeBits;
};


AllocationBlock::AllocationBlock(Volume *volume)
	: CachedBlock(volume)
{
}


status_t
AllocationBlock::SetTo(AllocationGroup &group, uint16 block)
{
	// 8 blocks per byte
	fNumBits = fVolume->BlockSize() << 3;
	// the last group may have fewer bits in its last block
	if ((block + 1) * fNumBits > group.fNumBits)
		fNumBits = group.fNumBits % fNumBits;

	return CachedBlock::SetTo(group.fStart + block) != NULL ? B_OK : B_ERROR;
}
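
// To illustrate the calculation above (illustrative numbers only): with a
// block size of 2048 bytes, each bitmap block covers 2048 * 8 = 16384
// blocks. If a group has fNumBits == 20000, its first bitmap block is fully
// used (16384 valid bits), while the second one only contains
// 20000 % 16384 == 3616 valid bits - exactly what SetTo() computes.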


bool
AllocationBlock::IsUsed(uint16 block)
{
	// blocks outside the valid range are considered used
	if (block >= fNumBits)
		return true;
	// the block bitmap is accessed in 32-bit words
	return Block(block >> 5) & HOST_ENDIAN_TO_BFS_INT32(1UL << (block % 32));
}


void
AllocationBlock::Allocate(uint16 start, uint16 numBlocks)
{
	ASSERT(start < fNumBits);
	ASSERT(uint32(start + numBlocks) <= fNumBits);

	if (uint32(start + numBlocks) > fNumBits) {
		FATAL(("AllocationBlock::Allocate(): tried to allocate too many blocks: %u (fNumBits = %lu)!\n", numBlocks, fNumBits));
		DIE(("AllocationBlock::Allocate(): tried to allocate too many blocks"));
	}

	int32 block = start >> 5;

	while (numBlocks > 0) {
		uint32 mask = 0;
		for (int32 i = start % 32; i < 32 && numBlocks; i++, numBlocks--)
			mask |= 1UL << i;

#ifdef DEBUG
		// check for blocks that are already set
		if (HOST_ENDIAN_TO_BFS_INT32(mask) & ((uint32 *)fBlock)[block]) {
			FATAL(("AllocationBlock::Allocate(): some blocks are already allocated, start = %u, numBlocks = %u\n", start, numBlocks));
			DEBUGGER(("blocks already set!"));
		}
#endif
		Block(block++) |= HOST_ENDIAN_TO_BFS_INT32(mask);
		start = 0;
	}
}


void
AllocationBlock::Free(uint16 start, uint16 numBlocks)
{
	ASSERT(start < fNumBits);
	ASSERT(uint32(start + numBlocks) <= fNumBits);

	if (uint32(start + numBlocks) > fNumBits) {
		FATAL(("AllocationBlock::Free(): tried to free too many blocks: %u (fNumBits = %lu)!\n", numBlocks, fNumBits));
		DIE(("AllocationBlock::Free(): tried to free too many blocks"));
	}

	int32 block = start >> 5;

	while (numBlocks > 0) {
		uint32 mask = 0;
		for (int32 i = start % 32; i < 32 && numBlocks; i++, numBlocks--)
			mask |= 1UL << i;

		Block(block++) &= HOST_ENDIAN_TO_BFS_INT32(~mask);
		start = 0;
	}
}


//	#pragma mark -


AllocationGroup::AllocationGroup()
	:
	fFirstFree(-1),
	fLargest(-1),
	fLargestFirst(-1),
	fFreeBits(0)
{
}


void
AllocationGroup::AddFreeRange(int32 start, int32 blocks)
{
	//D(if (blocks > 512)
	//	PRINT(("range of %ld blocks starting at %ld\n",blocks,start)));

	if (fFirstFree == -1)
		fFirstFree = start;

	if (fLargest < blocks) {
		fLargest = blocks;
		fLargestFirst = start;
	}

	fFreeBits += blocks;
}


/** Allocates the specified run in the allocation group.
 *	Doesn't check if the run is valid or already partially
 *	allocated, nor does it maintain the free range hints
 *	or the volume's used blocks count.
 *	It only does the low-level work of allocating some bits
 *	in the block bitmap.
 *	Assumes that the block bitmap lock is held.
 */

status_t
AllocationGroup::Allocate(Transaction *transaction, uint16 start, int32 length)
{
	if (start > fNumBits)
		return B_ERROR;

	// Update the allocation group info
	// ToDo: this info will be incorrect if something goes wrong later
	// Note that the fFirstFree block doesn't have to be really free
	if (start == fFirstFree)
		fFirstFree = start + length;
	fFreeBits -= length;

	Volume *volume = transaction->GetVolume();

	// calculate the block in the block bitmap and the position within
	uint32 bitsPerBlock = volume->BlockSize() << 3;
	uint32 block = start / bitsPerBlock;
	start = start % bitsPerBlock;

	AllocationBlock cached(volume);

	while (length > 0) {
		if (cached.SetTo(*this, block) < B_OK)
			RETURN_ERROR(B_IO_ERROR);

		uint32 numBlocks = length;
		if (start + numBlocks > cached.NumBlockBits())
			numBlocks = cached.NumBlockBits() - start;

		cached.Allocate(start, numBlocks);

		if (cached.WriteBack(transaction) < B_OK)
			return B_IO_ERROR;

		length -= numBlocks;
		start = 0;
		block++;
	}

	return B_OK;
}


/** Frees the specified run in the allocation group.
 *	Doesn't check if the run is valid or actually allocated,
 *	nor does it maintain the free range hints or the volume's
 *	used blocks count.
 *	It only does the low-level work of freeing some bits
 *	in the block bitmap.
 *	Assumes that the block bitmap lock is held.
 */

status_t
AllocationGroup::Free(Transaction *transaction, uint16 start, int32 length)
{
	if (start > fNumBits)
		return B_ERROR;

	// Update the allocation group info
	// ToDo: this info will be incorrect if something goes wrong later
	if (fFirstFree > start)
		fFirstFree = start;
	fFreeBits += length;

	Volume *volume = transaction->GetVolume();

	// calculate the block in the block bitmap and the position within
	uint32 bitsPerBlock = volume->BlockSize() << 3;
	uint32 block = start / bitsPerBlock;
	start = start % bitsPerBlock;

	AllocationBlock cached(volume);

	while (length > 0) {
		if (cached.SetTo(*this, block) < B_OK)
			RETURN_ERROR(B_IO_ERROR);

		uint16 freeLength = length;
		if (uint32(start + length) > cached.NumBlockBits())
			freeLength = cached.NumBlockBits() - start;

		cached.Free(start, freeLength);

		if (cached.WriteBack(transaction) < B_OK)
			return B_IO_ERROR;

		length -= freeLength;
		start = 0;
		block++;
	}
	return B_OK;
}


//	#pragma mark -


BlockAllocator::BlockAllocator(Volume *volume)
	:
	fVolume(volume),
	fLock("bfs allocator"),
	fGroups(NULL),
	fCheckBitmap(NULL)
{
}


BlockAllocator::~BlockAllocator()
{
	delete[] fGroups;
}


status_t
BlockAllocator::Initialize(bool full)
{
	if (fLock.InitCheck() < B_OK)
		return B_ERROR;

	fNumGroups = fVolume->AllocationGroups();
	fBlocksPerGroup = fVolume->SuperBlock().BlocksPerAllocationGroup();
	fGroups = new AllocationGroup[fNumGroups];
	if (fGroups == NULL)
		return B_NO_MEMORY;

	if (!full)
		return B_OK;

	fLock.Lock();
		// the lock will be released by the initialize() function

	thread_id id = spawn_kernel_thread((thread_func)BlockAllocator::initialize,
			"bfs block allocator", B_LOW_PRIORITY, (void *)this);
	if (id < B_OK)
		return initialize(this);

	return resume_thread(id);
}


status_t
BlockAllocator::InitializeAndClearBitmap(Transaction &transaction)
{
	status_t status = Initialize(false);
	if (status < B_OK)
		return status;

	uint32 blocks = fBlocksPerGroup;
	uint32 numBits = 8 * blocks * fVolume->BlockSize();

	uint32 *buffer = (uint32 *)malloc(numBits >> 3);
	if (buffer == NULL)
		RETURN_ERROR(B_NO_MEMORY);

	memset(buffer, 0, numBits >> 3);

	off_t offset = 1;

	// initialize the AllocationGroup objects and clear the on-disk bitmap

	for (int32 i = 0; i < fNumGroups; i++) {
		if (cached_write(fVolume->Device(), offset, buffer, blocks, fVolume->BlockSize()) < B_OK) {
			free(buffer);
			return B_ERROR;
		}

		// the last allocation group may contain fewer blocks than the others
		fGroups[i].fNumBits = i == fNumGroups - 1 ? fVolume->NumBlocks() - i * numBits : numBits;
		fGroups[i].fStart = offset;
		fGroups[i].fFirstFree = fGroups[i].fLargestFirst = 0;
		fGroups[i].fFreeBits = fGroups[i].fLargest = fGroups[i].fNumBits;

		offset += blocks;
	}
	free(buffer);

	// reserve the boot block, the log area, and the block bitmap itself
	uint32 reservedBlocks = fVolume->Log().Start() + fVolume->Log().Length();

	if (fGroups[0].Allocate(&transaction, 0, reservedBlocks) < B_OK) {
		FATAL(("could not allocate reserved space for block bitmap/log!\n"));
		return B_ERROR;
	}
	fVolume->SuperBlock().used_blocks = HOST_ENDIAN_TO_BFS_INT64(reservedBlocks);

	return B_OK;
}


status_t
BlockAllocator::initialize(BlockAllocator *allocator)
{
	// The lock must already be held at this point

	Volume *volume = allocator->fVolume;
	uint32 blocks = allocator->fBlocksPerGroup;
	uint32 numBits = 8 * blocks * volume->BlockSize();
	off_t freeBlocks = 0;

	uint32 *buffer = (uint32 *)malloc(numBits >> 3);
	if (buffer == NULL) {
		allocator->fLock.Unlock();
		RETURN_ERROR(B_NO_MEMORY);
	}

	AllocationGroup *groups = allocator->fGroups;
	off_t offset = 1;
	int32 num = allocator->fNumGroups;

	for (int32 i = 0; i < num; i++) {
		if (cached_read(volume->Device(), offset, buffer, blocks, volume->BlockSize()) < B_OK)
			break;

		// the last allocation group may contain fewer blocks than the others
		groups[i].fNumBits = i == num - 1 ? volume->NumBlocks() - i * numBits : numBits;
		groups[i].fStart = offset;

		// find all free ranges in this allocation group,
		// traversing the bitmap one 32-bit word at a time
		int32 start = -1, range = 0;
		int32 size = groups[i].fNumBits, count = 0;

		for (int32 k = 0; k < ((size + 31) >> 5); k++) {
			for (int32 j = 0; j < 32 && count < size; j++, count++) {
				if (buffer[k] & (1UL << j)) {
					if (range > 0) {
						groups[i].AddFreeRange(start, range);
						range = 0;
					}
				} else if (range++ == 0)
					start = count;
			}
		}
		if (range)
			groups[i].AddFreeRange(start, range);

		freeBlocks += groups[i].fFreeBits;

		offset += blocks;
	}
	free(buffer);

	// check that the block bitmap and log area are reserved
	uint32 reservedBlocks = volume->Log().Start() + volume->Log().Length();
	if (allocator->CheckBlockRun(block_run::Run(0, 0, reservedBlocks)) < B_OK) {
		FATAL(("space for block bitmap or log area was not reserved!\n"));

		Transaction transaction(volume, 0);
		if (groups[0].Allocate(&transaction, 0, reservedBlocks) < B_OK) {
			FATAL(("could not allocate reserved space for block bitmap/log!\n"));
			volume->Panic();
		} else
			transaction.Done();
	}

	off_t usedBlocks = volume->NumBlocks() - freeBlocks;
	if (volume->UsedBlocks() != usedBlocks) {
		// if the disk was in a dirty state at mount time, it's
		// normal that these values don't match
		INFORM(("volume reports %Ld used blocks, correct is %Ld\n", volume->UsedBlocks(), usedBlocks));
		volume->SuperBlock().used_blocks = HOST_ENDIAN_TO_BFS_INT64(usedBlocks);
	}

	allocator->fLock.Unlock();
	return B_OK;
}


status_t
BlockAllocator::AllocateBlocks(Transaction *transaction, int32 group, uint16 start,
	uint16 maximum, uint16 minimum, block_run &run)
{
	if (maximum == 0)
		return B_BAD_VALUE;

	FUNCTION_START(("group = %ld, start = %u, maximum = %u, minimum = %u\n", group, start, maximum, minimum));

	AllocationBlock cached(fVolume);
	Locker lock(fLock);

	// the first scan through all allocation groups will look for the
	// wanted maximum of blocks, the second scan will just look to
	// satisfy the minimum requirement
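	// E.g. (illustrative numbers only): with maximum == 64 and minimum == 16,
	// the first pass only accepts a free range of 64 or more blocks; only if
	// no group can provide that will the second pass settle for any range of
	// at least 16 blocks.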
	uint16 numBlocks = maximum;

	for (int32 i = 0; i < fNumGroups * 2; i++, group++, start = 0) {
		group = group % fNumGroups;

		if (start >= fGroups[group].fNumBits || fGroups[group].IsFull())
			continue;

		if (i >= fNumGroups) {
			// if the minimum is the same as the maximum, it's not necessary
			// to search the allocation groups a second time
			if (maximum == minimum)
				return B_DEVICE_FULL;

			numBlocks = minimum;
		}

		// Skip the group if the wanted number of blocks exceeds the largest
		// free range in it
		// ToDo: disabled because fLargest is currently not maintained after the first allocation
		//if (numBlocks > fGroups[group].fLargest)
		//	continue;

		if (start < fGroups[group].fFirstFree)
			start = fGroups[group].fFirstFree;

		// there may be more than one block per allocation group - we iterate
		// through them to find a place for the allocation.
		// (one allocation can't exceed one allocation group)

		uint32 block = start / (fVolume->BlockSize() << 3);
		int32 range = 0, rangeStart = 0;

		for (; block < fBlocksPerGroup; block++) {
			if (cached.SetTo(fGroups[group], block) < B_OK)
				RETURN_ERROR(B_ERROR);

			// find a range large enough to hold the allocation
			for (uint32 bit = start % cached.NumBlockBits(); bit < cached.NumBlockBits(); bit++) {
				if (!cached.IsUsed(bit)) {
					if (range == 0) {
						// start a new range
						rangeStart = block * cached.NumBlockBits() + bit;
					}

					// have we found a range large enough to hold numBlocks?
					if (++range >= maximum)
						break;
				} else if (i >= fNumGroups && range >= minimum) {
					// we have found a range at least as large as the
					// required minimum (second pass)
					break;
				} else {
					// end of a range
					range = 0;
				}
			}

			// if we found a suitable range, mark the blocks as in use, and
			// write the updated block bitmap back to disk
			if (range >= numBlocks) {
				// adjust the allocation size
				if (numBlocks < maximum)
					numBlocks = range;

				if (fGroups[group].Allocate(transaction, rangeStart, numBlocks) < B_OK)
					RETURN_ERROR(B_IO_ERROR);

				run.allocation_group = HOST_ENDIAN_TO_BFS_INT32(group);
				run.start = HOST_ENDIAN_TO_BFS_INT16(rangeStart);
				run.length = HOST_ENDIAN_TO_BFS_INT16(numBlocks);

				fVolume->SuperBlock().used_blocks =
					HOST_ENDIAN_TO_BFS_INT64(fVolume->UsedBlocks() + numBlocks);
					// We are not writing back the disk's superblock - that's
					// either done by the journaling code, or when the disk
					// is unmounted.
					// If the value is not correct at mount time, it will be
					// fixed anyway.

				return B_OK;
			}

			// start from the beginning of the next bitmap block
			start = 0;
		}
	}
	return B_DEVICE_FULL;
}


status_t
BlockAllocator::AllocateForInode(Transaction *transaction, const block_run *parent,
	mode_t type, block_run &run)
{
	// Apply some allocation policies here (AllocateBlocks() will break them
	// if necessary) - we will start with those described in Dominic Giampaolo's
	// "Practical File System Design", and see how well they work.

	// Files go into the same allocation group as their parent; sub-directories
	// are placed 8 allocation groups after the parent's.
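	// E.g. (illustrative): a sub-directory whose parent sits in allocation
	// group 3 is placed in group 11 - AllocateBlocks() takes the group
	// number modulo the number of groups, so the offset safely wraps around.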
	uint16 group = parent->AllocationGroup();
	if ((type & (S_DIRECTORY | S_INDEX_DIR | S_ATTR_DIR)) == S_DIRECTORY)
		group += 8;

	return AllocateBlocks(transaction, group, 0, 1, 1, run);
}


status_t
BlockAllocator::Allocate(Transaction *transaction, const Inode *inode, off_t numBlocks,
	block_run &run, uint16 minimum)
{
	if (numBlocks <= 0)
		return B_ERROR;

	// one block_run can't hold more data than there is in one allocation group
	if (numBlocks > fGroups[0].fNumBits)
		numBlocks = fGroups[0].fNumBits;

	// since block_run.length is a uint16, the largest number of blocks that
	// can be covered by a block_run is 65535
	// ToDo: if we drop compatibility, couldn't we do better than this?
	// There are basically two possibilities:
	// a) since a length of zero doesn't make any sense, let it stand for
	//    65536 - but that could cause many problems (bugs) in other places
	// b) reduce the maximum number of blocks per block_run, so that the
	//    remaining values could be put to some useful purpose (like 4
	//    blocks) - but that would also reduce the maximum file size
	if (numBlocks > MAX_BLOCK_RUN_LENGTH)
		numBlocks = MAX_BLOCK_RUN_LENGTH;
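
	// E.g. (illustrative numbers only): with 1024 byte blocks, one block_run
	// can map at most 65535 blocks, i.e. just under 64 MB of contiguous
	// data - larger files simply consist of several block_runs.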

	// Apply some allocation policies here (AllocateBlocks() will break them
	// if necessary)
	uint16 group = inode->BlockRun().AllocationGroup();
	uint16 start = 0;

	// are there already allocated blocks? (then just try to allocate near the last one)
	if (inode->Size() > 0) {
		data_stream *data = &inode->Node()->data;
		// ToDo: we currently don't care about the case where the data stream
		// has already grown into the indirect ranges
		if (data->max_double_indirect_range == 0
			&& data->max_indirect_range == 0) {
			// since size > 0, there must be a valid block run in this stream
			int32 last = 0;
			for (; last < NUM_DIRECT_BLOCKS - 1; last++)
				if (data->direct[last + 1].IsZero())
					break;

			group = data->direct[last].AllocationGroup();
			start = data->direct[last].Start() + data->direct[last].Length();
		}
	} else if (inode->IsContainer() || inode->IsSymLink()) {
		// directory and symbolic link data will go into the same allocation
		// group as the inode, but after the inode's data
		start = inode->BlockRun().Start();
	} else {
		// file data will start in the next allocation group
		group = inode->BlockRun().AllocationGroup() + 1;
	}

	return AllocateBlocks(transaction, group, start, numBlocks, minimum, run);
}


status_t
BlockAllocator::Free(Transaction *transaction, block_run run)
{
	Locker lock(fLock);

	int32 group = run.AllocationGroup();
	uint16 start = run.Start();
	uint16 length = run.Length();

	FUNCTION_START(("group = %ld, start = %u, length = %u\n", group, start, length));

	// We don't use Volume::IsValidBlockRun() here, because this allows us to
	// check against the real group size (the last group may be shorter)
	if (group < 0 || group >= fNumGroups
		|| start > fGroups[group].fNumBits
		|| uint32(start + length) > fGroups[group].fNumBits
		|| length == 0) {
		FATAL(("tried to free an invalid block_run (%ld, %u, %u)\n", group, start, length));
		DEBUGGER(("tried to free invalid block_run"));
		return B_BAD_VALUE;
	}
	// check if someone is trying to free the reserved areas at the beginning of the volume
	if (group == 0 && start < uint32(fVolume->Log().Start() + fVolume->Log().Length())) {
		FATAL(("tried to free a reserved block_run (%ld, %u, %u)\n", group, start, length));
		DEBUGGER(("tried to free reserved block"));
		return B_BAD_VALUE;
	}
#ifdef DEBUG
	if (CheckBlockRun(run) < B_OK)
		return B_BAD_DATA;
#endif

	if (fGroups[group].Free(transaction, start, length) < B_OK)
		RETURN_ERROR(B_IO_ERROR);

#ifdef DEBUG
	if (CheckBlockRun(run, NULL, NULL, false) < B_OK)
		DEBUGGER(("CheckBlockRun() reports allocated blocks (which were just freed)\n"));
#endif

	fVolume->SuperBlock().used_blocks =
		HOST_ENDIAN_TO_BFS_INT64(fVolume->UsedBlocks() - run.Length());
	return B_OK;
}


size_t
BlockAllocator::BitmapSize() const
{
	return fVolume->BlockSize() * fNumGroups * fBlocksPerGroup;
}


//	#pragma mark -
//	Functions to check the validity of the bitmap - they are used from
//	the "chkbfs" command


bool
BlockAllocator::IsValidCheckControl(check_control *control)
{
	if (control == NULL
		|| control->magic != BFS_IOCTL_CHECK_MAGIC) {
		FATAL(("invalid check_control (%p)!\n", control));
		return false;
	}

	return true;
}


status_t
BlockAllocator::StartChecking(check_control *control)
{
	if (!IsValidCheckControl(control))
		return B_BAD_VALUE;

	status_t status = fLock.Lock();
	if (status < B_OK)
		return status;

	size_t size = BitmapSize();
	fCheckBitmap = (uint32 *)malloc(size);
	if (fCheckBitmap == NULL) {
		fLock.Unlock();
		return B_NO_MEMORY;
	}

	check_cookie *cookie = new check_cookie();
	if (cookie == NULL) {
		free(fCheckBitmap);
		fCheckBitmap = NULL;
		fLock.Unlock();

		return B_NO_MEMORY;
	}

	// initialize the check bitmap; the reserved area at the start of the
	// volume (boot block, log area, block bitmap) is always marked as used
	memset(fCheckBitmap, 0, size);
	for (int32 block = fVolume->Log().Start() + fVolume->Log().Length(); block-- > 0;)
		SetCheckBitmapAt(block);

	cookie->stack.Push(fVolume->Root());
	cookie->stack.Push(fVolume->Indices());
	cookie->iterator = NULL;
	control->cookie = cookie;

	fCheckCookie = cookie;
		// to be able to clean up nicely if "chkbfs" exited abnormally

	// ToDo: check the reserved area in the bitmap!

	return B_OK;
}
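
// A chkbfs-style client is expected to drive these functions roughly like
// this (a sketch - the check_control comes in via the BFS ioctl interface):
//
//	StartChecking(control);
//	while (CheckNextNode(control) == B_OK)
//		;	// inspect control->name, control->errors, control->status, ...
//	StopChecking(control);
//
// CheckNextNode() returns B_ENTRY_NOT_FOUND once the whole tree was visited.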


status_t
BlockAllocator::StopChecking(check_control *control)
{
	check_cookie *cookie;
	if (control == NULL)
		cookie = fCheckCookie;
	else
		cookie = (check_cookie *)control->cookie;

	if (cookie == NULL)
		return B_ERROR;

	if (cookie->iterator != NULL) {
		delete cookie->iterator;
		cookie->iterator = NULL;

		// the current directory inode is still locked in memory
		put_vnode(fVolume->ID(), fVolume->ToVnode(cookie->current));
	}

	// if CheckNextNode() ran through completely, we can
	// fix any damage to the bitmap
	if (control != NULL && control->status == B_ENTRY_NOT_FOUND) {
		// calculate the number of used blocks in the check bitmap
		size_t size = BitmapSize();
		off_t usedBlocks = 0LL;

		// ToDo: update the allocation groups' used blocks info
		for (uint32 i = size >> 2; i-- > 0;) {
			uint32 compare = 1;
			for (int16 j = 0; j < 32; j++, compare <<= 1) {
				if (compare & fCheckBitmap[i])
					usedBlocks++;
			}
		}

		control->stats.freed = fVolume->UsedBlocks() - usedBlocks + control->stats.missing;
		if (control->stats.freed < 0)
			control->stats.freed = 0;

		// Should we fix errors? Were there any errors we can fix?
		if (control->flags & BFS_FIX_BITMAP_ERRORS
			&& (control->stats.freed != 0 || control->stats.missing != 0)) {
			// If so, write the check bitmap back over the original one.
			// We use transactions to play it safe - in fact, several of
			// them, so that we don't blow the maximum log size on large
			// disks; this doesn't need to be atomic, anyway.
			fVolume->SuperBlock().used_blocks = HOST_ENDIAN_TO_BFS_INT64(usedBlocks);

			int32 blocksInBitmap = fNumGroups * fBlocksPerGroup;
			int32 blockSize = fVolume->BlockSize();

			for (int32 i = 0; i < blocksInBitmap; i += 512) {
				Transaction transaction(fVolume, 1 + i);

				int32 blocksToWrite = 512;
				if (blocksToWrite + i > blocksInBitmap)
					blocksToWrite = blocksInBitmap - i;

				status_t status = transaction.WriteBlocks(1 + i,
					(uint8 *)fCheckBitmap + i * blockSize, blocksToWrite);
				if (status < B_OK) {
					FATAL(("error writing bitmap: %s\n", strerror(status)));
					break;
				}
				transaction.Done();
			}
		}
	} else
		FATAL(("BlockAllocator::CheckNextNode() didn't run through\n"));

	free(fCheckBitmap);
	fCheckBitmap = NULL;
	fCheckCookie = NULL;
	delete cookie;
	fLock.Unlock();

	return B_OK;
}


status_t
BlockAllocator::CheckNextNode(check_control *control)
{
	if (!IsValidCheckControl(control))
		return B_BAD_VALUE;

	check_cookie *cookie = (check_cookie *)control->cookie;

	while (true) {
		if (cookie->iterator == NULL) {
			if (!cookie->stack.Pop(&cookie->current)) {
				// no more runs on the stack, we are obviously finished!
				control->status = B_ENTRY_NOT_FOUND;
				return B_ENTRY_NOT_FOUND;
			}

			// get an iterator for the next directory

#ifdef UNSAFE_GET_VNODE
			RecursiveLocker locker(fVolume->Lock());
#endif
			Vnode vnode(fVolume, cookie->current);
			Inode *inode;
			if (vnode.Get(&inode) < B_OK) {
				FATAL(("check: Could not open inode at %Ld\n", fVolume->ToBlock(cookie->current)));
				continue;
			}

			if (!inode->IsContainer()) {
				FATAL(("check: inode at %Ld should have been a directory\n", fVolume->ToBlock(cookie->current)));
				continue;
			}

			BPlusTree *tree;
			if (inode->GetTree(&tree) != B_OK) {
				FATAL(("check: could not open b+tree from inode at %Ld\n", fVolume->ToBlock(cookie->current)));
				continue;
			}

			cookie->parent = inode;
			cookie->parent_mode = inode->Mode();

			cookie->iterator = new TreeIterator(tree);
			if (cookie->iterator == NULL)
				RETURN_ERROR(B_NO_MEMORY);

			// the inode must stay locked in memory until the iterator is freed
			vnode.Keep();

			// check the inode of the directory
			control->errors = 0;
			control->status = CheckInode(inode, control);

			if (inode->GetName(control->name) < B_OK)
				strcpy(control->name, "(node has no name)");

			control->inode = inode->ID();
			control->mode = inode->Mode();

			return B_OK;
		}

		char name[B_FILE_NAME_LENGTH];
		uint16 length;
		vnode_id id;

		status_t status = cookie->iterator->GetNextEntry(name, &length, B_FILE_NAME_LENGTH, &id);
		if (status == B_ENTRY_NOT_FOUND) {
			// there are no more entries in this iterator, free it and go on
			delete cookie->iterator;
			cookie->iterator = NULL;

			// unlock the directory's inode from memory
			put_vnode(fVolume->ID(), fVolume->ToVnode(cookie->current));

			continue;
		} else if (status == B_OK) {
			// ignore "." and ".." entries
			if (!strcmp(name, ".") || !strcmp(name, ".."))
				continue;

			// fill in the control data as soon as we have it
			strlcpy(control->name, name, B_FILE_NAME_LENGTH);
			control->inode = id;
			control->errors = 0;

			Vnode vnode(fVolume, id);
			Inode *inode;
			if (vnode.Get(&inode) < B_OK) {
				FATAL(("Could not open inode ID %Ld!\n", id));
				control->errors |= BFS_COULD_NOT_OPEN;
				control->status = B_ERROR;
				return B_OK;
			}

			// check if the inode's name is the same as in the b+tree
			if (inode->IsRegularNode()) {
				SimpleLocker locker(inode->SmallDataLock());

				const char *localName = inode->Name();
				if (localName == NULL || strcmp(localName, name)) {
					control->errors |= BFS_NAMES_DONT_MATCH;
					FATAL(("Names differ: tree \"%s\", inode \"%s\"\n", name, localName));
				}
			}

			control->mode = inode->Mode();

			// Check for the correct mode of the node (if the mode of the
			// file doesn't fit its parent, there is a serious problem)
			if ((cookie->parent_mode & S_ATTR_DIR && !inode->IsAttribute())
				|| (cookie->parent_mode & S_INDEX_DIR && !inode->IsIndex())
				|| ((cookie->parent_mode & (S_DIRECTORY | S_ATTR_DIR | S_INDEX_DIR)) == S_DIRECTORY
					&& inode->Mode() & (S_ATTR | S_ATTR_DIR | S_INDEX_DIR))) {
				FATAL(("inode at %Ld is of wrong type: %o (parent %o at %Ld)!\n",
					inode->BlockNumber(), inode->Mode(), cookie->parent_mode, cookie->parent->BlockNumber()));

				// if we are allowed to fix errors, we should remove the file
				if (control->flags & BFS_REMOVE_WRONG_TYPES
					&& control->flags & BFS_FIX_BITMAP_ERRORS) {
#ifdef UNSAFE_GET_VNODE
					RecursiveLocker locker(fVolume->Lock());
#endif
					// it's safe to start a transaction, because Inode::Remove()
					// won't touch the block bitmap (which we hold the lock for)
					// if we set the INODE_DONT_FREE_SPACE flag - since we fix
					// the bitmap anyway
					Transaction transaction(fVolume, cookie->parent->BlockNumber());

					inode->Node()->flags |= INODE_DONT_FREE_SPACE;
					status = cookie->parent->Remove(&transaction, name, NULL, inode->IsContainer());
					if (status == B_OK)
						transaction.Done();
				} else
					status = B_ERROR;

				control->errors |= BFS_WRONG_TYPE;
				control->status = status;
				return B_OK;
			}

			// if the inode has an attribute directory, push it on the stack
			if (!inode->Attributes().IsZero())
				cookie->stack.Push(inode->Attributes());

			// push the directory on the stack so that it will be scanned later
			if (inode->IsContainer() && !inode->IsIndex())
				cookie->stack.Push(inode->BlockRun());
			else {
				// check it now
				control->status = CheckInode(inode, control);

				return B_OK;
			}
		}
	}
	// is never reached
}


bool
BlockAllocator::CheckBitmapIsUsedAt(off_t block) const
{
	size_t size = BitmapSize();
	uint32 index = block / 32;	// 32-bit resolution
	if (index >= size / 4)
		return false;

	return BFS_ENDIAN_TO_HOST_INT32(fCheckBitmap[index]) & (1UL << (block & 0x1f));
}


void
BlockAllocator::SetCheckBitmapAt(off_t block)
{
	size_t size = BitmapSize();
	uint32 index = block / 32;	// 32-bit resolution
	if (index >= size / 4)
		return;

	fCheckBitmap[index] |= HOST_ENDIAN_TO_BFS_INT32(1UL << (block & 0x1f));
}


status_t
BlockAllocator::CheckBlockRun(block_run run, const char *type, check_control *control, bool allocated)
{
	if (run.AllocationGroup() < 0 || run.AllocationGroup() >= fNumGroups
		|| run.Start() > fGroups[run.AllocationGroup()].fNumBits
		|| uint32(run.Start() + run.Length()) > fGroups[run.AllocationGroup()].fNumBits
		|| run.Length() == 0) {
		PRINT(("%s: block_run(%ld, %u, %u) is invalid!\n", type, run.AllocationGroup(), run.Start(), run.Length()));
		if (control == NULL)
			return B_BAD_DATA;

		control->errors |= BFS_INVALID_BLOCK_RUN;
		return B_OK;
	}

	uint32 bitsPerBlock = fVolume->BlockSize() << 3;
	uint32 block = run.Start() / bitsPerBlock;
	uint32 pos = run.Start() % bitsPerBlock;
	int32 length = 0;
	off_t firstMissing = -1, firstSet = -1;
	off_t firstGroupBlock = (off_t)run.AllocationGroup() << fVolume->AllocationGroupShift();

	AllocationBlock cached(fVolume);

	for (; block < fBlocksPerGroup && length < run.Length(); block++, pos = 0) {
		if (cached.SetTo(fGroups[run.AllocationGroup()], block) < B_OK)
			RETURN_ERROR(B_IO_ERROR);

		if (pos >= cached.NumBlockBits()) {
			// something very strange has happened...
			RETURN_ERROR(B_ERROR);
		}

		while (length < run.Length() && pos < cached.NumBlockBits()) {
			if (cached.IsUsed(pos) != allocated) {
				if (control == NULL) {
					PRINT(("%s: block_run(%ld, %u, %u) is only partially allocated (pos = %ld, length = %ld)!\n",
						type, run.AllocationGroup(), run.Start(), run.Length(), pos, length));
					return B_BAD_DATA;
				}
				if (firstMissing == -1) {
					firstMissing = firstGroupBlock + pos + block * bitsPerBlock;
					control->errors |= BFS_MISSING_BLOCKS;
				}
				control->stats.missing++;
			} else if (firstMissing != -1) {
				PRINT(("%s: block_run(%ld, %u, %u): blocks %Ld - %Ld are %sallocated!\n",
					type, run.AllocationGroup(), run.Start(), run.Length(), firstMissing,
					firstGroupBlock + pos + block * bitsPerBlock - 1, allocated ? "not " : ""));
				firstMissing = -1;
			}

			if (fCheckBitmap != NULL) {
				// Set the block in the check bitmap as well, but have a look
				// at whether it is already allocated first
				uint32 offset = pos + block * bitsPerBlock;
				if (CheckBitmapIsUsedAt(firstGroupBlock + offset)) {
					if (firstSet == -1) {
						firstSet = firstGroupBlock + offset;
						control->errors |= BFS_BLOCKS_ALREADY_SET;
					}
					control->stats.already_set++;
				} else {
					if (firstSet != -1) {
						FATAL(("%s: block_run(%ld, %u, %u): blocks %Ld - %Ld are already set!\n",
							type, run.AllocationGroup(), run.Start(), run.Length(), firstSet, firstGroupBlock + offset - 1));
						firstSet = -1;
					}
					SetCheckBitmapAt(firstGroupBlock + offset);
				}
			}
			length++;
			pos++;
		}

		if (block + 1 >= fBlocksPerGroup || length >= run.Length()) {
			if (firstMissing != -1)
				PRINT(("%s: block_run(%ld, %u, %u): blocks %Ld - %Ld are not allocated!\n", type, run.AllocationGroup(), run.Start(), run.Length(), firstMissing, firstGroupBlock + pos + block * bitsPerBlock - 1));
			if (firstSet != -1)
				FATAL(("%s: block_run(%ld, %u, %u): blocks %Ld - %Ld are already set!\n", type, run.AllocationGroup(), run.Start(), run.Length(), firstSet, firstGroupBlock + pos + block * bitsPerBlock - 1));
		}
	}

	return B_OK;
}


status_t
BlockAllocator::CheckInode(Inode *inode, check_control *control)
{
	if (control != NULL && fCheckBitmap == NULL)
		return B_NO_INIT;
	if (inode == NULL)
		return B_BAD_VALUE;

	status_t status = CheckBlockRun(inode->BlockRun(), "inode", control);
	if (status < B_OK)
		return status;

	if (inode->IsSymLink() && (inode->Flags() & INODE_LONG_SYMLINK) == 0) {
		// short symlinks don't have a data stream to check
		if (strlen(inode->Node()->short_symlink) >= SHORT_SYMLINK_NAME_LENGTH)
			return B_BAD_DATA;

		return B_OK;
	}

	data_stream *data = &inode->Node()->data;

	// check the direct range

	if (data->max_direct_range) {
		for (int32 i = 0; i < NUM_DIRECT_BLOCKS; i++) {
			if (data->direct[i].IsZero())
				break;

			status = CheckBlockRun(data->direct[i], "direct", control);
			if (status < B_OK)
				return status;
		}
	}

	CachedBlock cached(fVolume);

	// check the indirect range

	if (data->max_indirect_range) {
		status = CheckBlockRun(data->indirect, "indirect", control);
		if (status < B_OK)
			return status;

		off_t block = fVolume->ToBlock(data->indirect);

		for (int32 i = 0; i < data->indirect.Length(); i++) {
			block_run *runs = (block_run *)cached.SetTo(block + i);
			if (runs == NULL)
				RETURN_ERROR(B_IO_ERROR);

			int32 runsPerBlock = fVolume->BlockSize() / sizeof(block_run);
			int32 index = 0;
			for (; index < runsPerBlock; index++) {
				if (runs[index].IsZero())
					break;

				status = CheckBlockRun(runs[index], "indirect->run", control);
				if (status < B_OK)
					return status;
			}
			if (index < runsPerBlock)
				break;
		}
	}

	// check the double indirect range

	if (data->max_double_indirect_range) {
		status = CheckBlockRun(data->double_indirect, "double indirect", control);
		if (status < B_OK)
			return status;

		int32 runsPerBlock = fVolume->BlockSize() / sizeof(block_run);

		CachedBlock cachedDirect(fVolume);
		int32 maxIndirectIndex = (data->double_indirect.Length() << fVolume->BlockShift())
									/ sizeof(block_run);

		for (int32 indirectIndex = 0; indirectIndex < maxIndirectIndex; indirectIndex++) {
			// get the indirect array block
			block_run *array = (block_run *)cached.SetTo(fVolume->ToBlock(data->double_indirect)
				+ indirectIndex / runsPerBlock);
			if (array == NULL)
				return B_IO_ERROR;

			block_run indirect = array[indirectIndex % runsPerBlock];
			// are we finished yet?
			if (indirect.IsZero())
				return B_OK;

			status = CheckBlockRun(indirect, "double indirect->runs", control);
			if (status < B_OK)
				return status;

			int32 maxIndex = (indirect.Length() << fVolume->BlockShift()) / sizeof(block_run);

			for (int32 index = 0; index < maxIndex; ) {
				block_run *runs = (block_run *)cachedDirect.SetTo(fVolume->ToBlock(indirect)
					+ index / runsPerBlock);
				if (runs == NULL)
					return B_IO_ERROR;

				do {
					// are we finished yet?
					if (runs[index % runsPerBlock].IsZero())
						return B_OK;

					status = CheckBlockRun(runs[index % runsPerBlock], "double indirect->runs->run", control);
					if (status < B_OK)
						return status;
				} while ((++index % runsPerBlock) != 0);
					// move on to the next cached block once all of the
					// runs in this one have been checked
			}
		}
	}

	return B_OK;
}
