1/*
 * Copyright 2001-2012, Axel Dörfler, axeld@pinc-software.de.
3 * This file may be used under the terms of the MIT License.
4 */
5
6
7//! superblock, mounting, etc.
8
9
10#include "Attribute.h"
11#include "Debug.h"
12#include "Inode.h"
13#include "Journal.h"
14#include "Query.h"
15#include "Volume.h"
16
17
static const int32 kDesiredAllocationGroups = 56;
	// This is the number of allocation groups that newly initialized
	// disks are given, if possible.
	// That's only relevant for smaller disks, though, since any
	// of today's disk sizes already reach the maximum length
	// of an allocation group (65536 blocks).
	// It seems to create appropriate numbers for smaller disks
	// with this setting, though (i.e. you can create a 400 MB
	// file on a 1 GB disk without the need for double indirect
	// blocks).
28
29
// RAII-style helper: opens a device (or duplicates an already open file
// descriptor), optionally creates a block cache on top of it, and cleans
// both up again in its destructor — unless Keep() was called to transfer
// ownership of the descriptor/cache to the caller.
class DeviceOpener {
public:
						DeviceOpener(int fd, int mode);
						DeviceOpener(const char* device, int mode);
						~DeviceOpener();

			int			Open(const char* device, int mode);
			int			Open(int fd, int mode);
			void*		InitCache(off_t numBlocks, uint32 blockSize);
			void		RemoveCache(bool allowWrites);

			void		Keep();

			int			Device() const { return fDevice; }
			int			Mode() const { return fMode; }
			bool		IsReadOnly() const { return _IsReadOnly(fMode); }

			status_t	GetSize(off_t* _size, uint32* _blockSize = NULL);

private:
	static	bool		_IsReadOnly(int mode)
							{ return (mode & O_RWMASK) == O_RDONLY;}
	static	bool		_IsReadWrite(int mode)
							{ return (mode & O_RWMASK) == O_RDWR;}

			int			fDevice;
				// file descriptor, or a negative error code after a
				// failed Open(); -1 after Keep()
			int			fMode;
				// the mode the device was actually opened with
			void*		fBlockCache;
				// NULL until InitCache() has been called
};
59
60
/*!	Opens the given device path; check Device() for the resulting
	descriptor (negative on failure).
*/
DeviceOpener::DeviceOpener(const char* device, int mode)
	:
	fBlockCache(NULL)
{
	Open(device, mode);
}
67
68
/*!	Duplicates the given descriptor (see Open(int, int)); check Device()
	for the result.
*/
DeviceOpener::DeviceOpener(int fd, int mode)
	:
	fBlockCache(NULL)
{
	Open(fd, mode);
}
75
76
77DeviceOpener::~DeviceOpener()
78{
79	if (fDevice >= 0) {
80		RemoveCache(false);
81		close(fDevice);
82	}
83}
84
85
/*!	Opens \a device with the given \a mode (plus O_NOCACHE) and stores the
	resulting descriptor in fDevice. If read/write access was requested
	but the device is actually read-only, it falls back to opening the
	device read-only instead.
	Returns the descriptor, or a negative error code on failure.
*/
int
DeviceOpener::Open(const char* device, int mode)
{
	fDevice = open(device, mode | O_NOCACHE);
	if (fDevice < 0)
		fDevice = errno;
		// NOTE(review): the fallback below relies on errno being negative
		// on failure (as Haiku's error codes are); on platforms with
		// positive errno values the read-only retry would never trigger
		// — confirm this is Haiku-only code.

	if (fDevice < 0 && _IsReadWrite(mode)) {
		// try again to open read-only (don't rely on a specific error code)
		return Open(device, O_RDONLY | O_NOCACHE);
	}

	if (fDevice >= 0) {
		// opening succeeded
		fMode = mode;
		if (_IsReadWrite(mode)) {
			// check out if the device really allows for read/write access
			device_geometry geometry;
			if (!ioctl(fDevice, B_GET_GEOMETRY, &geometry)) {
				if (geometry.read_only) {
					// reopen device read-only
					close(fDevice);
					return Open(device, O_RDONLY | O_NOCACHE);
				}
			}
		}
	}

	return fDevice;
}
116
117
118int
119DeviceOpener::Open(int fd, int mode)
120{
121	fDevice = dup(fd);
122	if (fDevice < 0)
123		return errno;
124
125	fMode = mode;
126
127	return fDevice;
128}
129
130
131void*
132DeviceOpener::InitCache(off_t numBlocks, uint32 blockSize)
133{
134	return fBlockCache = block_cache_create(fDevice, numBlocks, blockSize,
135		IsReadOnly());
136}
137
138
139void
140DeviceOpener::RemoveCache(bool allowWrites)
141{
142	if (fBlockCache == NULL)
143		return;
144
145	block_cache_delete(fBlockCache, allowWrites);
146	fBlockCache = NULL;
147}
148
149
/*!	Transfers ownership to the caller: the destructor will neither close
	the device nor delete the block cache anymore.
*/
void
DeviceOpener::Keep()
{
	fDevice = -1;
}
155
156
157/*!	Returns the size of the device in bytes. It uses B_GET_GEOMETRY
158	to compute the size, or fstat() if that failed.
159*/
160status_t
161DeviceOpener::GetSize(off_t* _size, uint32* _blockSize)
162{
163	device_geometry geometry;
164	if (ioctl(fDevice, B_GET_GEOMETRY, &geometry) < 0) {
165		// maybe it's just a file
166		struct stat stat;
167		if (fstat(fDevice, &stat) < 0)
168			return B_ERROR;
169
170		if (_size)
171			*_size = stat.st_size;
172		if (_blockSize)	// that shouldn't cause us any problems
173			*_blockSize = 512;
174
175		return B_OK;
176	}
177
178	if (_size) {
179		*_size = 1LL * geometry.head_count * geometry.cylinder_count
180			* geometry.sectors_per_track * geometry.bytes_per_sector;
181	}
182	if (_blockSize)
183		*_blockSize = geometry.bytes_per_sector;
184
185	return B_OK;
186}
187
188
189//	#pragma mark -
190
191
192bool
193disk_super_block::IsValid() const
194{
195	if (Magic1() != (int32)SUPER_BLOCK_MAGIC1
196		|| Magic2() != (int32)SUPER_BLOCK_MAGIC2
197		|| Magic3() != (int32)SUPER_BLOCK_MAGIC3
198		|| (int32)block_size != inode_size
199		|| ByteOrder() != SUPER_BLOCK_FS_LENDIAN
200		|| (1UL << BlockShift()) != BlockSize()
201		|| AllocationGroups() < 1
202		|| AllocationGroupShift() < 1
203		|| BlocksPerAllocationGroup() < 1
204		|| NumBlocks() < 10
205		|| AllocationGroups() != divide_roundup(NumBlocks(),
206			1L << AllocationGroupShift()))
207		return false;
208
209	return true;
210}
211
212
/*!	Fills in a fresh superblock for a volume named \a diskName with
	\a numBlocks blocks of \a blockSize bytes each (the inode size is set
	equal to the block size), and computes the allocation group layout.
*/
void
disk_super_block::Initialize(const char* diskName, off_t numBlocks,
	uint32 blockSize)
{
	memset(this, 0, sizeof(disk_super_block));

	magic1 = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_MAGIC1);
	magic2 = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_MAGIC2);
	magic3 = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_MAGIC3);
	fs_byte_order = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_FS_LENDIAN);
	flags = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_DISK_CLEAN);

	strlcpy(name, diskName, sizeof(name));

	// compute the base-2 logarithm of the block size (at least 9, i.e.
	// 512 bytes)
	int32 blockShift = 9;
	while ((1UL << blockShift) < blockSize) {
		blockShift++;
	}

	block_size = inode_size = HOST_ENDIAN_TO_BFS_INT32(blockSize);
	block_shift = HOST_ENDIAN_TO_BFS_INT32(blockShift);

	num_blocks = HOST_ENDIAN_TO_BFS_INT64(numBlocks);
	used_blocks = 0;

	// Get the minimum ag_shift (that's determined by the block size)

	int32 bitsPerBlock = blockSize << 3;
	off_t bitmapBlocks = (numBlocks + bitsPerBlock - 1) / bitsPerBlock;
	int32 blocksPerGroup = 1;
	int32 groupShift = 13;

	// one bitmap block covers one allocation group, so blocks with more
	// than 8192 bits need a correspondingly larger group shift
	for (int32 i = 8192; i < bitsPerBlock; i *= 2) {
		groupShift++;
	}

	// Many allocation groups help applying allocation policies, but if
	// they are too small, we will need too many block_runs to cover large
	// files (see above to get an explanation of the kDesiredAllocationGroups
	// constant).

	int32 numGroups;

	while (true) {
		numGroups = (bitmapBlocks + blocksPerGroup - 1) / blocksPerGroup;
		if (numGroups > kDesiredAllocationGroups) {
			// an allocation group may not span more than 65536 blocks
			if (groupShift == 16)
				break;

			groupShift++;
			blocksPerGroup *= 2;
		} else
			break;
	}

	num_ags = HOST_ENDIAN_TO_BFS_INT32(numGroups);
	blocks_per_ag = HOST_ENDIAN_TO_BFS_INT32(blocksPerGroup);
	ag_shift = HOST_ENDIAN_TO_BFS_INT32(groupShift);
}
272
273
274//	#pragma mark -
275
276
/*!	Creates the in-memory representation of a volume; the device itself
	is only attached later, in Mount().
*/
Volume::Volume(fs_volume* volume)
	:
	fVolume(volume),
	fBlockAllocator(this),
	fRootNode(NULL),
	fIndicesNode(NULL),
	fDirtyCachedBlocks(0),
	fFlags(0),
	fCheckingThread(-1)
{
	mutex_init(&fLock, "bfs volume");
	mutex_init(&fQueryLock, "bfs queries");
}
290
291
Volume::~Volume()
{
	// destroy the locks in reverse order of their initialization
	mutex_destroy(&fQueryLock);
	mutex_destroy(&fLock);
}
297
298
/*!	Returns whether the cached superblock passes all of its consistency
	checks (see disk_super_block::IsValid()).
*/
bool
Volume::IsValidSuperBlock() const
{
	return fSuperBlock.IsValid();
}
304
305
306/*!	Checks whether the given block number may be the location of an inode block.
307*/
308bool
309Volume::IsValidInodeBlock(off_t block) const
310{
311	return block > fSuperBlock.LogEnd() && block < NumBlocks();
312}
313
314
/*!	Called when on-disk corruption has been detected: switches the volume
	into read-only mode to prevent further damage, and drops into the
	kernel debugger on KDEBUG builds.
*/
void
Volume::Panic()
{
	FATAL(("Disk corrupted... switch to read-only mode!\n"));
	fFlags |= VOLUME_READ_ONLY;
#if KDEBUG
	kernel_debugger("BFS panics!");
#endif
}
324
325
/*!	Mounts the volume from \a deviceName: reads and validates the
	superblock, sets up the block cache and the journal, replays the log
	if necessary, initializes the block allocator, and publishes the root
	(and, if present, the indices) vnode.
	On success, ownership of the device and the block cache is transferred
	from the local DeviceOpener to this volume (see opener.Keep()).
*/
status_t
Volume::Mount(const char* deviceName, uint32 flags)
{
	// TODO: validate the FS in write mode as well!
#if (B_HOST_IS_LENDIAN && defined(BFS_BIG_ENDIAN_ONLY)) \
	|| (B_HOST_IS_BENDIAN && defined(BFS_LITTLE_ENDIAN_ONLY))
	// in big endian mode, we only mount read-only for now
	flags |= B_MOUNT_READ_ONLY;
#endif

	DeviceOpener opener(deviceName, (flags & B_MOUNT_READ_ONLY) != 0
		? O_RDONLY : O_RDWR);
	fDevice = opener.Device();
	if (fDevice < B_OK)
		RETURN_ERROR(fDevice);

	// the opener may have fallen back to read-only access
	if (opener.IsReadOnly())
		fFlags |= VOLUME_READ_ONLY;

	// read the superblock
	if (Identify(fDevice, &fSuperBlock) != B_OK) {
		FATAL(("invalid superblock!\n"));
		return B_BAD_VALUE;
	}

	// initialize short hands to the superblock (to save byte swapping)
	fBlockSize = fSuperBlock.BlockSize();
	fBlockShift = fSuperBlock.BlockShift();
	fAllocationGroupShift = fSuperBlock.AllocationGroupShift();

	// check if the device size is large enough to hold the file system
	off_t diskSize;
	if (opener.GetSize(&diskSize, &fDeviceBlockSize) != B_OK)
		RETURN_ERROR(B_ERROR);
	if (diskSize < (NumBlocks() << BlockShift()))
		RETURN_ERROR(B_BAD_VALUE);

	// set the current log pointers, so that journaling will work correctly
	fLogStart = fSuperBlock.LogStart();
	fLogEnd = fSuperBlock.LogEnd();

	if ((fBlockCache = opener.InitCache(NumBlocks(), fBlockSize)) == NULL)
		return B_ERROR;

	fJournal = new(std::nothrow) Journal(this);
	if (fJournal == NULL)
		return B_NO_MEMORY;

	status_t status = fJournal->InitCheck();
	if (status < B_OK) {
		FATAL(("could not initialize journal: %s!\n", strerror(status)));
		return status;
	}
	// NOTE(review): from here on, failing paths return without deleting
	// fJournal (and, further below, without unpublishing the root vnode)
	// — presumably the Volume object is discarded on a failed mount, but
	// the journal allocation itself would leak. TODO: confirm/clean up.

	// replaying the log is the first thing we will do on this disk
	status = fJournal->ReplayLog();
	if (status != B_OK) {
		FATAL(("Replaying log failed, data may be corrupted, volume "
			"read-only.\n"));
		fFlags |= VOLUME_READ_ONLY;
			// TODO: if this is the boot volume, Bootscript will assume this
			// is a CD...
			// TODO: it would be nice to have a user visible alert instead
			// of letting him just find this in the syslog.
	}

	status = fBlockAllocator.Initialize();
	if (status != B_OK) {
		FATAL(("could not initialize block bitmap allocator!\n"));
		return status;
	}

	fRootNode = new(std::nothrow) Inode(this, ToVnode(Root()));
	if (fRootNode != NULL && fRootNode->InitCheck() == B_OK) {
		status = publish_vnode(fVolume, ToVnode(Root()), (void*)fRootNode,
			&gBFSVnodeOps, fRootNode->Mode(), 0);
		if (status == B_OK) {
			// try to get indices root dir

			if (!Indices().IsZero()) {
				fIndicesNode = new(std::nothrow) Inode(this,
					ToVnode(Indices()));
			}

			if (fIndicesNode == NULL
				|| fIndicesNode->InitCheck() < B_OK
				|| !fIndicesNode->IsContainer()) {
				INFORM(("bfs: volume doesn't have indices!\n"));

				if (fIndicesNode) {
					// if this is the case, the index root node is gone bad,
					// and BFS switch to read-only mode
					fFlags |= VOLUME_READ_ONLY;
					delete fIndicesNode;
					fIndicesNode = NULL;
				}
			} else {
				// we don't use the vnode layer to access the indices node
			}
		} else {
			FATAL(("could not create root node: publish_vnode() failed!\n"));
			delete fRootNode;
			return status;
		}
	} else {
		status = B_BAD_VALUE;
		FATAL(("could not create root node!\n"));
		return status;
	}

	// all went fine
	opener.Keep();
	return B_OK;
}
440
441
/*!	Releases everything Mount() acquired, in reverse order: the root
	vnode, the block allocator, the journal (whose destructor also flushes
	the log and all blocks to disk), the indices node, the block cache,
	and finally the device itself.
*/
status_t
Volume::Unmount()
{
	put_vnode(fVolume, ToVnode(Root()));

	fBlockAllocator.Uninitialize();

	// This will also flush the log & all blocks to disk
	delete fJournal;
	fJournal = NULL;

	delete fIndicesNode;

	block_cache_delete(fBlockCache, !IsReadOnly());
	close(fDevice);

	return B_OK;
}
460
461
/*!	Flushes the journal and all cached blocks to disk. */
status_t
Volume::Sync()
{
	return fJournal->FlushLogAndBlocks();
}
467
468
469status_t
470Volume::ValidateBlockRun(block_run run)
471{
472	if (run.AllocationGroup() < 0
473		|| run.AllocationGroup() > (int32)AllocationGroups()
474		|| run.Start() > (1UL << AllocationGroupShift())
475		|| run.length == 0
476		|| uint32(run.Length() + run.Start())
477				> (1UL << AllocationGroupShift())) {
478		Panic();
479		FATAL(("*** invalid run(%d,%d,%d)\n", (int)run.AllocationGroup(),
480			run.Start(), run.Length()));
481		return B_BAD_DATA;
482	}
483	return B_OK;
484}
485
486
487block_run
488Volume::ToBlockRun(off_t block) const
489{
490	block_run run;
491	run.allocation_group = HOST_ENDIAN_TO_BFS_INT32(
492		block >> AllocationGroupShift());
493	run.start = HOST_ENDIAN_TO_BFS_INT16(
494		block & ((1LL << AllocationGroupShift()) - 1));
495	run.length = HOST_ENDIAN_TO_BFS_INT16(1);
496	return run;
497}
498
499
/*!	Creates the indices root directory (an S_INDEX_DIR string-index
	directory), records it in the superblock, and writes the superblock
	back to disk.
*/
status_t
Volume::CreateIndicesRoot(Transaction& transaction)
{
	off_t id;
	status_t status = Inode::Create(transaction, NULL, NULL,
		S_INDEX_DIR | S_STR_INDEX | S_DIRECTORY | 0700, 0, 0, NULL, &id,
		&fIndicesNode);
	if (status < B_OK)
		RETURN_ERROR(status);

	fSuperBlock.indices = ToBlockRun(id);
	return WriteSuperBlock();
}
513
514
515status_t
516Volume::CreateVolumeID(Transaction& transaction)
517{
518	Attribute attr(fRootNode);
519	status_t status;
520	attr_cookie* cookie;
521	status = attr.Create("be:volume_id", B_UINT64_TYPE, O_RDWR, &cookie);
522	if (status == B_OK) {
523		static bool seeded = false;
524		if (!seeded) {
525			// seed the random number generator for the be:volume_id attribute.
526			srand(time(NULL));
527			seeded = true;
528		}
529		uint64_t id;
530		size_t length = sizeof(id);
531		id = ((uint64_t)rand() << 32) | rand();
532		attr.Write(transaction, cookie, 0, (uint8_t *)&id, &length, NULL);
533	}
534	return status;
535}
536
537
538
/*!	Allocates a block_run for a new inode of the given \a type, using the
	\a parent's position on disk as an allocation hint.
*/
status_t
Volume::AllocateForInode(Transaction& transaction, const Inode* parent,
	mode_t type, block_run& run)
{
	return fBlockAllocator.AllocateForInode(transaction, &parent->BlockRun(),
		type, run);
}
546
547
548status_t
549Volume::WriteSuperBlock()
550{
551	if (write_pos(fDevice, 512, &fSuperBlock, sizeof(disk_super_block))
552			!= sizeof(disk_super_block))
553		return B_IO_ERROR;
554
555	return B_OK;
556}
557
558
559void
560Volume::UpdateLiveQueries(Inode* inode, const char* attribute, int32 type,
561	const uint8* oldKey, size_t oldLength, const uint8* newKey,
562	size_t newLength)
563{
564	MutexLocker _(fQueryLock);
565
566	SinglyLinkedList<Query>::Iterator iterator = fQueries.GetIterator();
567	while (iterator.HasNext()) {
568		Query* query = iterator.Next();
569		query->LiveUpdate(inode, attribute, type, oldKey, oldLength, newKey,
570			newLength);
571	}
572}
573
574
575void
576Volume::UpdateLiveQueriesRenameMove(Inode* inode, ino_t oldDirectoryID,
577	const char* oldName, ino_t newDirectoryID, const char* newName)
578{
579	MutexLocker _(fQueryLock);
580
581	size_t oldLength = strlen(oldName);
582	size_t newLength = strlen(newName);
583
584	SinglyLinkedList<Query>::Iterator iterator = fQueries.GetIterator();
585	while (iterator.HasNext()) {
586		Query* query = iterator.Next();
587		query->LiveUpdateRenameMove(inode, oldDirectoryID, oldName, oldLength,
588			newDirectoryID, newName, newLength);
589	}
590}
591
592
/*!	Checks if there is a live query whose results depend on the presence
	or value of the specified attribute.
	Don't use it if you already have all the data together to evaluate
	the queries - it wouldn't save you anything in this case.
*/
bool
Volume::CheckForLiveQuery(const char* attribute)
{
	// TODO: check for a live query that depends on the specified attribute
	return true;
		// be pessimistic for now and pretend a matching query always exists
}
604
605
/*!	Registers a live query with this volume so that it receives future
	attribute/rename updates.
*/
void
Volume::AddQuery(Query* query)
{
	MutexLocker _(fQueryLock);
	fQueries.Add(query);
}
612
613
/*!	Unregisters a previously added live query. */
void
Volume::RemoveQuery(Query* query)
{
	MutexLocker _(fQueryLock);
	fQueries.Remove(query);
}
620
621
622//	#pragma mark - Disk scanning and initialization
623
624
625/*static*/ status_t
626Volume::CheckSuperBlock(const uint8* data, uint32* _offset)
627{
628	disk_super_block* superBlock = (disk_super_block*)(data + 512);
629	if (superBlock->IsValid()) {
630		if (_offset != NULL)
631			*_offset = 512;
632		return B_OK;
633	}
634
635#ifndef BFS_LITTLE_ENDIAN_ONLY
636	// For PPC, the superblock might be located at offset 0
637	superBlock = (disk_super_block*)data;
638	if (superBlock->IsValid()) {
639		if (_offset != NULL)
640			*_offset = 0;
641		return B_OK;
642	}
643#endif
644
645	return B_BAD_VALUE;
646}
647
648
649/*static*/ status_t
650Volume::Identify(int fd, disk_super_block* superBlock)
651{
652	uint8 buffer[1024];
653	if (read_pos(fd, 0, buffer, sizeof(buffer)) != sizeof(buffer))
654		return B_IO_ERROR;
655
656	uint32 offset;
657	if (CheckSuperBlock(buffer, &offset) != B_OK)
658		return B_BAD_VALUE;
659
660	memcpy(superBlock, buffer + offset, sizeof(disk_super_block));
661	return B_OK;
662}
663
664
/*!	Initializes a new BFS volume on the device referred to by \a fd:
	writes a fresh superblock, clears the block bitmap, sets up the log
	area, creates the root directory and — unless VOLUME_NO_INDICES is
	set in \a flags — the standard indices.
	\a blockSize must be 1024, 2048, 4096 or 8192.
*/
status_t
Volume::Initialize(int fd, const char* name, uint32 blockSize,
	uint32 flags)
{
	// although there is no really good reason for it, we won't
	// accept '/' in disk names (mkbfs does this, too - and since
	// Tracker names mounted volumes like their name)
	if (strchr(name, '/') != NULL)
		return B_BAD_VALUE;

	if (blockSize != 1024 && blockSize != 2048 && blockSize != 4096
		&& blockSize != 8192)
		return B_BAD_VALUE;

	DeviceOpener opener(fd, O_RDWR);
	if (opener.Device() < B_OK)
		return B_BAD_VALUE;

	if (opener.IsReadOnly())
		return B_READ_ONLY_DEVICE;

	fDevice = opener.Device();

	uint32 deviceBlockSize;
	off_t deviceSize;
	if (opener.GetSize(&deviceSize, &deviceBlockSize) < B_OK)
		return B_ERROR;

	off_t numBlocks = deviceSize / blockSize;

	// create valid superblock

	fSuperBlock.Initialize(name, numBlocks, blockSize);

	// initialize short hands to the superblock (to save byte swapping)
	fBlockSize = fSuperBlock.BlockSize();
	fBlockShift = fSuperBlock.BlockShift();
	fAllocationGroupShift = fSuperBlock.AllocationGroupShift();

	// determine log size depending on the size of the volume
	off_t logSize = 2048;
	if (numBlocks <= 20480)
		logSize = 512;
	if (deviceSize > 1LL * 1024 * 1024 * 1024)
		logSize = 4096;

	// since the allocator has not been initialized yet, we
	// cannot use BlockAllocator::BitmapSize() here
	off_t bitmapBlocks = (numBlocks + blockSize * 8 - 1) / (blockSize * 8);

	// the log area starts right behind the block bitmap
	fSuperBlock.log_blocks = ToBlockRun(bitmapBlocks + 1);
	fSuperBlock.log_blocks.length = HOST_ENDIAN_TO_BFS_INT16(logSize);
	fSuperBlock.log_start = fSuperBlock.log_end = HOST_ENDIAN_TO_BFS_INT64(
		ToBlock(Log()));

	// set the current log pointers, so that journaling will work correctly
	fLogStart = fSuperBlock.LogStart();
	fLogEnd = fSuperBlock.LogEnd();

	if (!IsValidSuperBlock())
		RETURN_ERROR(B_ERROR);

	if ((fBlockCache = opener.InitCache(NumBlocks(), fBlockSize)) == NULL)
		return B_ERROR;

	fJournal = new(std::nothrow) Journal(this);
	if (fJournal == NULL || fJournal->InitCheck() < B_OK)
		RETURN_ERROR(B_ERROR);

	// ready to write data to disk

	Transaction transaction(this, 0);

	if (fBlockAllocator.InitializeAndClearBitmap(transaction) < B_OK)
		RETURN_ERROR(B_ERROR);

	off_t id;
	status_t status = Inode::Create(transaction, NULL, NULL,
		S_DIRECTORY | 0755, 0, 0, NULL, &id, &fRootNode);
	if (status < B_OK)
		RETURN_ERROR(status);

	fSuperBlock.root_dir = ToBlockRun(id);

	if ((flags & VOLUME_NO_INDICES) == 0) {
		// The indices root directory will be created automatically
		// when the standard indices are created (or any other).
		Index index(this);
		status = index.Create(transaction, "name", B_STRING_TYPE);
		if (status < B_OK)
			return status;

		status = index.Create(transaction, "BEOS:APP_SIG", B_STRING_TYPE);
		if (status < B_OK)
			return status;

		status = index.Create(transaction, "last_modified", B_INT64_TYPE);
		if (status < B_OK)
			return status;

		status = index.Create(transaction, "size", B_INT64_TYPE);
		if (status < B_OK)
			return status;
	}

	// a failing volume ID is not fatal, the superblock is still written
	CreateVolumeID(transaction);

	WriteSuperBlock();
	transaction.Done();

	Sync();
	opener.RemoveCache(true);
	return B_OK;
}
779