1/*
 * Copyright 2001-2008, Axel Dörfler, axeld@pinc-software.de. All Rights Reserved.
3 * This file may be used under the terms of the MIT License.
4 */
5
6//! superblock, mounting, etc.
7
8
9#include "Debug.h"
10#include "Volume.h"
11#include "Journal.h"
12#include "Inode.h"
13#include "Query.h"
14
15#include <util/kernel_cpp.h>
16#include <KernelExport.h>
17#include <Drivers.h>
18
19#include <stdlib.h>
20#include <stdio.h>
21#include <string.h>
22#include <ctype.h>
23
24
25static const int32 kDesiredAllocationGroups = 56;
	// This is the number of allocation groups that newly initialized
	// disks should end up with, if possible.
	// That's only relevant for smaller disks, though, since any
	// of today's disk sizes already reaches the maximum length
	// of an allocation group (65536 blocks).
	// It seems to create appropriate numbers for smaller disks
	// with this setting, though (i.e. you can create a 400 MB
	// file on a 1 GB disk without the need for double indirect
	// blocks).
35
36
37class DeviceOpener {
38	public:
39		DeviceOpener(const char *device, int mode);
40		~DeviceOpener();
41
42		int Open(const char *device, int mode);
43		status_t InitCache(off_t numBlocks);
44		void RemoveCache(int mode);
45
46		void Keep();
47
48		int Device() const { return fDevice; }
49
50		status_t GetSize(off_t *_size, uint32 *_blockSize = NULL);
51
52	private:
53		int		fDevice;
54		bool	fCached;
55};
56
57
58DeviceOpener::DeviceOpener(const char *device, int mode)
59	:
60	fCached(false)
61{
62	Open(device, mode);
63}
64
65
66DeviceOpener::~DeviceOpener()
67{
68	if (fDevice >= B_OK) {
69		close(fDevice);
70		if (fCached)
71			remove_cached_device_blocks(fDevice, NO_WRITES);
72	}
73}
74
75
76int
77DeviceOpener::Open(const char *device, int mode)
78{
79	fDevice = open(device, mode);
80	return fDevice;
81}
82
83
84status_t
85DeviceOpener::InitCache(off_t numBlocks)
86{
87	if (init_cache_for_device(fDevice, numBlocks) == B_OK) {
88		fCached = true;
89		return B_OK;
90	}
91
92	return B_ERROR;
93}
94
95
96void
97DeviceOpener::RemoveCache(int mode)
98{
99	if (!fCached)
100		return;
101
102	remove_cached_device_blocks(fDevice, mode);
103	fCached = false;
104}
105
106
107void
108DeviceOpener::Keep()
109{
110	fDevice = -1;
111}
112
113
114/** Returns the size of the device in bytes. It uses B_GET_GEOMETRY
115 *	to compute the size, or fstat() if that failed.
116 */
117
118status_t
119DeviceOpener::GetSize(off_t *_size, uint32 *_blockSize)
120{
121	device_geometry geometry;
122	if (ioctl(fDevice, B_GET_GEOMETRY, &geometry) < 0) {
123		// maybe it's just a file
124		struct stat stat;
125		if (fstat(fDevice, &stat) < 0)
126			return B_ERROR;
127
128		if (_size)
129			*_size = stat.st_size;
130		if (_blockSize)	// that shouldn't cause us any problems
131			*_blockSize = 512;
132
133		return B_OK;
134	}
135
136	if (_size) {
137		*_size = 1LL * geometry.head_count * geometry.cylinder_count
138					* geometry.sectors_per_track * geometry.bytes_per_sector;
139	}
140	if (_blockSize)
141		*_blockSize = geometry.bytes_per_sector;
142
143	return B_OK;
144}
145
146
147//	#pragma mark -
148
149
150bool
151disk_super_block::IsValid()
152{
153	if (Magic1() != (int32)SUPER_BLOCK_MAGIC1
154		|| Magic2() != (int32)SUPER_BLOCK_MAGIC2
155		|| Magic3() != (int32)SUPER_BLOCK_MAGIC3
156		|| (int32)block_size != inode_size
157		|| ByteOrder() != SUPER_BLOCK_FS_LENDIAN
158		|| (1UL << BlockShift()) != BlockSize()
159		|| AllocationGroups() < 1
160		|| AllocationGroupShift() < 1
161		|| BlocksPerAllocationGroup() < 1
162		|| NumBlocks() < 10
163		|| AllocationGroups() != divide_roundup(NumBlocks(),
164			1L << AllocationGroupShift()))
165		return false;
166
167	return true;
168}
169
170
171void
172disk_super_block::Initialize(const char *diskName, off_t numBlocks, uint32 blockSize)
173{
174	memset(this, 0, sizeof(disk_super_block));
175
176	magic1 = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_MAGIC1);
177	magic2 = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_MAGIC2);
178	magic3 = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_MAGIC3);
179	fs_byte_order = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_FS_LENDIAN);
180	flags = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_DISK_CLEAN);
181
182	strlcpy(name, diskName, sizeof(name));
183
184	int32 blockShift = 9;
185	while ((1UL << blockShift) < blockSize) {
186		blockShift++;
187	}
188
189	block_size = inode_size = HOST_ENDIAN_TO_BFS_INT32(blockSize);
190	block_shift = HOST_ENDIAN_TO_BFS_INT32(blockShift);
191
192	num_blocks = HOST_ENDIAN_TO_BFS_INT64(numBlocks);
193	used_blocks = 0;
194
195	// Get the minimum ag_shift (that's determined by the block size)
196
197	int32 bitsPerBlock = blockSize << 3;
198	off_t bitmapBlocks = (numBlocks + bitsPerBlock - 1) / bitsPerBlock;
199	int32 blocksPerGroup = 1;
200	int32 groupShift = 13;
201
202	for (int32 i = 8192; i < bitsPerBlock; i *= 2) {
203		groupShift++;
204	}
205
206	// Many allocation groups help applying allocation policies, but if
207	// they are too small, we will need to many block_runs to cover large
208	// files (see above to get an explanation of the kDesiredAllocationGroups
209	// constant).
210
211	int32 numGroups;
212
213	while (true) {
214		numGroups = (bitmapBlocks + blocksPerGroup - 1) / blocksPerGroup;
215		if (numGroups > kDesiredAllocationGroups) {
216			if (groupShift == 16)
217				break;
218
219			groupShift++;
220			blocksPerGroup *= 2;
221		} else
222			break;
223	}
224
225	num_ags = HOST_ENDIAN_TO_BFS_INT32(numGroups);
226	blocks_per_ag = HOST_ENDIAN_TO_BFS_INT32(blocksPerGroup);
227	ag_shift = HOST_ENDIAN_TO_BFS_INT32(groupShift);
228}
229
230
231//	#pragma mark -
232
233
234Volume::Volume(dev_t id)
235	:
236	fID(id),
237	fBlockAllocator(this),
238	fLock("bfs volume"),
239	fRootNode(NULL),
240	fIndicesNode(NULL),
241	fDirtyCachedBlocks(0),
242	fUniqueID(0),
243	fFlags(0)
244{
245}
246
247
248Volume::~Volume()
249{
250}
251
252
253bool
254Volume::IsValidSuperBlock()
255{
256	return fSuperBlock.IsValid();
257}
258
259
260void
261Volume::Panic()
262{
263	FATAL(("we have to panic... switch to read-only mode!\n"));
264	fFlags |= VOLUME_READ_ONLY;
265#ifdef USER
266	debugger("BFS panics!");
267#elif defined(DEBUG)
268	kernel_debugger("BFS panics!");
269#endif
270}
271
272
273status_t
274Volume::Mount(const char *deviceName, uint32 flags)
275{
276	if (flags & B_MOUNT_READ_ONLY)
277		fFlags |= VOLUME_READ_ONLY;
278
279	// ToDo: validate the FS in write mode as well!
280#if (B_HOST_IS_LENDIAN && defined(BFS_BIG_ENDIAN_ONLY)) \
281	|| (B_HOST_IS_BENDIAN && defined(BFS_LITTLE_ENDIAN_ONLY))
282	// in big endian mode, we only mount read-only for now
283	flags |= B_MOUNT_READ_ONLY;
284#endif
285
286	DeviceOpener opener(deviceName, flags & B_MOUNT_READ_ONLY ? O_RDONLY : O_RDWR);
287
288	// if we couldn't open the device, try read-only (don't rely on a specific error code)
289	if (opener.Device() < B_OK && (flags & B_MOUNT_READ_ONLY) == 0) {
290		opener.Open(deviceName, O_RDONLY);
291		fFlags |= VOLUME_READ_ONLY;
292	}
293
294	fDevice = opener.Device();
295	if (fDevice < B_OK)
296		RETURN_ERROR(fDevice);
297
298	// check if it's a regular file, and if so, disable the cache for the
299	// underlaying file system
300	struct stat stat;
301	if (fstat(fDevice, &stat) < 0)
302		RETURN_ERROR(B_ERROR);
303
304#ifndef NO_FILE_UNCACHED_IO
305	if (stat.st_mode & S_FILE && ioctl(fDevice, IOCTL_FILE_UNCACHED_IO, NULL) < 0) {
306		// mount read-only if the cache couldn't be disabled
307#	ifdef DEBUG
308		FATAL(("couldn't disable cache for image file - system may dead-lock!\n"));
309#	else
310		FATAL(("couldn't disable cache for image file!\n"));
311		Panic();
312#	endif
313	}
314#endif
315
316	// read the superblock
317	if (Identify(fDevice, &fSuperBlock) != B_OK) {
318		FATAL(("invalid superblock!\n"));
319		return B_BAD_VALUE;
320	}
321
322	// initialize short hands to the superblock (to save byte swapping)
323	fBlockSize = fSuperBlock.BlockSize();
324	fBlockShift = fSuperBlock.BlockShift();
325	fAllocationGroupShift = fSuperBlock.AllocationGroupShift();
326
327	// check if the device size is large enough to hold the file system
328	off_t diskSize;
329	if (opener.GetSize(&diskSize) < B_OK)
330		RETURN_ERROR(B_ERROR);
331	if (diskSize < (NumBlocks() << BlockShift()))
332		RETURN_ERROR(B_BAD_VALUE);
333
334	// set the current log pointers, so that journaling will work correctly
335	fLogStart = fSuperBlock.LogStart();
336	fLogEnd = fSuperBlock.LogEnd();
337
338	if (opener.InitCache(NumBlocks()) != B_OK)
339		return B_ERROR;
340
341	fJournal = new Journal(this);
342	// replaying the log is the first thing we will do on this disk
343	if (fJournal && fJournal->InitCheck() < B_OK
344		|| fBlockAllocator.Initialize() < B_OK) {
345		// ToDo: improve error reporting for a bad journal
346		FATAL(("could not initialize journal/block bitmap allocator!\n"));
347		return B_NO_MEMORY;
348	}
349
350	status_t status = B_OK;
351
352	fRootNode = new Inode(this, ToVnode(Root()));
353	if (fRootNode && fRootNode->InitCheck() == B_OK) {
354		status = new_vnode(fID, ToVnode(Root()), (void *)fRootNode);
355		if (status == B_OK) {
356			// try to get indices root dir
357
358			// question: why doesn't get_vnode() work here??
359			// answer: we have not yet backpropagated the pointer to the
360			// volume in bfs_mount(), so bfs_read_vnode() can't get it.
361			// But it's not needed to do that anyway.
362
363			if (!Indices().IsZero())
364				fIndicesNode = new Inode(this, ToVnode(Indices()));
365
366			if (fIndicesNode == NULL
367				|| fIndicesNode->InitCheck() < B_OK
368				|| !fIndicesNode->IsContainer()) {
369				INFORM(("bfs: volume doesn't have indices!\n"));
370
371				if (fIndicesNode) {
372					// if this is the case, the index root node is gone bad, and
373					// BFS switch to read-only mode
374					fFlags |= VOLUME_READ_ONLY;
375					delete fIndicesNode;
376					fIndicesNode = NULL;
377				}
378			}
379
380			// all went fine
381			opener.Keep();
382			return B_OK;
383		} else
384			FATAL(("could not create root node: new_vnode() failed!\n"));
385
386		delete fRootNode;
387	} else {
388		status = B_BAD_VALUE;
389		FATAL(("could not create root node!\n"));
390	}
391
392	return status;
393}
394
395
396status_t
397Volume::Unmount()
398{
399	// This will also flush the log & all blocks to disk
400	delete fJournal;
401	fJournal = NULL;
402
403	delete fIndicesNode;
404
405	remove_cached_device_blocks(fDevice, IsReadOnly() ? NO_WRITES : ALLOW_WRITES);
406	close(fDevice);
407
408	return B_OK;
409}
410
411
412status_t
413Volume::Sync()
414{
415	return fJournal->FlushLogAndBlocks();
416}
417
418
419status_t
420Volume::ValidateBlockRun(block_run run)
421{
422	if (run.AllocationGroup() < 0 || run.AllocationGroup() > (int32)AllocationGroups()
423		|| run.Start() > (1UL << AllocationGroupShift())
424		|| run.length == 0
425		|| uint32(run.Length() + run.Start()) > (1UL << AllocationGroupShift())) {
426		Panic();
427		FATAL(("*** invalid run(%ld,%d,%d)\n", run.AllocationGroup(), run.Start(), run.Length()));
428		return B_BAD_DATA;
429	}
430	return B_OK;
431}
432
433
434block_run
435Volume::ToBlockRun(off_t block) const
436{
437	block_run run;
438	run.allocation_group = HOST_ENDIAN_TO_BFS_INT32(block >> AllocationGroupShift());
439	run.start = HOST_ENDIAN_TO_BFS_INT16(block & ((1LL << AllocationGroupShift()) - 1));
440	run.length = HOST_ENDIAN_TO_BFS_INT16(1);
441	return run;
442}
443
444
445status_t
446Volume::CreateIndicesRoot(Transaction *transaction)
447{
448	off_t id;
449	status_t status = Inode::Create(transaction, NULL, NULL,
450		S_INDEX_DIR | S_STR_INDEX | S_DIRECTORY | 0700, 0, 0, &id, &fIndicesNode);
451	if (status < B_OK)
452		RETURN_ERROR(status);
453
454	fSuperBlock.indices = ToBlockRun(id);
455	return WriteSuperBlock();
456}
457
458
459status_t
460Volume::AllocateForInode(Transaction *transaction, const Inode *parent, mode_t type, block_run &run)
461{
462	return fBlockAllocator.AllocateForInode(transaction, &parent->BlockRun(), type, run);
463}
464
465
466status_t
467Volume::WriteSuperBlock()
468{
469	if (write_pos(fDevice, 512, &fSuperBlock, sizeof(disk_super_block)) != sizeof(disk_super_block))
470		return B_IO_ERROR;
471
472	return B_OK;
473}
474
475
476void
477Volume::UpdateLiveQueries(Inode *inode, const char *attribute, int32 type, const uint8 *oldKey,
478	size_t oldLength, const uint8 *newKey, size_t newLength)
479{
480	if (fQueryLock.Lock() < B_OK)
481		return;
482
483	Query *query = NULL;
484	while ((query = fQueries.Next(query)) != NULL)
485		query->LiveUpdate(inode, attribute, type, oldKey, oldLength, newKey, newLength);
486
487	fQueryLock.Unlock();
488}
489
490
491/** Checks if there is a live query whose results depend on the presence
492 *	or value of the specified attribute.
493 *	Don't use it if you already have all the data together to evaluate
494 *	the queries - it wouldn't safe you anything in this case.
495 */
496
497bool
498Volume::CheckForLiveQuery(const char *attribute)
499{
500	// ToDo: check for a live query that depends on the specified attribute
501	return true;
502}
503
504
505void
506Volume::AddQuery(Query *query)
507{
508	if (fQueryLock.Lock() < B_OK)
509		return;
510
511	fQueries.Add(query);
512
513	fQueryLock.Unlock();
514}
515
516
517void
518Volume::RemoveQuery(Query *query)
519{
520	if (fQueryLock.Lock() < B_OK)
521		return;
522
523	fQueries.Remove(query);
524
525	fQueryLock.Unlock();
526}
527
528
529//	#pragma mark -
530//	Disk scanning and initialization
531
532
533status_t
534Volume::Identify(int fd, disk_super_block *superBlock)
535{
536	char buffer[1024];
537	if (read_pos(fd, 0, buffer, sizeof(buffer)) != sizeof(buffer))
538		return B_IO_ERROR;
539
540	// Note: that does work only for x86, for PowerPC, the superblock
541	// may be located at offset 0!
542	memcpy(superBlock, buffer + 512, sizeof(disk_super_block));
543	if (!superBlock->IsValid()) {
544#ifndef BFS_LITTLE_ENDIAN_ONLY
545		memcpy(superBlock, buffer, sizeof(disk_super_block));
546		if (!superBlock->IsValid())
547			return B_BAD_VALUE;
548#else
549		return B_BAD_VALUE;
550#endif
551	}
552
553	return B_OK;
554}
555
556
557#ifdef USER
558extern "C" void kill_device_vnodes(dev_t id);
559	// This call is only available in the userland fs_shell
560
561status_t
562Volume::Initialize(const char *device, const char *name, uint32 blockSize, uint32 flags)
563{
564	// although there is no really good reason for it, we won't
565	// accept '/' in disk names (mkbfs does this, too - and since
566	// Tracker names mounted volumes like their name)
567	if (strchr(name, '/') != NULL)
568		return B_BAD_VALUE;
569
570	if (blockSize != 1024 && blockSize != 2048 && blockSize != 4096 && blockSize != 8192)
571		return B_BAD_VALUE;
572
573	DeviceOpener opener(device, O_RDWR);
574	if (opener.Device() < B_OK)
575		return B_BAD_VALUE;
576
577	fDevice = opener.Device();
578
579	uint32 deviceBlockSize;
580	off_t deviceSize;
581	if (opener.GetSize(&deviceSize, &deviceBlockSize) < B_OK)
582		return B_ERROR;
583
584	off_t numBlocks = deviceSize / blockSize;
585
586	// create valid superblock
587
588	fSuperBlock.Initialize(name, numBlocks, blockSize);
589
590	// initialize short hands to the superblock (to save byte swapping)
591	fBlockSize = fSuperBlock.BlockSize();
592	fBlockShift = fSuperBlock.BlockShift();
593	fAllocationGroupShift = fSuperBlock.AllocationGroupShift();
594
595	// since the allocator has not been initialized yet, we
596	// cannot use BlockAllocator::BitmapSize() here
597	fSuperBlock.log_blocks = ToBlockRun(AllocationGroups()
598		* fSuperBlock.BlocksPerAllocationGroup() + 1);
599	fSuperBlock.log_blocks.length = HOST_ENDIAN_TO_BFS_INT16(2048);
600		// ToDo: set the log size depending on the disk size
601	fSuperBlock.log_start = fSuperBlock.log_end = HOST_ENDIAN_TO_BFS_INT64(ToBlock(Log()));
602
603	// set the current log pointers, so that journaling will work correctly
604	fLogStart = fSuperBlock.LogStart();
605	fLogEnd = fSuperBlock.LogEnd();
606
607	if (!IsValidSuperBlock())
608		RETURN_ERROR(B_ERROR);
609
610	if (opener.InitCache(numBlocks) != B_OK)
611		return B_ERROR;
612
613	fJournal = new Journal(this);
614	if (fJournal == NULL || fJournal->InitCheck() < B_OK)
615		RETURN_ERROR(B_ERROR);
616
617	// ready to write data to disk
618
619	Transaction transaction(this, 0);
620
621	if (fBlockAllocator.InitializeAndClearBitmap(transaction) < B_OK)
622		RETURN_ERROR(B_ERROR);
623
624	off_t id;
625	status_t status = Inode::Create(&transaction, NULL, NULL,
626		S_DIRECTORY | 0755, 0, 0, &id, &fRootNode);
627	if (status < B_OK)
628		RETURN_ERROR(status);
629
630	fSuperBlock.root_dir = ToBlockRun(id);
631
632	if ((flags & VOLUME_NO_INDICES) == 0) {
633		// The indices root directory will be created automatically
634		// when the standard indices are created (or any other).
635		Index index(this);
636		status = index.Create(&transaction, "name", B_STRING_TYPE);
637		if (status < B_OK)
638			return status;
639
640		status = index.Create(&transaction, "last_modified", B_INT64_TYPE);
641		if (status < B_OK)
642			return status;
643
644		status = index.Create(&transaction, "size", B_INT64_TYPE);
645		if (status < B_OK)
646			return status;
647	}
648
649	WriteSuperBlock();
650	transaction.Done();
651
652	put_vnode(ID(), fRootNode->ID());
653	if (fIndicesNode != NULL)
654		put_vnode(ID(), fIndicesNode->ID());
655
656	kill_device_vnodes(ID());
657		// This call is only available in the userland fs_shell
658
659	Sync();
660	opener.RemoveCache(ALLOW_WRITES);
661	return B_OK;
662}
663#endif
664