1/*
2 * Copyright (c) 2000-2011 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29#include <sys/param.h>
30#include <sys/systm.h>
31#include <sys/buf.h>
32#include <sys/buf_internal.h>
33#include <sys/kernel.h>
34#include <sys/malloc.h>
35#include <sys/mount.h>
36#include <sys/vnode.h>
37
38
39#include "hfs.h"
40#include "hfs_cnode.h"
41#include "hfs_dbg.h"
42#include "hfs_endian.h"
43#include "hfs_btreeio.h"
44
45#include "hfscommon/headers/FileMgrInternal.h"
46#include "hfscommon/headers/BTreesPrivate.h"
47
48#define FORCESYNCBTREEWRITES 0
49
50/* From bsd/vfs/vfs_bio.c */
51extern int bdwrite_internal(struct buf *, int);
52
53static int ClearBTNodes(struct vnode *vp, long blksize, off_t offset, off_t amount);
54static int btree_journal_modify_block_end(struct hfsmount *hfsmp, struct buf *bp);
55
56void btree_swap_node(struct buf *bp, __unused void *arg);
57
58/*
59 * Return btree node size for given vnode.
60 *
61 * Returns:
62 * 	For btree vnode, returns btree node size.
63 * 	For non-btree vnodes, returns 0.
64 */
65u_int16_t get_btree_nodesize(struct vnode *vp)
66{
67	BTreeControlBlockPtr btree;
68	u_int16_t node_size = 0;
69
70	if (vnode_issystem(vp)) {
71		btree = (BTreeControlBlockPtr) VTOF(vp)->fcbBTCBPtr;
72		if (btree) {
73			node_size = btree->nodeSize;
74		}
75	}
76
77	return node_size;
78}
79
80OSStatus SetBTreeBlockSize(FileReference vp, ByteCount blockSize, __unused ItemCount minBlockCount)
81{
82	BTreeControlBlockPtr	bTreePtr;
83
84	DBG_ASSERT(vp != NULL);
85	DBG_ASSERT(blockSize >= kMinNodeSize);
86    if (blockSize > MAXBSIZE )
87        return (fsBTBadNodeSize);
88
89	bTreePtr = (BTreeControlBlockPtr)VTOF(vp)->fcbBTCBPtr;
90	bTreePtr->nodeSize = blockSize;
91
92    return (E_NONE);
93}
94
95
96OSStatus GetBTreeBlock(FileReference vp, u_int32_t blockNum, GetBlockOptions options, BlockDescriptor *block)
97{
98    OSStatus	 retval = E_NONE;
99    struct buf   *bp = NULL;
100	u_int8_t     allow_empty_node;
101
102	/* If the btree block is being read using hint, it is
103	 * fine for the swap code to find zeroed out nodes.
104	 */
105	if (options & kGetBlockHint) {
106			allow_empty_node = true;
107	} else {
108			allow_empty_node = false;
109	}
110
111    if (options & kGetEmptyBlock) {
112        daddr64_t blkno;
113        off_t offset;
114
115        offset = (daddr64_t)blockNum * (daddr64_t)block->blockSize;
116        bp = buf_getblk(vp, (daddr64_t)blockNum, block->blockSize, 0, 0, BLK_META);
117        if (bp &&
118            VNOP_BLOCKMAP(vp, offset, block->blockSize, &blkno, NULL, NULL, 0, NULL) == 0) {
119            buf_setblkno(bp, blkno);
120        }
121    } else {
122        retval = buf_meta_bread(vp, (daddr64_t)blockNum, block->blockSize, NOCRED, &bp);
123    }
124    if (bp == NULL)
125        retval = -1;	//XXX need better error
126
127    if (retval == E_NONE) {
128        block->blockHeader = bp;
129        block->buffer = (char *)buf_dataptr(bp);
130    	block->blockNum = buf_lblkno(bp);
131        block->blockReadFromDisk = (buf_fromcache(bp) == 0);	/* not found in cache ==> came from disk */
132
133		// XXXdbg
134		block->isModified = 0;
135
136        /* Check and endian swap B-Tree node (only if it's a valid block) */
137        if (!(options & kGetEmptyBlock)) {
138            /* This happens when we first open the b-tree, we might not have all the node data on hand */
139            if ((((BTNodeDescriptor *)block->buffer)->kind == kBTHeaderNode) &&
140                (((BTHeaderRec *)((char *)block->buffer + 14))->nodeSize != buf_count(bp)) &&
141                (SWAP_BE16 (((BTHeaderRec *)((char *)block->buffer + 14))->nodeSize) != buf_count(bp))) {
142
143                /*
144                 * Don't swap the node descriptor, record offsets, or other records.
145                 * This record will be invalidated and re-read with the correct node
146                 * size once the B-tree control block is set up with the node size
147                 * from the header record.
148                 */
149                retval = hfs_swap_BTNode (block, vp, kSwapBTNodeHeaderRecordOnly, allow_empty_node);
150
151			} else if (block->blockReadFromDisk) {
152            	/*
153            	 * The node was just read from disk, so always swap/check it.
154            	 * This is necessary on big endian since the test below won't trigger.
155            	 */
156                retval = hfs_swap_BTNode (block, vp, kSwapBTNodeBigToHost, allow_empty_node);
157            } else if (*((u_int16_t *)((char *)block->buffer + (block->blockSize - sizeof (u_int16_t)))) == 0x0e00) {
158				/*
159				 * The node was left in the cache in non-native order, so swap it.
160				 * This only happens on little endian, after the node is written
161				 * back to disk.
162				 */
163                retval = hfs_swap_BTNode (block, vp, kSwapBTNodeBigToHost, allow_empty_node);
164            }
165
166    		/*
167    		 * If we got an error, then the node is only partially swapped.
168    		 * We mark the buffer invalid so that the next attempt to get the
169    		 * node will read it and attempt to swap again, and will notice
170    		 * the error again.  If we didn't do this, the next attempt to get
171    		 * the node might use the partially swapped node as-is.
172    		 */
173            if (retval)
174				buf_markinvalid(bp);
175        }
176    }
177
178    if (retval) {
179    	if (bp)
180			buf_brelse(bp);
181        block->blockHeader = NULL;
182        block->buffer = NULL;
183    }
184
185    return (retval);
186}
187
188
189void ModifyBlockStart(FileReference vp, BlockDescPtr blockPtr)
190{
191	struct hfsmount	*hfsmp = VTOHFS(vp);
192    struct buf *bp = NULL;
193
194	if (hfsmp->jnl == NULL) {
195		return;
196	}
197
198    bp = (struct buf *) blockPtr->blockHeader;
199    if (bp == NULL) {
200		panic("hfs: ModifyBlockStart: null bp  for blockdescptr %p?!?\n", blockPtr);
201		return;
202    }
203
204	journal_modify_block_start(hfsmp->jnl, bp);
205	blockPtr->isModified = 1;
206}
207
208void
209btree_swap_node(struct buf *bp, __unused void *arg)
210{
211    //	struct hfsmount *hfsmp = (struct hfsmount *)arg;
212	int retval;
213    struct vnode *vp = buf_vnode(bp);
214    BlockDescriptor block;
215
216    /* Prepare the block pointer */
217    block.blockHeader = bp;
218    block.buffer = (char *)buf_dataptr(bp);
219    block.blockNum = buf_lblkno(bp);
220    /* not found in cache ==> came from disk */
221    block.blockReadFromDisk = (buf_fromcache(bp) == 0);
222    block.blockSize = buf_count(bp);
223
224    /* Swap the data now that this node is ready to go to disk.
225     * We allow swapping of zeroed out nodes here because we might
226     * be writing node whose last record just got deleted.
227     */
228    retval = hfs_swap_BTNode (&block, vp, kSwapBTNodeHostToBig, true);
229    if (retval)
230    	panic("hfs: btree_swap_node: about to write corrupt node!\n");
231}
232
233
234static int
235btree_journal_modify_block_end(struct hfsmount *hfsmp, struct buf *bp)
236{
237    return journal_modify_block_end(hfsmp->jnl, bp, btree_swap_node, hfsmp);
238}
239
240
241OSStatus ReleaseBTreeBlock(FileReference vp, BlockDescPtr blockPtr, ReleaseBlockOptions options)
242{
243    struct hfsmount	*hfsmp = VTOHFS(vp);
244    OSStatus	retval = E_NONE;
245    struct buf *bp = NULL;
246
247    bp = (struct buf *) blockPtr->blockHeader;
248
249    if (bp == NULL) {
250        retval = -1;
251        goto exit;
252    }
253
254    if (options & kTrashBlock) {
255                buf_markinvalid(bp);
256
257		if (hfsmp->jnl && (buf_flags(bp) & B_LOCKED)) {
258			journal_kill_block(hfsmp->jnl, bp);
259		} else {
260			buf_brelse(bp);	/* note: B-tree code will clear blockPtr->blockHeader and blockPtr->buffer */
261		}
262
263		/* Don't let anyone else try to use this bp, it's been consumed */
264		blockPtr->blockHeader = NULL;
265
266    } else {
267        if (options & kForceWriteBlock) {
268			if (hfsmp->jnl) {
269				if (blockPtr->isModified == 0) {
270					panic("hfs: releaseblock: modified is 0 but forcewrite set! bp %p\n", bp);
271				}
272
273				retval = btree_journal_modify_block_end(hfsmp, bp);
274				blockPtr->isModified = 0;
275			} else {
276				retval = VNOP_BWRITE(bp);
277			}
278
279			/* Don't let anyone else try to use this bp, it's been consumed */
280			blockPtr->blockHeader = NULL;
281
282        } else if (options & kMarkBlockDirty) {
283			struct timeval tv;
284			microuptime(&tv);
285            if ((options & kLockTransaction) && hfsmp->jnl == NULL) {
286                /*
287                 *
288                 * Set the B_LOCKED flag and unlock the buffer, causing buf_brelse to move
289                 * the buffer onto the LOCKED free list.  This is necessary, otherwise
290                 * getnewbuf() would try to reclaim the buffers using buf_bawrite, which
291                 * isn't going to work.
292                 *
293                 */
294                /* Don't hog all the buffers... */
295                if (count_lock_queue() > kMaxLockedMetaBuffers) {
296                     hfs_btsync(vp, HFS_SYNCTRANS);
297                     /* Rollback sync time to cause a sync on lock release... */
298                     (void) BTSetLastSync(VTOF(vp), tv.tv_sec - (kMaxSecsForFsync + 1));
299                }
300		buf_setflags(bp, B_LOCKED);
301            }
302
303            /*
304             * Delay-write this block.
305             * If the maximum delayed buffers has been exceeded then
306             * free up some buffers and fall back to an asynchronous write.
307             */
308			if (hfsmp->jnl) {
309				if (blockPtr->isModified == 0) {
310					panic("hfs: releaseblock: modified is 0 but markdirty set! bp %p\n", bp);
311				}
312				retval = btree_journal_modify_block_end(hfsmp, bp);
313				blockPtr->isModified = 0;
314			} else if (bdwrite_internal(bp, 1) != 0) {
315                hfs_btsync(vp, 0);
316                /* Rollback sync time to cause a sync on lock release... */
317                (void) BTSetLastSync(VTOF(vp), tv.tv_sec - (kMaxSecsForFsync + 1));
318
319                buf_clearflags(bp, B_LOCKED);
320                buf_bawrite(bp);
321            }
322
323            /* Don't let anyone else try to use this bp, it's been consumed */
324			blockPtr->blockHeader = NULL;
325
326        } else {
327			// check if we had previously called journal_modify_block_start()
328			// on this block and if so, abort it (which will call buf_brelse()).
329			if (hfsmp->jnl && blockPtr->isModified) {
330				// XXXdbg - I don't want to call modify_block_abort()
331				//          because I think it may be screwing up the
332				//          journal and blowing away a block that has
333				//          valid data in it.
334				//
335				//    journal_modify_block_abort(hfsmp->jnl, bp);
336				//panic("hfs: releaseblock called for 0x%x but mod_block_start previously called.\n", bp);
337				btree_journal_modify_block_end(hfsmp, bp);
338				blockPtr->isModified = 0;
339			} else {
340				buf_brelse(bp);	/* note: B-tree code will clear blockPtr->blockHeader and blockPtr->buffer */
341			}
342
343			/* Don't let anyone else try to use this bp, it's been consumed */
344			blockPtr->blockHeader = NULL;
345        }
346    }
347
348exit:
349    return (retval);
350}
351
352
353OSStatus ExtendBTreeFile(FileReference vp, FSSize minEOF, FSSize maxEOF)
354{
355#pragma unused (maxEOF)
356
357	OSStatus	retval = 0, ret = 0;
358	int64_t		actualBytesAdded, origSize;
359	u_int64_t	bytesToAdd;
360	u_int32_t	startAllocation;
361	u_int32_t	fileblocks;
362	BTreeInfoRec 	btInfo;
363	ExtendedVCB	*vcb;
364	FCB		*filePtr;
365    	struct proc 	*p = NULL;
366	int64_t 	trim = 0;
367	int  		lockflags = 0;
368
369	filePtr = GetFileControlBlock(vp);
370
371	if ( (off_t)minEOF > filePtr->fcbEOF )
372	{
373		bytesToAdd = minEOF - filePtr->fcbEOF;
374
375		if (bytesToAdd < filePtr->ff_clumpsize)
376			bytesToAdd = filePtr->ff_clumpsize;		//XXX why not always be a mutiple of clump size?
377	}
378	else
379	{
380		return -1;
381	}
382
383	vcb = VTOVCB(vp);
384
385	/*
386	 * The Extents B-tree can't have overflow extents. ExtendFileC will
387	 * return an error if an attempt is made to extend the Extents B-tree
388	 * when the resident extents are exhausted.
389	 */
390
391	/* Protect allocation bitmap and extents overflow file. */
392	lockflags = SFL_BITMAP;
393	if (VTOC(vp)->c_fileid != kHFSExtentsFileID)
394		lockflags |= SFL_EXTENTS;
395	lockflags = hfs_systemfile_lock(vcb, lockflags, HFS_EXCLUSIVE_LOCK);
396
397	(void) BTGetInformation(filePtr, 0, &btInfo);
398
399#if 0  // XXXdbg
400	/*
401	 * The b-tree code expects nodes to be contiguous. So when
402	 * the allocation block size is less than the b-tree node
403	 * size, we need to force disk allocations to be contiguous.
404	 */
405	if (vcb->blockSize >= btInfo.nodeSize) {
406		extendFlags = 0;
407	} else {
408		/* Ensure that all b-tree nodes are contiguous on disk */
409		extendFlags = kEFContigMask;
410	}
411#endif
412
413	origSize = filePtr->fcbEOF;
414	fileblocks = filePtr->ff_blocks;
415	startAllocation = vcb->nextAllocation;
416
417	// loop trying to get a contiguous chunk that's an integer multiple
418	// of the btree node size.  if we can't get a contiguous chunk that
419	// is at least the node size then we break out of the loop and let
420	// the error propagate back up.
421	while((off_t)bytesToAdd >= btInfo.nodeSize) {
422	    do {
423		retval = ExtendFileC(vcb, filePtr, bytesToAdd, 0,
424		                     kEFContigMask | kEFMetadataMask | kEFNoClumpMask,
425		                     (int64_t *)&actualBytesAdded);
426		if (retval == dskFulErr && actualBytesAdded == 0) {
427		    bytesToAdd >>= 1;
428		    if (bytesToAdd < btInfo.nodeSize) {
429			break;
430		    } else if ((bytesToAdd % btInfo.nodeSize) != 0) {
431			// make sure it's an integer multiple of the nodeSize
432			bytesToAdd -= (bytesToAdd % btInfo.nodeSize);
433		    }
434		}
435	    } while (retval == dskFulErr && actualBytesAdded == 0);
436
437	    if (retval == dskFulErr && actualBytesAdded == 0 && bytesToAdd <= btInfo.nodeSize) {
438		break;
439	    }
440
441	    filePtr->fcbEOF = (u_int64_t)filePtr->ff_blocks * (u_int64_t)vcb->blockSize;
442	    bytesToAdd = minEOF - filePtr->fcbEOF;
443	}
444
445	/*
446	 * If a new extent was added then move the roving allocator
447	 * reference forward by the current b-tree file size so
448	 * there's plenty of room to grow.
449	 */
450	if ((retval == 0) &&
451	    ((VCBTOHFS(vcb)->hfs_flags & HFS_METADATA_ZONE) == 0) &&
452	    (vcb->nextAllocation > startAllocation) &&
453	    ((vcb->nextAllocation + fileblocks) < vcb->allocLimit)) {
454		HFS_UPDATE_NEXT_ALLOCATION(vcb, vcb->nextAllocation + fileblocks);
455	}
456
457	filePtr->fcbEOF = (u_int64_t)filePtr->ff_blocks * (u_int64_t)vcb->blockSize;
458
459	// XXXdbg ExtendFileC() could have returned an error even though
460	// it grew the file to be big enough for our needs.  If this is
461	// the case, we don't care about retval so we blow it away.
462	//
463	if (filePtr->fcbEOF >= (off_t)minEOF && retval != 0) {
464		retval = 0;
465	}
466
467	// XXXdbg if the file grew but isn't large enough or isn't an
468	// even multiple of the nodeSize then trim things back.  if
469	// the file isn't large enough we trim back to the original
470	// size.  otherwise we trim back to be an even multiple of the
471	// btree node size.
472	//
473	if ((filePtr->fcbEOF < (off_t)minEOF) || ((filePtr->fcbEOF - origSize) % btInfo.nodeSize) != 0) {
474
475		if (filePtr->fcbEOF < (off_t)minEOF) {
476			retval = dskFulErr;
477
478			if (filePtr->fcbEOF < origSize) {
479				panic("hfs: btree file eof %lld less than orig size %lld!\n",
480					  filePtr->fcbEOF, origSize);
481			}
482
483			trim = filePtr->fcbEOF - origSize;
484		} else {
485			trim = ((filePtr->fcbEOF - origSize) % btInfo.nodeSize);
486		}
487
488		ret = TruncateFileC(vcb, filePtr, filePtr->fcbEOF - trim, 0, 0, FTOC(filePtr)->c_fileid, 0);
489		filePtr->fcbEOF = (u_int64_t)filePtr->ff_blocks * (u_int64_t)vcb->blockSize;
490
491		// XXXdbg - panic if the file didn't get trimmed back properly
492		if ((filePtr->fcbEOF % btInfo.nodeSize) != 0) {
493			panic("hfs: truncate file didn't! fcbEOF %lld nsize %d fcb %p\n",
494				  filePtr->fcbEOF, btInfo.nodeSize, filePtr);
495		}
496
497		if (ret) {
498			// XXXdbg - this probably doesn't need to be a panic()
499			panic("hfs: error truncating btree files (sz 0x%llx, trim %lld, ret %ld)\n",
500			      filePtr->fcbEOF, trim, (long)ret);
501			goto out;
502		}
503	}
504
505	if(VTOC(vp)->c_fileid != kHFSExtentsFileID) {
506		/*
507		 * Get any extents overflow b-tree changes to disk ASAP!
508		 */
509		(void) BTFlushPath(VTOF(vcb->extentsRefNum));
510		(void) hfs_fsync(vcb->extentsRefNum, MNT_WAIT, 0, p);
511	}
512	hfs_systemfile_unlock(vcb, lockflags);
513	lockflags = 0;
514
515	if ((filePtr->fcbEOF % btInfo.nodeSize) != 0) {
516		panic("hfs: extendbtree: fcb %p has eof 0x%llx not a multiple of 0x%x (trim %llx)\n",
517			  filePtr, filePtr->fcbEOF, btInfo.nodeSize, trim);
518	}
519
520	/*
521	 * Update the Alternate MDB or Alternate VolumeHeader
522	 */
523	if ((VTOC(vp)->c_fileid == kHFSExtentsFileID)	||
524	    (VTOC(vp)->c_fileid == kHFSCatalogFileID)	||
525	    (VTOC(vp)->c_fileid == kHFSAttributesFileID)
526	   ) {
527		VTOC(vp)->c_flag |= C_MODIFIED;
528		MarkVCBDirty( vcb );
529		ret = hfs_flushvolumeheader(VCBTOHFS(vcb), MNT_WAIT, HFS_ALTFLUSH);
530	} else {
531		VTOC(vp)->c_touch_chgtime = TRUE;
532		VTOC(vp)->c_touch_modtime = TRUE;
533		(void) hfs_update(vp, TRUE);
534	}
535
536	ret = ClearBTNodes(vp, btInfo.nodeSize, origSize, (filePtr->fcbEOF - origSize));
537out:
538	if (retval == 0)
539		retval = ret;
540
541	if (lockflags)
542		hfs_systemfile_unlock(vcb, lockflags);
543
544	return retval;
545}
546
547
548/*
549 * Clear out (zero) new b-tree nodes on disk.
550 */
551static int
552ClearBTNodes(struct vnode *vp, long blksize, off_t offset, off_t amount)
553{
554	struct hfsmount *hfsmp = VTOHFS(vp);
555	struct buf *bp = NULL;
556	daddr64_t blk;
557	daddr64_t blkcnt;
558
559	blk = offset / blksize;
560	blkcnt = amount / blksize;
561
562	while (blkcnt > 0) {
563		bp = buf_getblk(vp, blk, blksize, 0, 0, BLK_META);
564		if (bp == NULL)
565			continue;
566
567        // XXXdbg
568		if (hfsmp->jnl) {
569			// XXXdbg -- skipping this for now since it makes a transaction
570			//           become *way* too large
571		    //journal_modify_block_start(hfsmp->jnl, bp);
572		}
573		bzero((char *)buf_dataptr(bp), blksize);
574
575		buf_markaged(bp);
576
577        // XXXdbg
578		if (hfsmp->jnl) {
579			// XXXdbg -- skipping this for now since it makes a transaction
580			//           become *way* too large
581			//journal_modify_block_end(hfsmp->jnl, bp);
582
583			// XXXdbg - remove this once we decide what to do with the
584			//          writes to the journal
585			if ((blk % 32) == 0)
586			    VNOP_BWRITE(bp);
587			else
588			    buf_bawrite(bp);
589		} else {
590			/* wait/yield every 32 blocks so we don't hog all the buffers */
591			if ((blk % 32) == 0)
592				VNOP_BWRITE(bp);
593			else
594				buf_bawrite(bp);
595		}
596		--blkcnt;
597		++blk;
598	}
599
600	return (0);
601}
602
603
604extern char  hfs_attrname[];
605
606/*
607 * Create an HFS+ Attribute B-tree File.
608 *
609 * No global resources should be held.
610 */
611int
612hfs_create_attr_btree(struct hfsmount *hfsmp, u_int32_t nodesize, u_int32_t nodecnt)
613{
614	struct vnode* vp = NULLVP;
615	struct cat_desc cndesc;
616	struct cat_attr cnattr;
617	struct cat_fork cfork;
618	BlockDescriptor blkdesc;
619	BTNodeDescriptor  *ndp;
620	BTHeaderRec  *bthp;
621	BTreeControlBlockPtr btcb = NULL;
622	struct buf *bp = NULL;
623	void * buffer;
624	u_int8_t *bitmap;
625	u_int16_t *index;
626	u_int32_t node_num, num_map_nodes;
627	u_int32_t bytes_per_map_record;
628	u_int32_t temp;
629	u_int16_t  offset;
630	int intrans = 0;
631	int result;
632	int newvnode_flags = 0;
633
634again:
635	/*
636	 * Serialize creation using HFS_CREATING_BTREE flag.
637	 */
638	lck_mtx_lock(&hfsmp->hfs_mutex);
639	if (hfsmp->hfs_flags & HFS_CREATING_BTREE) {
640			/* Someone else beat us, wait for them to finish. */
641			(void) msleep(hfsmp->hfs_attribute_cp, &hfsmp->hfs_mutex,
642			              PDROP | PINOD, "hfs_create_attr_btree", 0);
643			if (hfsmp->hfs_attribute_vp) {
644				return (0);
645			}
646			goto again;
647	}
648	hfsmp->hfs_flags |= HFS_CREATING_BTREE;
649	lck_mtx_unlock(&hfsmp->hfs_mutex);
650
651	/* Check if were out of usable disk space. */
652	if ((hfs_freeblks(hfsmp, 1) == 0)) {
653		result = ENOSPC;
654		goto exit;
655	}
656
657	/*
658	 * Set up Attribute B-tree vnode
659	 * (this must be done before we start a transaction
660	 *  or take any system file locks)
661	 */
662	bzero(&cndesc, sizeof(cndesc));
663	cndesc.cd_parentcnid = kHFSRootParentID;
664	cndesc.cd_flags |= CD_ISMETA;
665	cndesc.cd_nameptr = (const u_int8_t *)hfs_attrname;
666	cndesc.cd_namelen = strlen(hfs_attrname);
667	cndesc.cd_cnid = kHFSAttributesFileID;
668
669	bzero(&cnattr, sizeof(cnattr));
670	cnattr.ca_linkcount = 1;
671	cnattr.ca_mode = S_IFREG;
672	cnattr.ca_fileid = cndesc.cd_cnid;
673
674	bzero(&cfork, sizeof(cfork));
675	cfork.cf_clump = nodesize * nodecnt;
676
677	result = hfs_getnewvnode(hfsmp, NULL, NULL, &cndesc, 0, &cnattr,
678							 &cfork, &vp, &newvnode_flags);
679	if (result) {
680		goto exit;
681	}
682	/*
683	 * Set up Attribute B-tree control block
684	 */
685	MALLOC(btcb, BTreeControlBlock *, sizeof(BTreeControlBlock), M_TEMP, M_WAITOK);
686        bzero(btcb, sizeof(BTreeControlBlock));
687
688	btcb->nodeSize          = nodesize;
689	btcb->maxKeyLength      = kHFSPlusAttrKeyMaximumLength;
690	btcb->btreeType         = 0xFF;
691	btcb->attributes        = kBTVariableIndexKeysMask | kBTBigKeysMask;
692	btcb->version           = kBTreeVersion;
693	btcb->writeCount        = 1;
694	btcb->flags             = 0;  /* kBTHeaderDirty */
695	btcb->fileRefNum        = vp;
696	btcb->getBlockProc      = GetBTreeBlock;
697	btcb->releaseBlockProc  = ReleaseBTreeBlock;
698	btcb->setEndOfForkProc  = ExtendBTreeFile;
699	btcb->keyCompareProc    = (KeyCompareProcPtr)hfs_attrkeycompare;
700	VTOF(vp)->fcbBTCBPtr    = btcb;
701
702	/*
703	 * Allocate some space
704	 */
705	if (hfs_start_transaction(hfsmp) != 0) {
706		result = EINVAL;
707		goto exit;
708	}
709	intrans = 1;
710
711	/* Note ExtendBTreeFile will acquire the necessary system file locks. */
712	result = ExtendBTreeFile(vp, nodesize, cfork.cf_clump);
713	if (result)
714		goto exit;
715
716	btcb->totalNodes = VTOF(vp)->ff_size / nodesize;
717
718	/*
719	 * Figure out how many map nodes we'll need.
720	 *
721	 * bytes_per_map_record = the number of bytes in the map record of a
722	 * map node.  Since that is the only record in the node, it is the size
723	 * of the node minus the node descriptor at the start, and two record
724	 * offsets at the end of the node.  The "- 2" is to round the size down
725	 * to a multiple of 4 bytes (since sizeof(BTNodeDescriptor) is not a
726	 * multiple of 4).
727	 *
728	 * The value "temp" here is the number of *bits* in the map record of
729	 * the header node.
730	 */
731	bytes_per_map_record = nodesize - sizeof(BTNodeDescriptor) - 2*sizeof(u_int16_t) - 2;
732	temp = 8 * (nodesize - sizeof(BTNodeDescriptor)
733			- sizeof(BTHeaderRec)
734			- kBTreeHeaderUserBytes
735			- 4 * sizeof(u_int16_t));
736	if (btcb->totalNodes > temp) {
737		num_map_nodes = howmany(btcb->totalNodes - temp, bytes_per_map_record * 8);
738	}
739	else {
740		num_map_nodes = 0;
741	}
742
743	btcb->freeNodes = btcb->totalNodes - 1 - num_map_nodes;
744
745	/*
746	 * Initialize the b-tree header on disk
747	 */
748	bp = buf_getblk(vp, 0, nodesize, 0, 0, BLK_META);
749	if (bp == NULL) {
750		result = EIO;
751		goto exit;
752	}
753
754	buffer = (void *)buf_dataptr(bp);
755	blkdesc.buffer = buffer;
756	blkdesc.blockHeader = (void *)bp;
757	blkdesc.blockReadFromDisk = 0;
758	blkdesc.isModified = 0;
759
760	ModifyBlockStart(vp, &blkdesc);
761
762	if (buf_size(bp) != nodesize)
763		panic("hfs_create_attr_btree: bad buffer size (%d)\n", buf_size(bp));
764
765	bzero(buffer, nodesize);
766	index = (u_int16_t *)buffer;
767
768	/* FILL IN THE NODE DESCRIPTOR:  */
769	ndp = (BTNodeDescriptor *)buffer;
770	if (num_map_nodes != 0)
771		ndp->fLink = 1;
772	ndp->kind = kBTHeaderNode;
773	ndp->numRecords = 3;
774	offset = sizeof(BTNodeDescriptor);
775	index[(nodesize / 2) - 1] = offset;
776
777	/* FILL IN THE HEADER RECORD:  */
778	bthp = (BTHeaderRec *)((u_int8_t *)buffer + offset);
779	bthp->nodeSize     = nodesize;
780	bthp->totalNodes   = btcb->totalNodes;
781	bthp->freeNodes    = btcb->freeNodes;
782	bthp->clumpSize    = cfork.cf_clump;
783	bthp->btreeType    = 0xFF;
784	bthp->attributes   = kBTVariableIndexKeysMask | kBTBigKeysMask;
785	bthp->maxKeyLength = kHFSPlusAttrKeyMaximumLength;
786	bthp->keyCompareType = kHFSBinaryCompare;
787	offset += sizeof(BTHeaderRec);
788	index[(nodesize / 2) - 2] = offset;
789
790	/* FILL IN THE USER RECORD:  */
791	offset += kBTreeHeaderUserBytes;
792	index[(nodesize / 2) - 3] = offset;
793
794	/* Mark the header node and map nodes in use in the map record.
795	 *
796	 * NOTE: Assumes that the header node's map record has at least
797	 * (num_map_nodes + 1) bits.
798	 */
799	bitmap = (u_int8_t *) buffer + offset;
800	temp = num_map_nodes + 1;	/* +1 for the header node */
801	while (temp >= 8) {
802		*(bitmap++) = 0xFF;
803		temp -= 8;
804	}
805	*bitmap = ~(0xFF >> temp);
806
807	offset += nodesize - sizeof(BTNodeDescriptor) - sizeof(BTHeaderRec)
808			   - kBTreeHeaderUserBytes - (4 * sizeof(int16_t));
809	index[(nodesize / 2) - 4] = offset;
810
811	if (hfsmp->jnl) {
812		result = btree_journal_modify_block_end(hfsmp, bp);
813	} else {
814		result = VNOP_BWRITE(bp);
815	}
816	if (result)
817		goto exit;
818
819	/* Create the map nodes: node numbers 1 .. num_map_nodes */
820	for (node_num=1; node_num <= num_map_nodes; ++node_num) {
821		bp = buf_getblk(vp, node_num, nodesize, 0, 0, BLK_META);
822		if (bp == NULL) {
823			result = EIO;
824			goto exit;
825		}
826		buffer = (void *)buf_dataptr(bp);
827		blkdesc.buffer = buffer;
828		blkdesc.blockHeader = (void *)bp;
829		blkdesc.blockReadFromDisk = 0;
830		blkdesc.isModified = 0;
831
832		ModifyBlockStart(vp, &blkdesc);
833
834		bzero(buffer, nodesize);
835		index = (u_int16_t *)buffer;
836
837		/* Fill in the node descriptor */
838		ndp = (BTNodeDescriptor *)buffer;
839		if (node_num != num_map_nodes)
840			ndp->fLink = node_num + 1;
841		ndp->kind = kBTMapNode;
842		ndp->numRecords = 1;
843		offset = sizeof(BTNodeDescriptor);
844		index[(nodesize / 2) - 1] = offset;
845
846
847		/* Fill in the map record's offset */
848		/* Note: We assume that the map record is all zeroes */
849		offset = sizeof(BTNodeDescriptor) + bytes_per_map_record;
850		index[(nodesize / 2) - 2] = offset;
851
852		if (hfsmp->jnl) {
853			result = btree_journal_modify_block_end(hfsmp, bp);
854		} else {
855			result = VNOP_BWRITE(bp);
856		}
857		if (result)
858			goto exit;
859	}
860
861	/* Update vp/cp for attribute btree */
862	lck_mtx_lock(&hfsmp->hfs_mutex);
863	hfsmp->hfs_attribute_cp = VTOC(vp);
864	hfsmp->hfs_attribute_vp = vp;
865	lck_mtx_unlock(&hfsmp->hfs_mutex);
866
867	(void) hfs_flushvolumeheader(hfsmp, MNT_WAIT, HFS_ALTFLUSH);
868
869	if (intrans) {
870		hfs_end_transaction(hfsmp);
871		intrans = 0;
872	}
873
874	/* Initialize the vnode for virtual attribute data file */
875	result = init_attrdata_vnode(hfsmp);
876	if (result) {
877		printf("hfs_create_attr_btree: init_attrdata_vnode() error=%d\n", result);
878	}
879
880exit:
881	if (vp) {
882		hfs_unlock(VTOC(vp));
883	}
884	if (result) {
885		if (btcb) {
886			FREE (btcb, M_TEMP);
887		}
888		if (vp) {
889			vnode_put(vp);
890		}
891		/* XXX need to give back blocks ? */
892	}
893	if (intrans) {
894		hfs_end_transaction(hfsmp);
895	}
896
897	/*
898	 * All done, clear HFS_CREATING_BTREE, and wake up any sleepers.
899	 */
900	lck_mtx_lock(&hfsmp->hfs_mutex);
901	hfsmp->hfs_flags &= ~HFS_CREATING_BTREE;
902	wakeup((caddr_t)hfsmp->hfs_attribute_cp);
903	lck_mtx_unlock(&hfsmp->hfs_mutex);
904
905	return (result);
906}
907
908