1/*
2 * Copyright (c) 2000-2011 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29#include <sys/param.h>
30#include <sys/systm.h>
31#include <sys/buf.h>
32#include <sys/buf_internal.h>
33#include <sys/kernel.h>
34#include <sys/malloc.h>
35#include <sys/mount.h>
36#include <sys/vnode.h>
37
38
39#include "hfs.h"
40#include "hfs_cnode.h"
41#include "hfs_dbg.h"
42#include "hfs_endian.h"
43#include "hfs_btreeio.h"
44
45#include "hfscommon/headers/FileMgrInternal.h"
46#include "hfscommon/headers/BTreesPrivate.h"
47
48#define FORCESYNCBTREEWRITES 0
49
50/* From bsd/vfs/vfs_bio.c */
51extern int bdwrite_internal(struct buf *, int);
52
53static int ClearBTNodes(struct vnode *vp, int blksize, off_t offset, off_t amount);
54static int btree_journal_modify_block_end(struct hfsmount *hfsmp, struct buf *bp);
55
56void btree_swap_node(struct buf *bp, __unused void *arg);
57
58/*
59 * Return btree node size for given vnode.
60 *
61 * Returns:
62 * 	For btree vnode, returns btree node size.
63 * 	For non-btree vnodes, returns 0.
64 */
65u_int16_t get_btree_nodesize(struct vnode *vp)
66{
67	BTreeControlBlockPtr btree;
68	u_int16_t node_size = 0;
69
70	if (vnode_issystem(vp)) {
71		btree = (BTreeControlBlockPtr) VTOF(vp)->fcbBTCBPtr;
72		if (btree) {
73			node_size = btree->nodeSize;
74		}
75	}
76
77	return node_size;
78}
79
80OSStatus SetBTreeBlockSize(FileReference vp, ByteCount blockSize, __unused ItemCount minBlockCount)
81{
82	BTreeControlBlockPtr	bTreePtr;
83
84	DBG_ASSERT(vp != NULL);
85	DBG_ASSERT(blockSize >= kMinNodeSize);
86    if (blockSize > MAXBSIZE )
87        return (fsBTBadNodeSize);
88
89	bTreePtr = (BTreeControlBlockPtr)VTOF(vp)->fcbBTCBPtr;
90	bTreePtr->nodeSize = blockSize;
91
92    return (E_NONE);
93}
94
95
96OSStatus GetBTreeBlock(FileReference vp, u_int32_t blockNum, GetBlockOptions options, BlockDescriptor *block)
97{
98    OSStatus	 retval = E_NONE;
99    struct buf   *bp = NULL;
100	u_int8_t     allow_empty_node;
101
102	/* If the btree block is being read using hint, it is
103	 * fine for the swap code to find zeroed out nodes.
104	 */
105	if (options & kGetBlockHint) {
106			allow_empty_node = true;
107	} else {
108			allow_empty_node = false;
109	}
110
111    if (options & kGetEmptyBlock) {
112        daddr64_t blkno;
113        off_t offset;
114
115        offset = (daddr64_t)blockNum * (daddr64_t)block->blockSize;
116        bp = buf_getblk(vp, (daddr64_t)blockNum, block->blockSize, 0, 0, BLK_META);
117        if (bp &&
118            VNOP_BLOCKMAP(vp, offset, block->blockSize, &blkno, NULL, NULL, 0, NULL) == 0) {
119            buf_setblkno(bp, blkno);
120        }
121    } else {
122        retval = buf_meta_bread(vp, (daddr64_t)blockNum, block->blockSize, NOCRED, &bp);
123    }
124    if (bp == NULL)
125        retval = -1;	//XXX need better error
126
127    if (retval == E_NONE) {
128        block->blockHeader = bp;
129        block->buffer = (char *)buf_dataptr(bp);
130    	block->blockNum = buf_lblkno(bp);
131        block->blockReadFromDisk = (buf_fromcache(bp) == 0);	/* not found in cache ==> came from disk */
132
133		// XXXdbg
134		block->isModified = 0;
135
136		/* Check and endian swap B-Tree node (only if it's a valid block) */
137		if (!(options & kGetEmptyBlock)) {
138
139			/* This happens when we first open the b-tree, we might not have all the node data on hand */
140			if ((((BTNodeDescriptor *)block->buffer)->kind == kBTHeaderNode) &&
141					(((BTHeaderRec *)((char *)block->buffer + 14))->nodeSize != buf_count(bp)) &&
142					(SWAP_BE16 (((BTHeaderRec *)((char *)block->buffer + 14))->nodeSize) != buf_count(bp))) {
143
144				/*
145				 * Don't swap the node descriptor, record offsets, or other records.
146				 * This record will be invalidated and re-read with the correct node
147				 * size once the B-tree control block is set up with the node size
148				 * from the header record.
149				 */
150				retval = hfs_swap_BTNode (block, vp, kSwapBTNodeHeaderRecordOnly, allow_empty_node);
151
152			} else {
153				/*
154				 * In this case, we have enough data in-hand to do basic validation
155				 * on the B-Tree node.
156				 */
157				if (block->blockReadFromDisk) {
158					/*
159					 * The node was just read from disk, so always swap/check it.
160					 * This is necessary on big endian since the test below won't trigger.
161					 */
162					retval = hfs_swap_BTNode (block, vp, kSwapBTNodeBigToHost, allow_empty_node);
163				}
164				else {
165					/*
166					 * Block wasn't read from disk; it was found in the cache.
167					 */
168					if (*((u_int16_t *)((char *)block->buffer + (block->blockSize - sizeof (u_int16_t)))) == 0x0e00) {
169						/*
170						 * The node was left in the cache in non-native order, so swap it.
171						 * This only happens on little endian, after the node is written
172						 * back to disk.
173						 */
174						retval = hfs_swap_BTNode (block, vp, kSwapBTNodeBigToHost, allow_empty_node);
175					}
176					else if (*((u_int16_t *)((char *)block->buffer + (block->blockSize - sizeof (u_int16_t)))) == 0x000e) {
177						/*
178						 * The node was in-cache in native-endianness.  We don't need to do
179						 * anything here, because the node is ready to use.  Set retval == 0.
180						 */
181						retval = 0;
182					}
183					/*
184					 * If the node doesn't have hex 14 (0xe) in the last two bytes of the buffer,
185					 * it doesn't necessarily mean that this is a bad node.  Zeroed nodes that are
186					 * marked as unused in the b-tree map node would be OK and not have valid content.
187					 */
188				}
189			}
190
191			/*
192			 * If we got an error, then the node is only partially swapped.
193			 * We mark the buffer invalid so that the next attempt to get the
194			 * node will read it and attempt to swap again, and will notice
195			 * the error again.  If we didn't do this, the next attempt to get
196			 * the node might use the partially swapped node as-is.
197			 */
198			if (retval)
199				buf_markinvalid(bp);
200		}
201	}
202
203    if (retval) {
204    	if (bp)
205			buf_brelse(bp);
206        block->blockHeader = NULL;
207        block->buffer = NULL;
208    }
209
210    return (retval);
211}
212
213
214void ModifyBlockStart(FileReference vp, BlockDescPtr blockPtr)
215{
216	struct hfsmount	*hfsmp = VTOHFS(vp);
217    struct buf *bp = NULL;
218
219	if (hfsmp->jnl == NULL) {
220		return;
221	}
222
223    bp = (struct buf *) blockPtr->blockHeader;
224    if (bp == NULL) {
225		panic("hfs: ModifyBlockStart: null bp  for blockdescptr %p?!?\n", blockPtr);
226		return;
227    }
228
229	journal_modify_block_start(hfsmp->jnl, bp);
230	blockPtr->isModified = 1;
231}
232
233void
234btree_swap_node(struct buf *bp, __unused void *arg)
235{
236    //	struct hfsmount *hfsmp = (struct hfsmount *)arg;
237	int retval;
238    struct vnode *vp = buf_vnode(bp);
239    BlockDescriptor block;
240
241    /* Prepare the block pointer */
242    block.blockHeader = bp;
243    block.buffer = (char *)buf_dataptr(bp);
244    block.blockNum = buf_lblkno(bp);
245    /* not found in cache ==> came from disk */
246    block.blockReadFromDisk = (buf_fromcache(bp) == 0);
247    block.blockSize = buf_count(bp);
248
249    /* Swap the data now that this node is ready to go to disk.
250     * We allow swapping of zeroed out nodes here because we might
251     * be writing node whose last record just got deleted.
252     */
253    retval = hfs_swap_BTNode (&block, vp, kSwapBTNodeHostToBig, true);
254    if (retval)
255    	panic("hfs: btree_swap_node: about to write corrupt node!\n");
256}
257
258
259static int
260btree_journal_modify_block_end(struct hfsmount *hfsmp, struct buf *bp)
261{
262    return journal_modify_block_end(hfsmp->jnl, bp, btree_swap_node, hfsmp);
263}
264
265
266OSStatus ReleaseBTreeBlock(FileReference vp, BlockDescPtr blockPtr, ReleaseBlockOptions options)
267{
268    struct hfsmount	*hfsmp = VTOHFS(vp);
269    OSStatus	retval = E_NONE;
270    struct buf *bp = NULL;
271
272    bp = (struct buf *) blockPtr->blockHeader;
273
274    if (bp == NULL) {
275        retval = -1;
276        goto exit;
277    }
278
279    if (options & kTrashBlock) {
280                buf_markinvalid(bp);
281
282		if (hfsmp->jnl && (buf_flags(bp) & B_LOCKED)) {
283			journal_kill_block(hfsmp->jnl, bp);
284		} else {
285			buf_brelse(bp);	/* note: B-tree code will clear blockPtr->blockHeader and blockPtr->buffer */
286		}
287
288		/* Don't let anyone else try to use this bp, it's been consumed */
289		blockPtr->blockHeader = NULL;
290
291    } else {
292        if (options & kForceWriteBlock) {
293			if (hfsmp->jnl) {
294				if (blockPtr->isModified == 0) {
295					panic("hfs: releaseblock: modified is 0 but forcewrite set! bp %p\n", bp);
296				}
297
298				retval = btree_journal_modify_block_end(hfsmp, bp);
299				blockPtr->isModified = 0;
300			} else {
301				retval = VNOP_BWRITE(bp);
302			}
303
304			/* Don't let anyone else try to use this bp, it's been consumed */
305			blockPtr->blockHeader = NULL;
306
307        } else if (options & kMarkBlockDirty) {
308			struct timeval tv;
309			microuptime(&tv);
310            if ((options & kLockTransaction) && hfsmp->jnl == NULL) {
311                /*
312                 *
313                 * Set the B_LOCKED flag and unlock the buffer, causing buf_brelse to move
314                 * the buffer onto the LOCKED free list.  This is necessary, otherwise
315                 * getnewbuf() would try to reclaim the buffers using buf_bawrite, which
316                 * isn't going to work.
317                 *
318                 */
319                /* Don't hog all the buffers... */
320                if (count_lock_queue() > kMaxLockedMetaBuffers) {
321                     hfs_btsync(vp, HFS_SYNCTRANS);
322                     /* Rollback sync time to cause a sync on lock release... */
323                     (void) BTSetLastSync(VTOF(vp), tv.tv_sec - (kMaxSecsForFsync + 1));
324                }
325		buf_setflags(bp, B_LOCKED);
326            }
327
328            /*
329             * Delay-write this block.
330             * If the maximum delayed buffers has been exceeded then
331             * free up some buffers and fall back to an asynchronous write.
332             */
333			if (hfsmp->jnl) {
334				if (blockPtr->isModified == 0) {
335					panic("hfs: releaseblock: modified is 0 but markdirty set! bp %p\n", bp);
336				}
337				retval = btree_journal_modify_block_end(hfsmp, bp);
338				blockPtr->isModified = 0;
339			} else if (bdwrite_internal(bp, 1) != 0) {
340                hfs_btsync(vp, 0);
341                /* Rollback sync time to cause a sync on lock release... */
342                (void) BTSetLastSync(VTOF(vp), tv.tv_sec - (kMaxSecsForFsync + 1));
343
344                buf_clearflags(bp, B_LOCKED);
345                buf_bawrite(bp);
346            }
347
348            /* Don't let anyone else try to use this bp, it's been consumed */
349			blockPtr->blockHeader = NULL;
350
351        } else {
352			// check if we had previously called journal_modify_block_start()
353			// on this block and if so, abort it (which will call buf_brelse()).
354			if (hfsmp->jnl && blockPtr->isModified) {
355				// XXXdbg - I don't want to call modify_block_abort()
356				//          because I think it may be screwing up the
357				//          journal and blowing away a block that has
358				//          valid data in it.
359				//
360				//    journal_modify_block_abort(hfsmp->jnl, bp);
361				//panic("hfs: releaseblock called for 0x%x but mod_block_start previously called.\n", bp);
362				btree_journal_modify_block_end(hfsmp, bp);
363				blockPtr->isModified = 0;
364			} else {
365				buf_brelse(bp);	/* note: B-tree code will clear blockPtr->blockHeader and blockPtr->buffer */
366			}
367
368			/* Don't let anyone else try to use this bp, it's been consumed */
369			blockPtr->blockHeader = NULL;
370        }
371    }
372
373exit:
374    return (retval);
375}
376
377
378OSStatus ExtendBTreeFile(FileReference vp, FSSize minEOF, FSSize maxEOF)
379{
380#pragma unused (maxEOF)
381
382	OSStatus	retval = 0, ret = 0;
383	int64_t		actualBytesAdded, origSize;
384	u_int64_t	bytesToAdd;
385	u_int32_t	startAllocation;
386	u_int32_t	fileblocks;
387	BTreeInfoRec 	btInfo;
388	ExtendedVCB	*vcb;
389	FCB		*filePtr;
390    	struct proc 	*p = NULL;
391	int64_t 	trim = 0;
392	int  		lockflags = 0;
393
394	filePtr = GetFileControlBlock(vp);
395
396	if ( (off_t)minEOF > filePtr->fcbEOF )
397	{
398		bytesToAdd = minEOF - filePtr->fcbEOF;
399
400		if (bytesToAdd < filePtr->ff_clumpsize)
401			bytesToAdd = filePtr->ff_clumpsize;		//XXX why not always be a mutiple of clump size?
402	}
403	else
404	{
405		return -1;
406	}
407
408	vcb = VTOVCB(vp);
409
410	/*
411	 * The Extents B-tree can't have overflow extents. ExtendFileC will
412	 * return an error if an attempt is made to extend the Extents B-tree
413	 * when the resident extents are exhausted.
414	 */
415
416	/* Protect allocation bitmap and extents overflow file. */
417	lockflags = SFL_BITMAP;
418	if (VTOC(vp)->c_fileid != kHFSExtentsFileID)
419		lockflags |= SFL_EXTENTS;
420	lockflags = hfs_systemfile_lock(vcb, lockflags, HFS_EXCLUSIVE_LOCK);
421
422	(void) BTGetInformation(filePtr, 0, &btInfo);
423
424#if 0  // XXXdbg
425	/*
426	 * The b-tree code expects nodes to be contiguous. So when
427	 * the allocation block size is less than the b-tree node
428	 * size, we need to force disk allocations to be contiguous.
429	 */
430	if (vcb->blockSize >= btInfo.nodeSize) {
431		extendFlags = 0;
432	} else {
433		/* Ensure that all b-tree nodes are contiguous on disk */
434		extendFlags = kEFContigMask;
435	}
436#endif
437
438	origSize = filePtr->fcbEOF;
439	fileblocks = filePtr->ff_blocks;
440	startAllocation = vcb->nextAllocation;
441
442	// loop trying to get a contiguous chunk that's an integer multiple
443	// of the btree node size.  if we can't get a contiguous chunk that
444	// is at least the node size then we break out of the loop and let
445	// the error propagate back up.
446	while((off_t)bytesToAdd >= btInfo.nodeSize) {
447	    do {
448		retval = ExtendFileC(vcb, filePtr, bytesToAdd, 0,
449		                     kEFContigMask | kEFMetadataMask | kEFNoClumpMask,
450		                     (int64_t *)&actualBytesAdded);
451		if (retval == dskFulErr && actualBytesAdded == 0) {
452		    bytesToAdd >>= 1;
453		    if (bytesToAdd < btInfo.nodeSize) {
454			break;
455		    } else if ((bytesToAdd % btInfo.nodeSize) != 0) {
456			// make sure it's an integer multiple of the nodeSize
457			bytesToAdd -= (bytesToAdd % btInfo.nodeSize);
458		    }
459		}
460	    } while (retval == dskFulErr && actualBytesAdded == 0);
461
462	    if (retval == dskFulErr && actualBytesAdded == 0 && bytesToAdd <= btInfo.nodeSize) {
463		break;
464	    }
465
466	    filePtr->fcbEOF = (u_int64_t)filePtr->ff_blocks * (u_int64_t)vcb->blockSize;
467	    bytesToAdd = minEOF - filePtr->fcbEOF;
468	}
469
470	/*
471	 * If a new extent was added then move the roving allocator
472	 * reference forward by the current b-tree file size so
473	 * there's plenty of room to grow.
474	 */
475	if ((retval == 0) &&
476	    ((VCBTOHFS(vcb)->hfs_flags & HFS_METADATA_ZONE) == 0) &&
477	    (vcb->nextAllocation > startAllocation) &&
478	    ((vcb->nextAllocation + fileblocks) < vcb->allocLimit)) {
479		HFS_UPDATE_NEXT_ALLOCATION(vcb, vcb->nextAllocation + fileblocks);
480	}
481
482	filePtr->fcbEOF = (u_int64_t)filePtr->ff_blocks * (u_int64_t)vcb->blockSize;
483
484	// XXXdbg ExtendFileC() could have returned an error even though
485	// it grew the file to be big enough for our needs.  If this is
486	// the case, we don't care about retval so we blow it away.
487	//
488	if (filePtr->fcbEOF >= (off_t)minEOF && retval != 0) {
489		retval = 0;
490	}
491
492	// XXXdbg if the file grew but isn't large enough or isn't an
493	// even multiple of the nodeSize then trim things back.  if
494	// the file isn't large enough we trim back to the original
495	// size.  otherwise we trim back to be an even multiple of the
496	// btree node size.
497	//
498	if ((filePtr->fcbEOF < (off_t)minEOF) || ((filePtr->fcbEOF - origSize) % btInfo.nodeSize) != 0) {
499
500		if (filePtr->fcbEOF < (off_t)minEOF) {
501			retval = dskFulErr;
502
503			if (filePtr->fcbEOF < origSize) {
504				panic("hfs: btree file eof %lld less than orig size %lld!\n",
505					  filePtr->fcbEOF, origSize);
506			}
507
508			trim = filePtr->fcbEOF - origSize;
509		} else {
510			trim = ((filePtr->fcbEOF - origSize) % btInfo.nodeSize);
511		}
512
513		ret = TruncateFileC(vcb, filePtr, filePtr->fcbEOF - trim, 0, 0, FTOC(filePtr)->c_fileid, 0);
514		filePtr->fcbEOF = (u_int64_t)filePtr->ff_blocks * (u_int64_t)vcb->blockSize;
515
516		// XXXdbg - panic if the file didn't get trimmed back properly
517		if ((filePtr->fcbEOF % btInfo.nodeSize) != 0) {
518			panic("hfs: truncate file didn't! fcbEOF %lld nsize %d fcb %p\n",
519				  filePtr->fcbEOF, btInfo.nodeSize, filePtr);
520		}
521
522		if (ret) {
523			// XXXdbg - this probably doesn't need to be a panic()
524			panic("hfs: error truncating btree files (sz 0x%llx, trim %lld, ret %ld)\n",
525			      filePtr->fcbEOF, trim, (long)ret);
526			goto out;
527		}
528	}
529
530	if(VTOC(vp)->c_fileid != kHFSExtentsFileID) {
531		/*
532		 * Get any extents overflow b-tree changes to disk ASAP!
533		 */
534		(void) BTFlushPath(VTOF(vcb->extentsRefNum));
535		(void) hfs_fsync(vcb->extentsRefNum, MNT_WAIT, 0, p);
536	}
537	hfs_systemfile_unlock(vcb, lockflags);
538	lockflags = 0;
539
540	if ((filePtr->fcbEOF % btInfo.nodeSize) != 0) {
541		panic("hfs: extendbtree: fcb %p has eof 0x%llx not a multiple of 0x%x (trim %llx)\n",
542			  filePtr, filePtr->fcbEOF, btInfo.nodeSize, trim);
543	}
544
545	/*
546	 * Update the Alternate MDB or Alternate VolumeHeader
547	 */
548	if ((VTOC(vp)->c_fileid == kHFSExtentsFileID)	||
549	    (VTOC(vp)->c_fileid == kHFSCatalogFileID)	||
550	    (VTOC(vp)->c_fileid == kHFSAttributesFileID)
551	   ) {
552		VTOC(vp)->c_flag |= C_MODIFIED;
553		MarkVCBDirty( vcb );
554		ret = hfs_flushvolumeheader(VCBTOHFS(vcb), MNT_WAIT, HFS_ALTFLUSH);
555	} else {
556		VTOC(vp)->c_touch_chgtime = TRUE;
557		VTOC(vp)->c_touch_modtime = TRUE;
558		(void) hfs_update(vp, TRUE);
559	}
560
561	ret = ClearBTNodes(vp, btInfo.nodeSize, origSize, (filePtr->fcbEOF - origSize));
562out:
563	if (retval == 0)
564		retval = ret;
565
566	if (lockflags)
567		hfs_systemfile_unlock(vcb, lockflags);
568
569	return retval;
570}
571
572
573/*
574 * Clear out (zero) new b-tree nodes on disk.
575 */
576static int
577ClearBTNodes(struct vnode *vp, int blksize, off_t offset, off_t amount)
578{
579	struct hfsmount *hfsmp = VTOHFS(vp);
580	struct buf *bp = NULL;
581	daddr64_t blk;
582	daddr64_t blkcnt;
583
584	blk = offset / blksize;
585	blkcnt = amount / blksize;
586
587	while (blkcnt > 0) {
588		bp = buf_getblk(vp, blk, blksize, 0, 0, BLK_META);
589		if (bp == NULL)
590			continue;
591
592        // XXXdbg
593		if (hfsmp->jnl) {
594			// XXXdbg -- skipping this for now since it makes a transaction
595			//           become *way* too large
596		    //journal_modify_block_start(hfsmp->jnl, bp);
597		}
598		bzero((char *)buf_dataptr(bp), blksize);
599
600		buf_markaged(bp);
601
602        // XXXdbg
603		if (hfsmp->jnl) {
604			// XXXdbg -- skipping this for now since it makes a transaction
605			//           become *way* too large
606			//journal_modify_block_end(hfsmp->jnl, bp);
607
608			// XXXdbg - remove this once we decide what to do with the
609			//          writes to the journal
610			if ((blk % 32) == 0)
611			    VNOP_BWRITE(bp);
612			else
613			    buf_bawrite(bp);
614		} else {
615			/* wait/yield every 32 blocks so we don't hog all the buffers */
616			if ((blk % 32) == 0)
617				VNOP_BWRITE(bp);
618			else
619				buf_bawrite(bp);
620		}
621		--blkcnt;
622		++blk;
623	}
624
625	return (0);
626}
627
628
629extern char  hfs_attrname[];
630
631/*
632 * Create an HFS+ Attribute B-tree File.
633 *
634 * No global resources should be held.
635 */
636int
637hfs_create_attr_btree(struct hfsmount *hfsmp, u_int32_t nodesize, u_int32_t nodecnt)
638{
639	struct vnode* vp = NULLVP;
640	struct cat_desc cndesc;
641	struct cat_attr cnattr;
642	struct cat_fork cfork;
643	BlockDescriptor blkdesc;
644	BTNodeDescriptor  *ndp;
645	BTHeaderRec  *bthp;
646	BTreeControlBlockPtr btcb = NULL;
647	struct buf *bp = NULL;
648	void * buffer;
649	u_int8_t *bitmap;
650	u_int16_t *index;
651	u_int32_t node_num, num_map_nodes;
652	u_int32_t bytes_per_map_record;
653	u_int32_t temp;
654	u_int16_t  offset;
655	int intrans = 0;
656	int result;
657	int newvnode_flags = 0;
658
659again:
660	/*
661	 * Serialize creation using HFS_CREATING_BTREE flag.
662	 */
663	hfs_lock_mount (hfsmp);
664	if (hfsmp->hfs_flags & HFS_CREATING_BTREE) {
665			/* Someone else beat us, wait for them to finish. */
666			(void) msleep(&hfsmp->hfs_attribute_cp, &hfsmp->hfs_mutex,
667			              PDROP | PINOD, "hfs_create_attr_btree", 0);
668			if (hfsmp->hfs_attribute_vp) {
669				return (0);
670			}
671			goto again;
672	}
673	hfsmp->hfs_flags |= HFS_CREATING_BTREE;
674	hfs_unlock_mount (hfsmp);
675
676	/* Check if were out of usable disk space. */
677	if ((hfs_freeblks(hfsmp, 1) == 0)) {
678		result = ENOSPC;
679		goto exit;
680	}
681
682	/*
683	 * Set up Attribute B-tree vnode
684	 * (this must be done before we start a transaction
685	 *  or take any system file locks)
686	 */
687	bzero(&cndesc, sizeof(cndesc));
688	cndesc.cd_parentcnid = kHFSRootParentID;
689	cndesc.cd_flags |= CD_ISMETA;
690	cndesc.cd_nameptr = (const u_int8_t *)hfs_attrname;
691	cndesc.cd_namelen = strlen(hfs_attrname);
692	cndesc.cd_cnid = kHFSAttributesFileID;
693
694	bzero(&cnattr, sizeof(cnattr));
695	cnattr.ca_linkcount = 1;
696	cnattr.ca_mode = S_IFREG;
697	cnattr.ca_fileid = cndesc.cd_cnid;
698
699	bzero(&cfork, sizeof(cfork));
700	cfork.cf_clump = nodesize * nodecnt;
701
702	result = hfs_getnewvnode(hfsmp, NULL, NULL, &cndesc, 0, &cnattr,
703							 &cfork, &vp, &newvnode_flags);
704	if (result) {
705		goto exit;
706	}
707	/*
708	 * Set up Attribute B-tree control block
709	 */
710	MALLOC(btcb, BTreeControlBlock *, sizeof(BTreeControlBlock), M_TEMP, M_WAITOK);
711        bzero(btcb, sizeof(BTreeControlBlock));
712
713	btcb->nodeSize          = nodesize;
714	btcb->maxKeyLength      = kHFSPlusAttrKeyMaximumLength;
715	btcb->btreeType         = 0xFF;
716	btcb->attributes        = kBTVariableIndexKeysMask | kBTBigKeysMask;
717	btcb->version           = kBTreeVersion;
718	btcb->writeCount        = 1;
719	btcb->flags             = 0;  /* kBTHeaderDirty */
720	btcb->fileRefNum        = vp;
721	btcb->getBlockProc      = GetBTreeBlock;
722	btcb->releaseBlockProc  = ReleaseBTreeBlock;
723	btcb->setEndOfForkProc  = ExtendBTreeFile;
724	btcb->keyCompareProc    = (KeyCompareProcPtr)hfs_attrkeycompare;
725	VTOF(vp)->fcbBTCBPtr    = btcb;
726
727	/*
728	 * Allocate some space
729	 */
730	if (hfs_start_transaction(hfsmp) != 0) {
731		result = EINVAL;
732		goto exit;
733	}
734	intrans = 1;
735
736	/* Note ExtendBTreeFile will acquire the necessary system file locks. */
737	result = ExtendBTreeFile(vp, nodesize, cfork.cf_clump);
738	if (result)
739		goto exit;
740
741	btcb->totalNodes = VTOF(vp)->ff_size / nodesize;
742
743	/*
744	 * Figure out how many map nodes we'll need.
745	 *
746	 * bytes_per_map_record = the number of bytes in the map record of a
747	 * map node.  Since that is the only record in the node, it is the size
748	 * of the node minus the node descriptor at the start, and two record
749	 * offsets at the end of the node.  The "- 2" is to round the size down
750	 * to a multiple of 4 bytes (since sizeof(BTNodeDescriptor) is not a
751	 * multiple of 4).
752	 *
753	 * The value "temp" here is the number of *bits* in the map record of
754	 * the header node.
755	 */
756	bytes_per_map_record = nodesize - sizeof(BTNodeDescriptor) - 2*sizeof(u_int16_t) - 2;
757	temp = 8 * (nodesize - sizeof(BTNodeDescriptor)
758			- sizeof(BTHeaderRec)
759			- kBTreeHeaderUserBytes
760			- 4 * sizeof(u_int16_t));
761	if (btcb->totalNodes > temp) {
762		num_map_nodes = howmany(btcb->totalNodes - temp, bytes_per_map_record * 8);
763	}
764	else {
765		num_map_nodes = 0;
766	}
767
768	btcb->freeNodes = btcb->totalNodes - 1 - num_map_nodes;
769
770	/*
771	 * Initialize the b-tree header on disk
772	 */
773	bp = buf_getblk(vp, 0, nodesize, 0, 0, BLK_META);
774	if (bp == NULL) {
775		result = EIO;
776		goto exit;
777	}
778
779	buffer = (void *)buf_dataptr(bp);
780	blkdesc.buffer = buffer;
781	blkdesc.blockHeader = (void *)bp;
782	blkdesc.blockReadFromDisk = 0;
783	blkdesc.isModified = 0;
784
785	ModifyBlockStart(vp, &blkdesc);
786
787	if (buf_size(bp) != nodesize)
788		panic("hfs_create_attr_btree: bad buffer size (%d)\n", buf_size(bp));
789
790	bzero(buffer, nodesize);
791	index = (u_int16_t *)buffer;
792
793	/* FILL IN THE NODE DESCRIPTOR:  */
794	ndp = (BTNodeDescriptor *)buffer;
795	if (num_map_nodes != 0)
796		ndp->fLink = 1;
797	ndp->kind = kBTHeaderNode;
798	ndp->numRecords = 3;
799	offset = sizeof(BTNodeDescriptor);
800	index[(nodesize / 2) - 1] = offset;
801
802	/* FILL IN THE HEADER RECORD:  */
803	bthp = (BTHeaderRec *)((u_int8_t *)buffer + offset);
804	bthp->nodeSize     = nodesize;
805	bthp->totalNodes   = btcb->totalNodes;
806	bthp->freeNodes    = btcb->freeNodes;
807	bthp->clumpSize    = cfork.cf_clump;
808	bthp->btreeType    = 0xFF;
809	bthp->attributes   = kBTVariableIndexKeysMask | kBTBigKeysMask;
810	bthp->maxKeyLength = kHFSPlusAttrKeyMaximumLength;
811	bthp->keyCompareType = kHFSBinaryCompare;
812	offset += sizeof(BTHeaderRec);
813	index[(nodesize / 2) - 2] = offset;
814
815	/* FILL IN THE USER RECORD:  */
816	offset += kBTreeHeaderUserBytes;
817	index[(nodesize / 2) - 3] = offset;
818
819	/* Mark the header node and map nodes in use in the map record.
820	 *
821	 * NOTE: Assumes that the header node's map record has at least
822	 * (num_map_nodes + 1) bits.
823	 */
824	bitmap = (u_int8_t *) buffer + offset;
825	temp = num_map_nodes + 1;	/* +1 for the header node */
826	while (temp >= 8) {
827		*(bitmap++) = 0xFF;
828		temp -= 8;
829	}
830	*bitmap = ~(0xFF >> temp);
831
832	offset += nodesize - sizeof(BTNodeDescriptor) - sizeof(BTHeaderRec)
833			   - kBTreeHeaderUserBytes - (4 * sizeof(int16_t));
834	index[(nodesize / 2) - 4] = offset;
835
836	if (hfsmp->jnl) {
837		result = btree_journal_modify_block_end(hfsmp, bp);
838	} else {
839		result = VNOP_BWRITE(bp);
840	}
841	if (result)
842		goto exit;
843
844	/* Create the map nodes: node numbers 1 .. num_map_nodes */
845	for (node_num=1; node_num <= num_map_nodes; ++node_num) {
846		bp = buf_getblk(vp, node_num, nodesize, 0, 0, BLK_META);
847		if (bp == NULL) {
848			result = EIO;
849			goto exit;
850		}
851		buffer = (void *)buf_dataptr(bp);
852		blkdesc.buffer = buffer;
853		blkdesc.blockHeader = (void *)bp;
854		blkdesc.blockReadFromDisk = 0;
855		blkdesc.isModified = 0;
856
857		ModifyBlockStart(vp, &blkdesc);
858
859		bzero(buffer, nodesize);
860		index = (u_int16_t *)buffer;
861
862		/* Fill in the node descriptor */
863		ndp = (BTNodeDescriptor *)buffer;
864		if (node_num != num_map_nodes)
865			ndp->fLink = node_num + 1;
866		ndp->kind = kBTMapNode;
867		ndp->numRecords = 1;
868		offset = sizeof(BTNodeDescriptor);
869		index[(nodesize / 2) - 1] = offset;
870
871
872		/* Fill in the map record's offset */
873		/* Note: We assume that the map record is all zeroes */
874		offset = sizeof(BTNodeDescriptor) + bytes_per_map_record;
875		index[(nodesize / 2) - 2] = offset;
876
877		if (hfsmp->jnl) {
878			result = btree_journal_modify_block_end(hfsmp, bp);
879		} else {
880			result = VNOP_BWRITE(bp);
881		}
882		if (result)
883			goto exit;
884	}
885
886	/* Update vp/cp for attribute btree */
887	hfs_lock_mount (hfsmp);
888	hfsmp->hfs_attribute_cp = VTOC(vp);
889	hfsmp->hfs_attribute_vp = vp;
890	hfs_unlock_mount (hfsmp);
891
892	(void) hfs_flushvolumeheader(hfsmp, MNT_WAIT, HFS_ALTFLUSH);
893
894	if (intrans) {
895		hfs_end_transaction(hfsmp);
896		intrans = 0;
897	}
898
899	/* Initialize the vnode for virtual attribute data file */
900	result = init_attrdata_vnode(hfsmp);
901	if (result) {
902		printf("hfs_create_attr_btree: vol=%s init_attrdata_vnode() error=%d\n", hfsmp->vcbVN, result);
903	}
904
905exit:
906	if (vp) {
907		hfs_unlock(VTOC(vp));
908	}
909	if (result) {
910		if (btcb) {
911			FREE (btcb, M_TEMP);
912		}
913		if (vp) {
914			vnode_put(vp);
915		}
916		/* XXX need to give back blocks ? */
917	}
918	if (intrans) {
919		hfs_end_transaction(hfsmp);
920	}
921
922	/*
923	 * All done, clear HFS_CREATING_BTREE, and wake up any sleepers.
924	 */
925	hfs_lock_mount (hfsmp);
926	hfsmp->hfs_flags &= ~HFS_CREATING_BTREE;
927	wakeup((caddr_t)&hfsmp->hfs_attribute_cp);
928	hfs_unlock_mount (hfsmp);
929
930	return (result);
931}
932
933