1/*
2 * ntfs_mft.c - NTFS kernel mft record operations.
3 *
4 * Copyright (c) 2006-2011 Anton Altaparmakov.  All Rights Reserved.
5 * Portions Copyright (c) 2006-2011 Apple Inc.  All Rights Reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright notice,
11 *    this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright notice,
13 *    this list of conditions and the following disclaimer in the documentation
14 *    and/or other materials provided with the distribution.
15 * 3. Neither the name of Apple Inc. ("Apple") nor the names of its
16 *    contributors may be used to endorse or promote products derived from this
17 *    software without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
20 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 * DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
23 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
24 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
26 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
28 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 *
30 * ALTERNATIVELY, provided that this notice and licensing terms are retained in
31 * full, this file may be redistributed and/or modified under the terms of the
32 * GNU General Public License (GPL) Version 2, in which case the provisions of
33 * that version of the GPL will apply to you instead of the license terms
34 * above.  You can obtain a copy of the GPL Version 2 at
35 * http://developer.apple.com/opensource/licenses/gpl-2.txt.
36 */
37
38#include <sys/buf.h>
39#include <sys/errno.h>
40#include <sys/param.h>
41#include <sys/stat.h>
42#include <sys/types.h>
43#include <sys/ucred.h>
44#include <sys/ubc.h>
45#include <sys/vnode.h>
46
47#include <string.h>
48
49#include <libkern/libkern.h>
50#include <libkern/OSAtomic.h>
51#include <libkern/OSMalloc.h>
52
53#include <kern/debug.h>
54#include <kern/locks.h>
55
56#include "ntfs.h"
57#include "ntfs_attr.h"
58#include "ntfs_bitmap.h"
59#include "ntfs_debug.h"
60#include "ntfs_dir.h"
61#include "ntfs_endian.h"
62#include "ntfs_hash.h"
63#include "ntfs_inode.h"
64#include "ntfs_layout.h"
65#include "ntfs_lcnalloc.h"
66#include "ntfs_mft.h"
67#include "ntfs_page.h"
68#include "ntfs_secure.h"
69#include "ntfs_time.h"
70#include "ntfs_types.h"
71#include "ntfs_volume.h"
72
73/**
74 * ntfs_mft_record_map_ext - map an mft record
75 * @ni:			ntfs inode whose mft record to map
76 * @mrec:		destination pointer for the mapped mft record
77 * @mft_is_locked:	if true the caller holds the mft lock (@mft_ni->lock)
78 *
79 * The buffer containing the mft record belonging to the ntfs inode @ni is
80 * mapped which on OS X means it is held for exclusive via the BL_BUSY flag in
81 * the buffer.  The mapped mft record is returned in *@m.
82 *
83 * If @mft_is_locked is true the caller holds the mft lock (@mft_ni->lock) thus
84 * ntfs_mft_record_map_ext() will not try to take the same lock.  It is then
85 * the responsibility of the caller that the mft is consistent and stable for
86 * the duration of the call.
87 *
88 * Return 0 on success and errno on error.
89 *
90 * Note: Caller must hold an iocount reference on the vnode of the base inode
91 * of @ni.
92 */
93errno_t ntfs_mft_record_map_ext(ntfs_inode *ni, MFT_RECORD **mrec,
94		const BOOL mft_is_locked)
95{
96	ntfs_volume *vol;
97	ntfs_inode *mft_ni;
98	buf_t buf;
99	MFT_RECORD *m;
100	errno_t err;
101
102	ntfs_debug("Entering for mft_no 0x%llx (mft is %slocked).",
103			(unsigned long long)ni->mft_no,
104			mft_is_locked ? "" : "not ");
105	if (NInoAttr(ni))
106		panic("%s(): Called for attribute inode.\n", __FUNCTION__);
107	vol = ni->vol;
108	mft_ni = vol->mft_ni;
109	/*
110	 * If the volume is in the process of being unmounted then @vol->mft_ni
111	 * may have become NULL in which case we need to bail out.
112	 */
113	if (!mft_ni) {
114		/*
115		 * @vol->mp may be NULL now which is ok.  ntfs_error() deals
116		 * with this case gracefully.
117		 */
118		ntfs_error(vol->mp, "The volume is being unmounted, bailing "
119				"out (you can ignore any errors following "
120				"this one).");
121		return EINVAL;
122	}
123	/* Get an iocount reference on the $MFT vnode. */
124	err = vnode_get(mft_ni->vn);
125	if (err) {
126		ntfs_error(vol->mp, "Failed to get vnode for $MFT.");
127		return err;
128	}
129	if (!mft_is_locked)
130		lck_rw_lock_shared(&mft_ni->lock);
131	/*
132	 * If the wanted mft record number is out of bounds the mft record does
133	 * not exist.
134	 */
135	lck_spin_lock(&mft_ni->size_lock);
136	if (ni->mft_no > (ino64_t)(mft_ni->data_size >>
137			vol->mft_record_size_shift)) {
138		lck_spin_unlock(&mft_ni->size_lock);
139		ntfs_error(vol->mp, "Attempt to read mft record 0x%llx, which "
140				"is beyond the end of the mft.",
141				(unsigned long long)ni->mft_no);
142		err = ENOENT;
143		goto err;
144	}
145	lck_spin_unlock(&mft_ni->size_lock);
146	/*
147	 * We implement access to $MFT/$DATA by mapping the buffer containing
148	 * the mft record into memory using buf_meta_bread() which takes care
149	 * of reading the buffer in if it is not in memory already and removing
150	 * the mst protection fixups.
151	 *
152	 * In case we ever care, we know whether buf_meta_bread() found the
153	 * buffer already in memory or whether it read it in because in the
154	 * former case buf_fromcache(buf) will be true and in the latter case
155	 * it will be false.
156	 *
157	 * Similarly we know if the buffer was already dirty or not by checking
158	 * buf_flags(buf) & B_DELWRI.
159	 */
160	ntfs_debug("Calling buf_meta_bread().");
161	err = buf_meta_bread(mft_ni->vn, ni->mft_no, vol->mft_record_size,
162			NOCRED, &buf);
163	ntfs_debug("After buf_meta_bread().");
164	if (err) {
165		ntfs_error(vol->mp, "Failed to read buffer of mft record "
166				"0x%llx (error %d).",
167				(unsigned long long)ni->mft_no, err);
168		goto buf_err;
169	}
170	err = buf_map(buf, (caddr_t*)&m);
171	if (err) {
172		ntfs_error(vol->mp, "Failed to map buffer of mft record "
173				"0x%llx (error %d).",
174				(unsigned long long)ni->mft_no, err);
175		goto buf_err;
176	}
177	if (!m)
178		panic("%s(): buf_map() returned NULL.\n", __FUNCTION__);
179	if (ni->m_buf || ni->m)
180		panic("%s(): Mft record 0x%llx is already mapped.\n",
181				__FUNCTION__, (unsigned long long)ni->mft_no);
182	/* Catch multi sector transfer fixup errors. */
183	if (ntfs_is_mft_record(m->magic)) {
184		if (!mft_is_locked)
185			lck_rw_unlock_shared(&mft_ni->lock);
186		ni->mft_ni = mft_ni;
187		ni->m_buf = buf;
188		ni->m = m;
189		*mrec = m;
190		ntfs_debug("Done.");
191		return 0;
192	}
193	ntfs_error(vol->mp, "Mft record 0x%llx is corrupt.  Run chkdsk.",
194			(unsigned long long)ni->mft_no);
195	NVolSetErrors(vol);
196	/* Error, release the buffer. */
197	err = buf_unmap(buf);
198	if (err)
199		ntfs_error(vol->mp, "Failed to unmap buffer of mft record "
200				"0x%llx (error %d).",
201				(unsigned long long)ni->mft_no, err);
202	err = EIO;
203buf_err:
204	buf_brelse(buf);
205err:
206	/*
207	 * Release the iocount reference on the $MFT vnode.  We can ignore the
208	 * return value as it always is zero.
209	 */
210	if (!mft_is_locked)
211		lck_rw_unlock_shared(&mft_ni->lock);
212	(void)vnode_put(mft_ni->vn);
213	return err;
214}
215
216/**
217 * ntfs_mft_record_unmap - release a mapped mft record
218 * @ni:		ntfs inode whose mft record to unmap
219 *
220 * Unmap the buffer containing the mft record.
221 */
222void ntfs_mft_record_unmap(ntfs_inode *ni)
223{
224	ntfs_inode *mft_ni;
225	buf_t buf;
226	errno_t err;
227
228	ntfs_debug("Entering for mft_no 0x%llx.",
229			(unsigned long long)ni->mft_no);
230	mft_ni = ni->mft_ni;
231	buf = ni->m_buf;
232	if (!mft_ni || !buf || !ni->m)
233		panic("%s(): Mft record 0x%llx is not mapped.\n", __FUNCTION__,
234				(unsigned long long)ni->mft_no);
235	ni->mft_ni = NULL;
236	ni->m_buf = NULL;
237	ni->m = NULL;
238	err = buf_unmap(buf);
239	if (err)
240		ntfs_error(ni->vol->mp, "Failed to unmap buffer of mft record "
241				"0x%llx (error %d).",
242				(unsigned long long)ni->mft_no, err);
243	if (NInoTestClearMrecNeedsDirtying(ni)) {
244		err = buf_bdwrite(buf);
245		if (err) {
246			ntfs_error(ni->vol->mp, "Failed to write buffer of "
247					"mft record 0x%llx (error %d).  Run "
248					"chkdsk.",
249					(unsigned long long)ni->mft_no, err);
250			NVolSetErrors(ni->vol);
251		}
252	} else
253		buf_brelse(buf);
254	/*
255	 * Release the iocount reference on the $MFT vnode.  We can ignore the
256	 * return value as it always is zero.
257	 */
258	(void)vnode_put(mft_ni->vn);
259	ntfs_debug("Done.");
260}
261
262/**
263 * ntfs_extent_mft_record_map_ext - load an extent inode
264 * @base_ni:		base ntfs inode
265 * @mref:		mft reference of the extent inode to load
266 * @ext_ni:		destination pointer for the loaded ntfs inode
267 * @ext_mrec:		destination pointer for the mapped mft record
268 * @mft_is_locked:	if true the caller holds the mft lock (@mft_ni->lock)
269 *
270 * Load the extent mft record @mref and attach it to its base inode @base_ni.
271 *
272 * On success *@ext_ni contains a pointer to the ntfs inode structure of the
273 * mapped extent inode and *@ext_mrec contains a pointer to the mft record
274 * structure of the mapped extent inode.
275 *
276 * If @mft_is_locked is true the caller holds the mft lock thus
277 * ntfs_extent_mft_record_map_ext() will not try to take the same lock.  It is
278 * then the responsibility of the caller that the mft is consistent and stable
279 * for the duration of the call.
280 *
281 * Return 0 on success and errno on error.
282 *
283 * Note: The caller must hold an iocount reference on the vnode of the base
284 * inode.
285 */
286errno_t ntfs_extent_mft_record_map_ext(ntfs_inode *base_ni, MFT_REF mref,
287		ntfs_inode **ext_ni, MFT_RECORD **ext_mrec,
288		const BOOL mft_is_locked)
289{
290	ino64_t mft_no;
291	ntfs_inode **extent_nis = NULL;
292	ntfs_inode *ni = NULL;
293	MFT_RECORD *m;
294	errno_t err;
295	unsigned seq_no;
296	int i;
297	BOOL need_reclaim;
298
299	mft_no = MREF(mref);
300	seq_no = MSEQNO(mref);
301	ntfs_debug("Mapping extent mft record 0x%llx (base mft record "
302			"0x%llx).", (unsigned long long)mft_no,
303			(unsigned long long)base_ni->mft_no);
304	/*
305	 * Check if this extent inode has already been added to the base inode,
306	 * in which case just return it.  If not found, add it to the base
307	 * inode before returning it.
308	 */
309	lck_mtx_lock(&base_ni->extent_lock);
310	if (base_ni->nr_extents > 0) {
311		extent_nis = base_ni->extent_nis;
312		for (i = 0; i < base_ni->nr_extents; i++) {
313			if (mft_no != extent_nis[i]->mft_no)
314				continue;
315			ni = extent_nis[i];
316			break;
317		}
318	}
319	if (ni) {
320		lck_mtx_unlock(&base_ni->extent_lock);
321		/* We found the record.  Map and return it. */
322		err = ntfs_mft_record_map_ext(ni, &m, mft_is_locked);
323		if (!err) {
324			/* Verify the sequence number if present. */
325			if (!seq_no || le16_to_cpu(m->sequence_number) ==
326					seq_no) {
327				ntfs_debug("Done 1.");
328				*ext_ni = ni;
329				*ext_mrec = m;
330				return err;
331			}
332			ntfs_mft_record_unmap(ni);
333			ntfs_error(base_ni->vol->mp, "Found stale extent mft "
334					"reference!  Corrupt file system.  "
335					"Run chkdsk.");
336			return EIO;
337		}
338map_err_out:
339		ntfs_error(base_ni->vol->mp, "Failed to map extent mft "
340				"record (error %d).", (int)err);
341		return err;
342	}
343	/* Record was not there.  Get a new ntfs inode and initialize it. */
344	err = ntfs_extent_inode_get(base_ni, mref, &ni);
345	if (err) {
346		lck_mtx_unlock(&base_ni->extent_lock);
347		return err;
348	}
349	/* Now map the extent mft record. */
350	err = ntfs_mft_record_map_ext(ni, &m, mft_is_locked);
351	if (err) {
352		lck_mtx_unlock(&base_ni->extent_lock);
353		ntfs_inode_reclaim(ni);
354		goto map_err_out;
355	}
356	need_reclaim = FALSE;
357	/* Verify the sequence number if it is present. */
358	if (seq_no) {
359		if (le16_to_cpu(m->sequence_number) != seq_no) {
360			ntfs_error(base_ni->vol->mp, "Found stale extent mft "
361					"reference!  Corrupt file system.  "
362					"Run chkdsk.");
363			need_reclaim = TRUE;
364			err = EIO;
365			goto unm_err_out;
366		}
367	} else {
368		/*
369		 * No sequence number was specified by the caller thus set the
370		 * sequence number in the ntfs inode to the one in the mft
371		 * record.
372		 */
373		ni->seq_no = le16_to_cpu(m->sequence_number);
374	}
375	/* Attach extent inode to base inode, reallocating memory if needed. */
376	if ((base_ni->nr_extents + 1) * sizeof(ntfs_inode *) >
377			base_ni->extent_alloc) {
378		ntfs_inode **tmp;
379		int new_size;
380
381		new_size = base_ni->extent_alloc + 4 * sizeof(ntfs_inode *);
382		tmp = OSMalloc(new_size, ntfs_malloc_tag);
383		if (!tmp) {
384			ntfs_error(base_ni->vol->mp, "Failed to allocate "
385					"internal buffer.");
386			need_reclaim = TRUE;
387			err = ENOMEM;
388			goto unm_err_out;
389		}
390		if (base_ni->extent_alloc) {
391			if (base_ni->nr_extents > 0)
392				memcpy(tmp, base_ni->extent_nis,
393						base_ni->nr_extents *
394						sizeof(ntfs_inode *));
395			OSFree(base_ni->extent_nis, base_ni->extent_alloc,
396					ntfs_malloc_tag);
397		}
398		base_ni->extent_alloc = new_size;
399		base_ni->extent_nis = tmp;
400	}
401	base_ni->extent_nis[base_ni->nr_extents++] = ni;
402	lck_mtx_unlock(&base_ni->extent_lock);
403	ntfs_debug("Done 2.");
404	*ext_ni = ni;
405	*ext_mrec = m;
406	return err;
407unm_err_out:
408	ntfs_mft_record_unmap(ni);
409	lck_mtx_unlock(&base_ni->extent_lock);
410	/*
411	 * If the extent inode was not attached to the base inode we need to
412	 * release it or we will leak memory.
413	 */
414	if (need_reclaim)
415		ntfs_inode_reclaim(ni);
416	return err;
417}
418
/* Common suffix for error messages reporting that an undo step failed. */
static const char es[] =
	"  Leaving inconsistent metadata.  Unmount and run chkdsk.";
421
422/**
423 * ntfs_mft_record_sync - synchronize an inode's mft record with that on disk
424 * @ni:		ntfs inode whose mft record to synchronize to disk
425 *
426 * If the mft record belonging to the ntfs inode @ni is cached in memory and is
427 * dirty write it out.
428 *
429 * Note this function can only be called for real, base or extent, inodes, i.e.
430 * not for synthetic, attribute or index, inodes.  Failure to obey this will
431 * result in a panic.
432 *
433 * Return 0 on success and errno on error.
434 *
435 * Locking: The mft record must not be mapped or a deadlock will occur.
436 */
437errno_t ntfs_mft_record_sync(ntfs_inode *ni)
438{
439	ntfs_volume *vol;
440	ntfs_inode *mft_ni;
441	buf_t buf;
442	errno_t err;
443
444	if (NInoAttr(ni))
445		panic("%s(): Called for attribute inode.\n", __FUNCTION__);
446	ntfs_debug("Entering for mft record of %s inode 0x%llx.",
447			(ni->nr_extents >= 0) ? "base" : "extent",
448			(unsigned long long)ni->mft_no);
449	vol = ni->vol;
450	mft_ni = vol->mft_ni;
451	if (!mft_ni) {
452		ntfs_warning(vol->mp, "$MFT inode is missing from volume.");
453		return ENOTSUP;
454	}
455	/* Get an iocount reference on the $MFT vnode. */
456	err = vnode_get(mft_ni->vn);
457	if (err) {
458		ntfs_error(vol->mp, "Failed to get vnode for $MFT.");
459		return err;
460	}
461	lck_rw_lock_shared(&mft_ni->lock);
462	/*
463	 * Get the buffer if it is cached.  If it is not cached then it cannot
464	 * be dirty either thus we do not need to write it.
465	 */
466	buf = buf_getblk(mft_ni->vn, ni->mft_no, vol->mft_record_size, 0, 0,
467			BLK_META | BLK_ONLYVALID);
468	lck_rw_unlock_shared(&mft_ni->lock);
469	(void)vnode_put(mft_ni->vn);
470	if (!buf) {
471		ntfs_debug("Mft record 0x%llx is not in cache, nothing to do.",
472				(unsigned long long)ni->mft_no);
473		return 0;
474	}
475	/* The buffer must be the right size. */
476	if (buf_size(buf) != vol->mft_record_size)
477		panic("%s(): Buffer containing mft record 0x%llx has wrong "
478				"size (0x%x instead of 0x%x).", __FUNCTION__,
479				(unsigned long long)ni->mft_no,
480				buf_size(buf), vol->mft_record_size);
481	/* If the buffer is clean there is nothing to do. */
482	if (!(buf_flags(buf) & B_DELWRI)) {
483		ntfs_debug("Mft record 0x%llx is in cache but not dirty, "
484				"nothing to do.",
485				(unsigned long long)ni->mft_no);
486		buf_brelse(buf);
487		return 0;
488	}
489	/* The buffer is dirty, write it now. */
490	err = buf_bwrite(buf);
491	if (!err)
492		ntfs_debug("Done.");
493	else
494		ntfs_error(vol->mp, "Failed to write mft record 0x%llx (error "
495				"%d).", (unsigned long long)ni->mft_no, err);
496	return err;
497}
498
499/**
500 * ntfs_mft_mirror_sync - synchronize an mft record to the mft mirror
501 * @vol:	ntfs volume on which the mft record to synchronize resides
502 * @rec_no:	mft record number to synchronize
503 * @m:		mapped, mst protected (extent) mft record to synchronize
504 * @sync:	if true perform synchronous i/o otherwise use async i/o
505 *
506 * Write the mapped, mst protected (extent) mft record number @rec_no with data
507 * @m to the mft mirror ($MFTMirr) of the ntfs volume @vol.
508 *
509 * On success return 0.  On error return errno and set the volume errors flag
510 * in the ntfs volume @vol.
511 */
512errno_t ntfs_mft_mirror_sync(ntfs_volume *vol, const s64 rec_no,
513		const MFT_RECORD *m, const BOOL sync)
514{
515	s64 data_size;
516	ntfs_inode *mirr_ni;
517	vnode_t mirr_vn;
518	buf_t buf;
519	MFT_RECORD *mirr;
520	errno_t err;
521
522	ntfs_debug("Entering for rec_no 0x%llx.", (unsigned long long)rec_no);
523	mirr_ni = vol->mftmirr_ni;
524	if (!mirr_ni) {
525		/* This could happen during umount... */
526		ntfs_error(vol->mp, "Umount time mft mirror syncing is not "
527				"implemented yet.  %s", ntfs_please_email);
528		return ENOTSUP;
529	}
530	mirr_vn = mirr_ni->vn;
531	/*
532	 * Protect against changes in initialized_size and thus against
533	 * truncation also.
534	 */
535	lck_rw_lock_shared(&mirr_ni->lock);
536	if (rec_no >= vol->mftmirr_size)
537		panic("%s(): rec_no >= vol->mftmirr_size\n", __FUNCTION__);
538	err = vnode_get(mirr_vn);
539	if (err) {
540		ntfs_error(vol->mp, "Failed to get vnode for mft mirror.");
541		goto err;
542	}
543	lck_spin_lock(&mirr_ni->size_lock);
544	data_size = ubc_getsize(mirr_vn);
545	if (data_size > mirr_ni->data_size)
546		data_size = mirr_ni->data_size;
547	/* Byte offset of the mft record. */
548	if ((rec_no << vol->mft_record_size_shift) + vol->mft_record_size >
549			mirr_ni->initialized_size) {
550		lck_spin_unlock(&mirr_ni->size_lock);
551		ntfs_error(vol->mp, "Write past the initialized size of mft "
552				"mirror.");
553		err = EIO;
554		goto put;
555	}
556	lck_spin_unlock(&mirr_ni->size_lock);
557	/*
558	 * Map the buffer containing the mft mirror record.
559	 *
560	 * Note we use buf_getblk() as we do not care whether the record is
561	 * up-to-date in memory or not as we are about to overwrite it.
562	 */
563	buf = buf_getblk(mirr_vn, rec_no, vol->mft_record_size, 0, 0, BLK_META);
564	if (!buf)
565		panic("%s(): buf_getblk() returned NULL.\n", __FUNCTION__);
566	err = buf_map(buf, (caddr_t*)&mirr);
567	if (err) {
568		ntfs_error(vol->mp, "Failed to map buffer of mft mirror "
569				"record %lld (error %d).",
570				(unsigned long long)rec_no, err);
571		buf_brelse(buf);
572		goto put;
573	}
574	memcpy(mirr, m, vol->mft_record_size);
575	err = buf_unmap(buf);
576	if (err)
577		ntfs_error(vol->mp, "Failed to unmap buffer of mft mirror "
578				"record %lld (error %d).",
579				(unsigned long long)rec_no, err);
580	/*
581	 * If the i/o is synchronous use a synchronous write for the mft mirror
582	 * as well.  If the i/o is asynchronous then do the write
583	 * asynchronously.  Note we do not use a delayed write because we want
584	 * to ensure that the mft mirror will be brought up-to-date as soon as
585	 * possible because we are using delayed writes on the mft itself thus
586	 * in case of a crash we want to have a valid and up-to-date mft mirror
587	 * on disk that we can recover from even when the mft is not valid or
588	 * up-to-date.
589	 *
590	 * FIXME: For maximum performance we could delete the above comment and
591	 * change the buf_bawrite() to buf_bdwrite().
592	 */
593	if (sync)
594		err = buf_bwrite(buf);
595	else
596		err = buf_bawrite(buf);
597	if (err)
598		ntfs_error(vol->mp, "Failed to write buffer of mft mirror "
599				"record %lld (error %d).",
600				(unsigned long long)rec_no, err);
601put:
602	(void)vnode_put(mirr_vn);
603err:
604	lck_rw_unlock_shared(&mirr_ni->lock);
605	if (!err)
606		ntfs_debug("Done.");
607	else {
608		ntfs_error(vol->mp, "Failed to synchronize mft mirror (error "
609				"code %d).  Volume will be left marked dirty "
610				"on unmount.  Run chkdsk.", err);
611		NVolSetErrors(vol);
612	}
613	return err;
614}
615
616/**
617 * ntfs_mft_bitmap_find_and_alloc_free_rec_nolock - see name
618 * @vol:	volume on which to search for a free mft record
619 * @base_ni:	open base inode if allocating an extent mft record or NULL
620 * @mft_no:	destination in which to return the allocated mft record number
621 *
622 * Search for a free mft record in the mft bitmap attribute on the ntfs volume
623 * @vol and return the allocated mft record number in *@mft_no.
624 *
625 * If @base_ni is NULL start the search at the default allocator position.
626 *
627 * If @base_ni is not NULL start the search at the mft record after the base
628 * mft record @base_ni.
629 *
630 * Return 0 on success and errno on error.  An error code of ENOSPC means that
631 * there are no free mft records in the currently initialized mft bitmap.
632 *
633 * Locking: - Caller must hold @vol->mftbmp_lock for writing.
634 *	    - Caller must hold @vol->mftbmp_ni->lock.
635 */
636static errno_t ntfs_mft_bitmap_find_and_alloc_free_rec_nolock(ntfs_volume *vol,
637		ntfs_inode *base_ni, s64 *mft_no)
638{
639	s64 pass_end, ll, data_pos, pass_start, ofs;
640	ntfs_inode *mftbmp_ni;
641	upl_t upl;
642	upl_page_info_array_t pl;
643	u8 *buf, *byte;
644	unsigned page_ofs, size, bit;
645	u8 pass, b;
646
647	ntfs_debug("Searching for free mft record in the currently "
648			"initialized mft bitmap.");
649	mftbmp_ni = vol->mftbmp_ni;
650	if (!mftbmp_ni)
651		panic("%s: !mftbmp_ni\n", __FUNCTION__);
652	/*
653	 * Set the end of the pass making sure we do not overflow the mft
654	 * bitmap.
655	 */
656	if (!vol->mft_ni)
657		panic("%s: !mft_ni\n", __FUNCTION__);
658	lck_spin_lock(&vol->mft_ni->size_lock);
659	pass_end = vol->mft_ni->allocated_size >> vol->mft_record_size_shift;
660	lck_spin_unlock(&vol->mft_ni->size_lock);
661	lck_spin_lock(&mftbmp_ni->size_lock);
662	ll = mftbmp_ni->initialized_size << 3;
663	lck_spin_unlock(&mftbmp_ni->size_lock);
664	if (pass_end > ll)
665		pass_end = ll;
666	pass = 1;
667	if (!base_ni)
668		data_pos = vol->mft_data_pos;
669	else
670		data_pos = base_ni->mft_no + 1;
671	if (data_pos < 24)
672		data_pos = 24;
673	if (data_pos >= pass_end) {
674		data_pos = 24;
675		pass = 2;
676		/* This happens on a freshly formatted volume. */
677		if (data_pos >= pass_end)
678			goto no_space;
679	}
680	pass_start = data_pos;
681	ntfs_debug("Starting bitmap search: pass %u, pass_start 0x%llx, "
682			"pass_end 0x%llx, data_pos 0x%llx.", (unsigned)pass,
683			(unsigned long long)pass_start,
684			(unsigned long long)pass_end,
685			(unsigned long long)data_pos);
686	/* Loop until a free mft record is found. */
687	do {
688		/* Cap size to pass_end. */
689		ofs = data_pos >> 3;
690		page_ofs = (unsigned)ofs & PAGE_MASK;
691		size = PAGE_SIZE - page_ofs;
692		ll = ((pass_end + 7) >> 3) - ofs;
693		if (size > ll)
694			size = ll;
695		size <<= 3;
696		/*
697		 * If we are still within the active pass, search the next page
698		 * for a zero bit.
699		 */
700		if (size) {
701			errno_t err;
702
703			err = ntfs_page_map(mftbmp_ni, ofs & ~PAGE_MASK_64,
704					&upl, &pl, &buf, TRUE);
705			if (err) {
706				ntfs_error(vol->mp, "Failed to read mft "
707						"bitmap, aborting.");
708				return err;
709			}
710			buf += page_ofs;
711			bit = (unsigned)data_pos & 7;
712			data_pos &= ~7ULL;
713			ntfs_debug("Before inner for loop: size 0x%x, "
714					"data_pos 0x%llx, bit 0x%x", size,
715					(unsigned long long)data_pos, bit);
716			for (; bit < size && data_pos + bit < pass_end;
717					bit &= ~7, bit += 8) {
718				byte = buf + (bit >> 3);
719				if (*byte == 0xff)
720					continue;
721				/*
722				 * TODO: There does not appear to be a ffz()
723				 * function in the kernel. )-:  If/when the
724				 * kernel has an ffz() function, switch the
725				 * below code to use it.
726				 *
727				 * So emulate "ffz(x)" using "ffs(~x) - 1"
728				 * which gives the same result but incurs extra
729				 * CPU overhead.
730				 */
731				b = ffs(~(unsigned long)*byte) - 1;
732				if (b < 8 && b >= (bit & 7)) {
733					ll = data_pos + (bit & ~7) + b;
734					if (ll > (1LL << 32)) {
735						ntfs_page_unmap(mftbmp_ni,
736								upl, pl, FALSE);
737						goto no_space;
738					}
739					*byte |= 1 << b;
740					ntfs_page_unmap(mftbmp_ni, upl, pl,
741							TRUE);
742					ntfs_debug("Done.  (Found and "
743							"allocated mft record "
744							"0x%llx.)",
745							(unsigned long long)ll);
746					*mft_no = ll;
747					return 0;
748				}
749			}
750			ntfs_debug("After inner for loop: size 0x%x, "
751					"data_pos 0x%llx, bit 0x%x", size,
752					(unsigned long long)data_pos, bit);
753			data_pos += size;
754			ntfs_page_unmap(mftbmp_ni, upl, pl, FALSE);
755			/*
756			 * If the end of the pass has not been reached yet,
757			 * continue searching the mft bitmap for a zero bit.
758			 */
759			continue;
760		}
761		/* If we just did the second pass we are done. */
762		if (pass >= 2)
763			break;
764		/*
765		 * Do the second pass, in which we scan the first part of the
766		 * zone which we omitted earlier.
767		 */
768		pass++;
769		pass_end = pass_start;
770		data_pos = pass_start = 24;
771		ntfs_debug("pass %u, pass_start 0x%llx, pass_end 0x%llx.",
772				(unsigned)pass, (unsigned long long)pass_start,
773				(unsigned long long)pass_end);
774		/*
775		 * If the end of the pass has not been reached yet, continue
776		 * searching the mft bitmap for a zero bit.
777		 */
778	} while (data_pos < pass_end);
779no_space:
780	ntfs_debug("Done.  (No free mft records left in currently initialized "
781			"mft bitmap.)");
782	return ENOSPC;
783}
784
785/**
786 * ntfs_mft_bitmap_extend_allocation_nolock - extend mft bitmap by a cluster
787 * @vol:	volume on which to extend the mft bitmap attribute
788 *
789 * Extend the mft bitmap attribute allocation on the ntfs volume @vol by one
790 * cluster.
791 *
792 * Note: Only changes allocated_size, i.e. does not touch initialized_size or
793 * data_size.
794 *
795 * Return 0 on success and errno on error.
796 *
797 * Locking: - Caller must hold @vol->mftbmp_lock for writing.
798 *	    - Caller must hold @vol->mftbmp_ni->lock for writing.
799 *	    - This function takes @vol->mftbmp_ni->rl.lock for writing and
800 *	      releases it before returning.
801 *	    - This function takes @vol->lcnbmp_lock for writing and releases it
802 *	      before returning.
803 */
804static errno_t ntfs_mft_bitmap_extend_allocation_nolock(ntfs_volume *vol)
805{
806	VCN vcn, lowest_vcn = 0;
807	LCN lcn;
808	s64 allocated_size, ll;
809	ntfs_inode *mft_ni, *mftbmp_ni, *lcnbmp_ni;
810	ntfs_rl_element *rl;
811	upl_t upl;
812	upl_page_info_array_t pl;
813	u8 *kaddr, *b;
814	MFT_RECORD *m;
815	ntfs_attr_search_ctx *ctx;
816	ATTR_RECORD *a;
817	unsigned mp_size, attr_len = 0;
818	errno_t err, err2;
819	BOOL mp_rebuilt = FALSE;
820	u8 tb;
821
822	ntfs_debug("Extending mft bitmap allocation.");
823	mft_ni = vol->mft_ni;
824	mftbmp_ni = vol->mftbmp_ni;
825	lcnbmp_ni = vol->lcnbmp_ni;
826	/*
827	 * Determine the last lcn of the mft bitmap.  The allocated size of the
828	 * mft bitmap cannot be zero so we are ok to not check for it being
829	 * zero first.
830	 */
831	lck_rw_lock_exclusive(&mftbmp_ni->rl.lock);
832	lck_spin_lock(&mftbmp_ni->size_lock);
833	allocated_size = mftbmp_ni->allocated_size;
834	lck_spin_unlock(&mftbmp_ni->size_lock);
835	vcn = (allocated_size - 1) >> vol->cluster_size_shift;
836	err = ntfs_attr_find_vcn_nolock(mftbmp_ni, vcn, &rl, NULL);
837	if (err || !rl || !rl->length || rl->lcn < 0 || rl[1].length ||
838			rl[1].vcn != vcn + 1) {
839		lck_rw_unlock_exclusive(&mftbmp_ni->rl.lock);
840		ntfs_error(vol->mp, "Failed to determine last allocated "
841				"cluster of mft bitmap attribute.");
842		if (!err)
843			err = EIO;
844		return err;
845	}
846	lcn = rl->lcn + rl->length;
847	ntfs_debug("Last lcn of mft bitmap attribute is 0x%llx.",
848			(unsigned long long)lcn);
849	lck_rw_lock_exclusive(&vol->lcnbmp_lock);
850	err = vnode_get(lcnbmp_ni->vn);
851	if (err) {
852		ntfs_error(vol->mp, "Failed to get vnode for $Bitmap.");
853		lck_rw_unlock_exclusive(&vol->lcnbmp_lock);
854		lck_rw_unlock_exclusive(&mftbmp_ni->rl.lock);
855		return err;
856	}
857	lck_rw_lock_shared(&lcnbmp_ni->lock);
858	/*
859	 * Attempt to get the cluster following the last allocated cluster by
860	 * hand as it may be in the MFT zone so the allocator would not give it
861	 * to us.
862	 */
863	ll = lcn >> 3;
864	err = ntfs_page_map(lcnbmp_ni, ll & ~PAGE_MASK_64, &upl, &pl, &kaddr,
865			TRUE);
866	if (err) {
867		lck_rw_unlock_shared(&lcnbmp_ni->lock);
868		(void)vnode_put(lcnbmp_ni->vn);
869		lck_rw_unlock_exclusive(&vol->lcnbmp_lock);
870		lck_rw_unlock_exclusive(&mftbmp_ni->rl.lock);
871		ntfs_error(vol->mp, "Failed to read from lcn bitmap.");
872		return err;
873	}
874	b = kaddr + ((unsigned)ll & PAGE_MASK);
875	tb = 1 << ((unsigned)lcn & 7);
876	if (*b != 0xff && !(*b & tb)) {
877		/* Next cluster is free, allocate it. */
878		*b |= tb;
879		vol->nr_free_clusters--;
880		if (vol->nr_free_clusters < 0)
881			vol->nr_free_clusters = 0;
882		ntfs_page_unmap(lcnbmp_ni, upl, pl, TRUE);
883		lck_rw_unlock_shared(&lcnbmp_ni->lock);
884		(void)vnode_put(lcnbmp_ni->vn);
885		lck_rw_unlock_exclusive(&vol->lcnbmp_lock);
886		/* Update the mft bitmap runlist. */
887		rl->length++;
888		rl[1].vcn++;
889		ntfs_debug("Appending one cluster to mft bitmap.");
890	} else {
891		ntfs_runlist runlist;
892
893		ntfs_page_unmap(lcnbmp_ni, upl, pl, FALSE);
894		lck_rw_unlock_shared(&lcnbmp_ni->lock);
895		(void)vnode_put(lcnbmp_ni->vn);
896		lck_rw_unlock_exclusive(&vol->lcnbmp_lock);
897		/* Allocate a cluster from the DATA_ZONE. */
898		runlist.rl = NULL;
899		runlist.alloc = runlist.elements = 0;
900		err = ntfs_cluster_alloc(vol, vcn + 1, 1, lcn, DATA_ZONE,
901				TRUE, &runlist);
902		if (err) {
903			lck_rw_unlock_exclusive(&mftbmp_ni->rl.lock);
904			ntfs_error(vol->mp, "Failed to allocate a cluster for "
905					"the mft bitmap.");
906			if (err != ENOMEM && err != ENOSPC)
907				err = EIO;
908			return err;
909		}
910		err = ntfs_rl_merge(&mftbmp_ni->rl, &runlist);
911		if (err) {
912			lck_rw_unlock_exclusive(&mftbmp_ni->rl.lock);
913			ntfs_error(vol->mp, "Failed to merge runlists for mft "
914					"bitmap.");
915			if (err != ENOMEM)
916				err = EIO;
917			err2 = ntfs_cluster_free_from_rl(vol, runlist.rl, 0,
918					-1, NULL);
919			if (err2) {
920				ntfs_error(vol->mp, "Failed to release "
921						"allocated cluster (error "
922						"%d).%s", err2, es);
923				NVolSetErrors(vol);
924			}
925			OSFree(runlist.rl, runlist.alloc, ntfs_malloc_tag);
926			return err;
927		}
928		ntfs_debug("Adding one run to mft bitmap.");
929	}
930	/* Update the attribute record as well. */
931	err = ntfs_mft_record_map(mft_ni, &m);
932	if (err) {
933		ntfs_error(vol->mp, "Failed to map mft record.");
934		m = NULL;
935		ctx = NULL;
936		goto undo_alloc;
937	}
938	ctx = ntfs_attr_search_ctx_get(mft_ni, m);
939	if (!ctx) {
940		ntfs_error(vol->mp, "Failed to get search context.");
941		err = ENOMEM;
942		goto undo_alloc;
943	}
944	err = ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name,
945			mftbmp_ni->name_len, vcn, NULL, 0, ctx);
946	if (err) {
947		ntfs_error(vol->mp, "Failed to find last attribute extent of "
948				"mft bitmap attribute.");
949		if (err == ENOENT)
950			err = EIO;
951		goto undo_alloc;
952	}
953	m = ctx->m;
954	a = ctx->a;
955	/* Find the runlist element with which the attribute extent starts. */
956	lowest_vcn = sle64_to_cpu(a->lowest_vcn);
957	rl = ntfs_rl_find_vcn_nolock(mftbmp_ni->rl.rl, lowest_vcn);
958	if (!rl)
959		panic("%s(): !rl\n", __FUNCTION__);
960	if (!rl->length)
961		panic("%s(): !rl->length\n", __FUNCTION__);
962	if (rl->lcn < LCN_HOLE)
963		panic("%s(): rl->lcn < LCN_HOLE\n", __FUNCTION__);
964	/* Get the size for the new mapping pairs array for this extent. */
965	err = ntfs_get_size_for_mapping_pairs(vol, rl, lowest_vcn, -1,
966			&mp_size);
967	if (err) {
968		ntfs_error(vol->mp, "Get size for mapping pairs failed for "
969				"mft bitmap attribute extent.");
970		goto undo_alloc;
971	}
972	/* Extend the attribute record to fit the bigger mapping pairs array. */
973	attr_len = le32_to_cpu(a->length);
974	err = ntfs_attr_record_resize(m, a, mp_size +
975			le16_to_cpu(a->mapping_pairs_offset));
976	if (err) {
977		if (err != ENOSPC) {
978			ntfs_error(vol->mp, "Failed to resize attribute "
979					"record for mft bitmap attribute.");
980			goto undo_alloc;
981		}
982		// TODO: Deal with this by moving this extent to a new mft
983		// record or by starting a new extent in a new mft record or by
984		// moving other attributes out of this mft record.
985		// Note: It will need to be a special mft record and if none of
986		// those are available it gets rather complicated...
987		ntfs_error(vol->mp, "Not enough space in this mft record to "
988				"accomodate extended mft bitmap attribute "
989				"extent.  Cannot handle this yet.");
990		err = ENOTSUP;
991		goto undo_alloc;
992	}
993	mp_rebuilt = TRUE;
994	/* Generate the mapping pairs array directly into the attr record. */
995	err = ntfs_mapping_pairs_build(vol, (s8*)a +
996			le16_to_cpu(a->mapping_pairs_offset), mp_size, rl,
997			lowest_vcn, -1, NULL);
998	if (err) {
999		ntfs_error(vol->mp, "Failed to build mapping pairs array for "
1000				"mft bitmap attribute (error %d).", err);
1001		err = EIO;
1002		goto dirty_undo_alloc;
1003	}
1004	/* Update the highest_vcn. */
1005	a->highest_vcn = cpu_to_sle64(vcn + 1);
1006	/*
1007	 * We now have extended the mft bitmap allocated_size by one cluster.
1008	 * Reflect this in the ntfs_inode structure and the attribute record.
1009	 */
1010	if (a->lowest_vcn) {
1011		/*
1012		 * We are not in the first attribute extent, switch to it, but
1013		 * first ensure the changes will make it to disk later.
1014		 */
1015		NInoSetMrecNeedsDirtying(ctx->ni);
1016		ntfs_attr_search_ctx_reinit(ctx);
1017		err = ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name,
1018				mftbmp_ni->name_len, 0, NULL, 0, ctx);
1019		if (err)
1020			goto restore_undo_alloc;
1021		/* @m is not used any more so no need to set it. */
1022		a = ctx->a;
1023	}
1024	lck_spin_lock(&mftbmp_ni->size_lock);
1025	mftbmp_ni->allocated_size += vol->cluster_size;
1026	a->allocated_size = cpu_to_sle64(mftbmp_ni->allocated_size);
1027	lck_spin_unlock(&mftbmp_ni->size_lock);
1028	/* Ensure the changes make it to disk. */
1029	NInoSetMrecNeedsDirtying(ctx->ni);
1030	ntfs_attr_search_ctx_put(ctx);
1031	ntfs_mft_record_unmap(mft_ni);
1032	lck_rw_unlock_exclusive(&mftbmp_ni->rl.lock);
1033	ntfs_debug("Done.");
1034	return 0;
1035restore_undo_alloc:
1036	ntfs_error(vol->mp, "Failed to find first attribute extent of mft "
1037			"bitmap attribute.");
1038	if (err == ENOENT)
1039		err = EIO;
1040	ntfs_attr_search_ctx_reinit(ctx);
1041	err2 = ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name,
1042			mftbmp_ni->name_len, vcn, NULL, 0, ctx);
1043	if (err2) {
1044		ntfs_error(vol->mp, "Failed to find last attribute extent of "
1045				"mft bitmap attribute (error %d).%s", err2, es);
1046		lck_spin_lock(&mftbmp_ni->size_lock);
1047		mftbmp_ni->allocated_size += vol->cluster_size;
1048		lck_spin_unlock(&mftbmp_ni->size_lock);
1049		ntfs_attr_search_ctx_put(ctx);
1050		ntfs_mft_record_unmap(mft_ni);
1051		lck_rw_unlock_exclusive(&mftbmp_ni->rl.lock);
1052		/*
1053		 * The only thing that is now wrong is the allocated size of the
1054		 * base attribute extent which chkdsk should be able to fix.
1055		 */
1056		NVolSetErrors(vol);
1057		return err;
1058	}
1059	ctx->a->highest_vcn = cpu_to_sle64(vcn);
1060dirty_undo_alloc:
1061	/*
1062	 * Need to mark the mft record for dirtying because ntfs_cluster_free()
1063	 * may drop the mft record on the floor otherwise.
1064	 */
1065	NInoSetMrecNeedsDirtying(ctx->ni);
1066undo_alloc:
1067	err2 = ntfs_cluster_free(mftbmp_ni, vcn + 1, -1, ctx, NULL);
1068	if (err2 || ctx->is_error) {
1069		ntfs_error(vol->mp, "Failed to release allocated cluster in "
1070				"error code path (error %d).%s",
1071				ctx->is_error ? ctx->error : err2, es);
1072		NVolSetErrors(vol);
1073	}
1074	/*
1075	 * If the runlist truncation fails and/or the search context is no
1076	 * longer valid, we cannot resize the attribute record or build the
1077	 * mapping pairs array thus we mark the volume dirty and tell the user
1078	 * to run chkdsk.
1079	 */
1080	err2 = ntfs_rl_truncate_nolock(vol, &mftbmp_ni->rl, vcn + 1);
1081	if (err2) {
1082		ntfs_error(vol->mp, "Failed to truncate attribute runlist s "
1083				"in error code path (error %d).%s", err2, es);
1084		NVolSetErrors(vol);
1085	} else if (mp_rebuilt) {
1086		a = ctx->a;
1087		err2 = ntfs_attr_record_resize(ctx->m, a, attr_len);
1088		if (err2) {
1089			ntfs_error(vol->mp, "Failed to restore attribute "
1090					"record in error code path (error "
1091					"%d).%s", err2, es);
1092			NVolSetErrors(vol);
1093		} else /* if (!err2) */ {
1094			u16 mp_ofs = le16_to_cpu(a->mapping_pairs_offset);
1095			err2 = ntfs_mapping_pairs_build(vol, (s8*)a + mp_ofs,
1096					attr_len - mp_ofs, mftbmp_ni->rl.rl,
1097					lowest_vcn, -1, NULL);
1098			if (err2) {
1099				ntfs_error(vol->mp, "Failed to restore "
1100						"mapping pairs array in error "
1101						"code path (error %d).%s",
1102						err2, es);
1103				NVolSetErrors(vol);
1104			}
1105			NInoSetMrecNeedsDirtying(ctx->ni);
1106		}
1107	}
1108	if (ctx)
1109		ntfs_attr_search_ctx_put(ctx);
1110	if (m)
1111		ntfs_mft_record_unmap(mft_ni);
1112	lck_rw_unlock_exclusive(&mftbmp_ni->rl.lock);
1113	return err;
1114}
1115
1116/**
1117 * ntfs_mft_bitmap_extend_initialized_nolock - extend mftbmp initialized data
1118 * @vol:	volume on which to extend the mft bitmap attribute
1119 *
1120 * Extend the initialized portion of the mft bitmap attribute on the ntfs
1121 * volume @vol by 8 bytes.
1122 *
1123 * Note: Only changes initialized_size and data_size, i.e. requires that
1124 * allocated_size is big enough to fit the new initialized_size.
1125 *
1126 * Return 0 on success and error on error.
1127 *
1128 * Locking: - Caller must hold @vol->mftbmp_lock for writing.
1129 *	    - Caller must hold @vol->mftbmp_ni->lock for writing.
1130 */
1131static errno_t ntfs_mft_bitmap_extend_initialized_nolock(ntfs_volume *vol)
1132{
1133	s64 old_data_size, old_initialized_size;
1134	ntfs_inode *mft_ni, *mftbmp_ni;
1135	MFT_RECORD *m;
1136	ntfs_attr_search_ctx *ctx;
1137	ATTR_RECORD *a;
1138	errno_t err, err2;
1139
1140	ntfs_debug("Extending mft bitmap initiailized (and data) size.");
1141	mft_ni = vol->mft_ni;
1142	mftbmp_ni = vol->mftbmp_ni;
1143	/* Get the attribute record. */
1144	err = ntfs_mft_record_map(mft_ni, &m);
1145	if (err) {
1146		ntfs_error(vol->mp, "Failed to map mft record.");
1147		return err;
1148	}
1149	ctx = ntfs_attr_search_ctx_get(mft_ni, m);
1150	if (!ctx) {
1151		ntfs_error(vol->mp, "Failed to get search context.");
1152		err = ENOMEM;
1153		goto unm_err;
1154	}
1155	err = ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name,
1156			mftbmp_ni->name_len, 0, NULL, 0, ctx);
1157	if (err) {
1158		ntfs_error(vol->mp, "Failed to find first attribute extent of "
1159				"mft bitmap attribute.");
1160		if (err == ENOENT)
1161			err = EIO;
1162		goto put_err;
1163	}
1164	a = ctx->a;
1165	lck_spin_lock(&mftbmp_ni->size_lock);
1166	old_data_size = mftbmp_ni->data_size;
1167	old_initialized_size = mftbmp_ni->initialized_size;
1168	/*
1169	 * We can simply update the initialized_size before filling the space
1170	 * with zeroes because the caller is holding the mft bitmap lock for
1171	 * writing which ensures that no one else is trying to access the data.
1172	 */
1173	mftbmp_ni->initialized_size += 8;
1174	a->initialized_size = cpu_to_sle64(mftbmp_ni->initialized_size);
1175	if (mftbmp_ni->initialized_size > old_data_size) {
1176		const s64 init_size = mftbmp_ni->initialized_size;
1177		mftbmp_ni->data_size = init_size;
1178		a->data_size = cpu_to_sle64(init_size);
1179		lck_spin_unlock(&mftbmp_ni->size_lock);
1180		if (!ubc_setsize(mftbmp_ni->vn, init_size))
1181			panic("%s(): !ubc_setsize(mftbmp_ni->vn, init_size)\n",
1182					__FUNCTION__);
1183	} else
1184		lck_spin_unlock(&mftbmp_ni->size_lock);
1185	/* Ensure the changes make it to disk. */
1186	NInoSetMrecNeedsDirtying(ctx->ni);
1187	ntfs_attr_search_ctx_put(ctx);
1188	ntfs_mft_record_unmap(mft_ni);
1189	/* Initialize the mft bitmap attribute value with zeroes. */
1190	err = ntfs_attr_set(mftbmp_ni, old_initialized_size, 8, 0);
1191	if (!err) {
1192		ntfs_debug("Done.  (Wrote eight initialized bytes to mft "
1193				"bitmap.");
1194		return 0;
1195	}
1196	ntfs_error(vol->mp, "Failed to write to mft bitmap.");
1197	/* Try to recover from the error. */
1198	err2 = ntfs_mft_record_map(mft_ni, &m);
1199	if (err2) {
1200		ntfs_error(vol->mp, "Failed to map mft record in error code "
1201				"path (error %d).%s", err2, es);
1202		NVolSetErrors(vol);
1203		return err;
1204	}
1205	ctx = ntfs_attr_search_ctx_get(mft_ni, m);
1206	if (!ctx) {
1207		ntfs_error(vol->mp, "Failed to get search context.%s", es);
1208		NVolSetErrors(vol);
1209		goto unm_err;
1210	}
1211	err2 = ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name,
1212			mftbmp_ni->name_len, 0, NULL, 0, ctx);
1213	if (err2) {
1214		ntfs_error(vol->mp, "Failed to find first attribute extent of "
1215				"mft bitmap attribute in error code path "
1216				"(error %d).%s", err2, es);
1217		NVolSetErrors(vol);
1218		goto put_err;
1219	}
1220	a = ctx->a;
1221	lck_spin_lock(&mftbmp_ni->size_lock);
1222	mftbmp_ni->initialized_size = old_initialized_size;
1223	a->initialized_size = cpu_to_sle64(old_initialized_size);
1224	if (ubc_getsize(mftbmp_ni->vn) != old_data_size) {
1225		mftbmp_ni->data_size = old_data_size;
1226		a->data_size = cpu_to_sle64(old_data_size);
1227		lck_spin_unlock(&mftbmp_ni->size_lock);
1228		if (!ubc_setsize(mftbmp_ni->vn, old_data_size))
1229			ntfs_error(vol->mp, "Failed to restore UBC size.  "
1230					"Leaving UBC size out of sync with "
1231					"attribute data size.");
1232	} else
1233		lck_spin_unlock(&mftbmp_ni->size_lock);
1234	NInoSetMrecNeedsDirtying(ctx->ni);
1235#ifdef DEBUG
1236	lck_spin_lock(&mftbmp_ni->size_lock);
1237	ntfs_debug("Restored status of mftbmp: allocated_size 0x%llx, "
1238			"data_size 0x%llx, initialized_size 0x%llx.",
1239			(unsigned long long)mftbmp_ni->allocated_size,
1240			(unsigned long long)mftbmp_ni->data_size,
1241			(unsigned long long)mftbmp_ni->initialized_size);
1242	lck_spin_unlock(&mftbmp_ni->size_lock);
1243#endif /* DEBUG */
1244put_err:
1245	ntfs_attr_search_ctx_put(ctx);
1246unm_err:
1247	ntfs_mft_record_unmap(mft_ni);
1248	return err;
1249}
1250
1251/**
1252 * ntfs_mft_data_extend_allocation_nolock - extend mft data attribute
1253 * @vol:	volume on which to extend the mft data attribute
1254 *
1255 * Extend the mft data attribute on the ntfs volume @vol by 16 mft records
1256 * worth of clusters or if not enough space for this by one mft record worth
1257 * of clusters.
1258 *
1259 * Note: Only changes allocated_size, i.e. does not touch initialized_size or
1260 * data_size.
1261 *
1262 * Return 0 on success and errno on error.
1263 *
1264 * Locking: - Caller must hold @vol->mftbmp_lock for writing.
1265 *	    - Caller must hold @vol->mft_ni->lock for writing.
1266 *	    - This function takes @vol->mft_ni->rl.lock for writing and
1267 *	      releases it before returning.
1268 *	    - This function calls functions which take @vol->lcnbmp_lock for
1269 *	      writing and release it before returning.
1270 */
1271static errno_t ntfs_mft_data_extend_allocation_nolock(ntfs_volume *vol)
1272{
1273	VCN vcn, lowest_vcn = 0;
1274	LCN lcn;
1275	s64 allocated_size, min_nr, nr;
1276	ntfs_inode *mft_ni;
1277	ntfs_rl_element *rl;
1278	MFT_RECORD *m;
1279	ntfs_attr_search_ctx *ctx;
1280	ATTR_RECORD *a;
1281	unsigned mp_size, attr_len = 0;
1282	errno_t err, err2;
1283	BOOL mp_rebuilt = FALSE;
1284	ntfs_runlist runlist;
1285
1286	ntfs_debug("Extending mft data allocation.");
1287	mft_ni = vol->mft_ni;
1288	lck_spin_lock(&mft_ni->size_lock);
1289	allocated_size = mft_ni->allocated_size;
1290	lck_spin_unlock(&mft_ni->size_lock);
1291	vcn = (allocated_size - 1) >> vol->cluster_size_shift;
1292	/*
1293	 * Determine the preferred allocation location, i.e. the last lcn of
1294	 * the mft data attribute.
1295	 */
1296	lck_rw_lock_exclusive(&mft_ni->rl.lock);
1297	if (mft_ni->rl.elements > 1)
1298		rl = &mft_ni->rl.rl[mft_ni->rl.elements - 2];
1299	else
1300		rl = mft_ni->rl.rl;
1301	if (!rl || !rl->length || rl->lcn < 0 || rl[1].length ||
1302			rl[1].vcn != vcn + 1) {
1303		ntfs_error(vol->mp, "Failed to determine last allocated "
1304				"cluster of mft data attribute.");
1305		lck_rw_unlock_exclusive(&mft_ni->rl.lock);
1306		return EIO;
1307	}
1308	lcn = rl->lcn + rl->length;
1309	ntfs_debug("Last lcn of mft data attribute is 0x%llx.",
1310			(unsigned long long)lcn);
1311	/* Minimum allocation is one mft record worth of clusters. */
1312	min_nr = vol->mft_record_size >> vol->cluster_size_shift;
1313	if (!min_nr)
1314		min_nr = 1;
1315	/* Want to allocate 16 mft records worth of clusters. */
1316	nr = (vol->mft_record_size * 16) / vol->cluster_size;
1317	if (!nr)
1318		nr = min_nr;
1319	/*
1320	 * To be in line with what Windows allows we restrict the total number
1321	 * of mft records to 2^32.
1322	 */
1323	if ((allocated_size + (nr << vol->cluster_size_shift)) >>
1324			vol->mft_record_size_shift >= (1LL << 32)) {
1325		nr = min_nr;
1326		if ((allocated_size + (nr << vol->cluster_size_shift)) >>
1327				vol->mft_record_size_shift >= (1LL << 32)) {
1328			ntfs_warning(vol->mp, "Cannot allocate mft record "
1329					"because the maximum number of inodes "
1330					"(2^32) has already been reached.");
1331			lck_rw_unlock_exclusive(&mft_ni->rl.lock);
1332			return ENOSPC;
1333		}
1334	}
1335	ntfs_debug("Trying mft data allocation with %s cluster count %lld.",
1336			nr > min_nr ? "default" : "minimal", (long long)nr);
1337	do {
1338		runlist.rl = NULL;
1339		runlist.alloc = runlist.elements = 0;
1340		/*
1341		 * We have taken the mft lock for writing.  This is not a
1342		 * problem as ntfs_cluster_alloc() only needs to access pages
1343		 * from the cluster bitmap (vol->lcnbmp_ni) and we have mapped
1344		 * the whole runlist for the cluster bitmap at mount time thus
1345		 * ntfs_page_map() will never need to map an mft record and
1346		 * hence will never need to take the mft lock.
1347		 */
1348		err = ntfs_cluster_alloc(vol, vcn + 1, nr, lcn, MFT_ZONE,
1349				TRUE, &runlist);
1350		if (!err)
1351			break;
1352		if (err != ENOSPC || nr == min_nr) {
1353			if (err != ENOMEM && err != ENOSPC)
1354				err = EIO;
1355			ntfs_error(vol->mp, "Failed to allocate the minimal "
1356					"number of clusters (%lld) for the "
1357					"mft data attribute.", (long long)nr);
1358			lck_rw_unlock_exclusive(&mft_ni->rl.lock);
1359			return err;
1360		}
1361		/*
1362		 * There is not enough space to do the allocation, but there
1363		 * might be enough space to do a minimal allocation so try that
1364		 * before failing.
1365		 */
1366		nr = min_nr;
1367		ntfs_debug("Retrying mft data allocation with minimal cluster "
1368				"count %lld.", (long long)nr);
1369	} while (1);
1370	/*
1371	 * Merge the existing runlist with the new one describing the allocated
1372	 * clusters.
1373	 */
1374	err = ntfs_rl_merge(&mft_ni->rl, &runlist);
1375	if (err) {
1376		lck_rw_unlock_exclusive(&mft_ni->rl.lock);
1377		ntfs_error(vol->mp, "Failed to merge runlists for mft data "
1378				"attribute.");
1379		if (err != ENOMEM)
1380			err = EIO;
1381		err2 = ntfs_cluster_free_from_rl(vol, runlist.rl, 0, -1, NULL);
1382		if (err2) {
1383			ntfs_error(vol->mp, "Failed to release allocated "
1384					"cluster(s) (error %d).%s", err2, es);
1385			NVolSetErrors(vol);
1386		}
1387		OSFree(runlist.rl, runlist.alloc, ntfs_malloc_tag);
1388		return err;
1389	}
1390	ntfs_debug("Allocated %lld clusters.", (long long)nr);
1391	lck_spin_lock(&mft_ni->size_lock);
1392	mft_ni->allocated_size += nr << vol->cluster_size_shift;
1393	lck_spin_unlock(&mft_ni->size_lock);
1394	/*
1395	 * We now have to drop the runlist lock again or we can deadlock with
1396	 * the below mapping of the mft record belonging to $MFT.
1397	 *
1398	 * Again as explained above the mft cannot change under us so we leave
1399	 * the runlist unlocked.
1400	 */
1401	lck_rw_unlock_exclusive(&mft_ni->rl.lock);
1402	/*
1403	 * Update the attribute record as well.
1404	 *
1405	 * When mapping the mft record for the mft we communicate the fact that
1406	 * we hold the lock on the mft inode @mft_ni->lock for writing so it
1407	 * does not try to take the lock.
1408	 */
1409	err = ntfs_mft_record_map_ext(mft_ni, &m, TRUE);
1410	if (err) {
1411		ntfs_error(vol->mp, "Failed to map mft record.");
1412		m = NULL;
1413		ctx = NULL;
1414		goto undo_alloc;
1415	}
1416	ctx = ntfs_attr_search_ctx_get(mft_ni, m);
1417	if (!ctx) {
1418		ntfs_error(vol->mp, "Failed to get search context.");
1419		err = ENOMEM;
1420		goto undo_alloc;
1421	}
1422	/*
1423	 * We have the mft lock taken for write.  Communicate this fact to
1424	 * ntfs_attr_lookup() and hence to ntfs_extent_mft_record_map_ext() and
1425	 * ntfs_mft_record_map_ext() so that they know not to try to take the
1426	 * same lock.
1427	 */
1428	ctx->is_mft_locked = 1;
1429	err = ntfs_attr_lookup(mft_ni->type, mft_ni->name, mft_ni->name_len,
1430			vcn, NULL, 0, ctx);
1431	if (err) {
1432		ntfs_error(vol->mp, "Failed to find last attribute extent of "
1433				"mft data attribute.");
1434		if (err == ENOENT)
1435			err = EIO;
1436		goto undo_alloc;
1437	}
1438	m = ctx->m;
1439	a = ctx->a;
1440	/* Find the runlist element with which the attribute extent starts. */
1441	lowest_vcn = sle64_to_cpu(a->lowest_vcn);
1442	rl = ntfs_rl_find_vcn_nolock(mft_ni->rl.rl, lowest_vcn);
1443	if (!rl)
1444		panic("%s(): !rl\n", __FUNCTION__);
1445	if (!rl->length)
1446		panic("%s(): !rl->length\n", __FUNCTION__);
1447	if (rl->lcn < LCN_HOLE)
1448		panic("%s(): rl->lcn < LCN_HOLE\n", __FUNCTION__);
1449	/* Get the size for the new mapping pairs array for this extent. */
1450	err = ntfs_get_size_for_mapping_pairs(vol, rl, lowest_vcn, -1,
1451			&mp_size);
1452	if (err) {
1453		ntfs_error(vol->mp, "Get size for mapping pairs failed for "
1454				"mft data attribute extent.");
1455		goto undo_alloc;
1456	}
1457	/* Extend the attribute record to fit the bigger mapping pairs array. */
1458	attr_len = (int)le32_to_cpu(a->length);
1459	err = ntfs_attr_record_resize(m, a, mp_size +
1460			le16_to_cpu(a->mapping_pairs_offset));
1461	if (err) {
1462		if (err != ENOSPC) {
1463			ntfs_error(vol->mp, "Failed to resize attribute "
1464					"record for mft data attribute.");
1465			goto undo_alloc;
1466		}
1467		// TODO: Deal with this by moving this extent to a new mft
1468		// record or by starting a new extent in a new mft record or by
1469		// moving other attributes out of this mft record.
1470		// Note: Use the special reserved mft records and ensure that
1471		// this extent is not required to find the mft record in
1472		// question.  If no free special records left we would need to
1473		// move an existing record away, insert ours in its place, and
1474		// then place the moved record into the newly allocated space
1475		// and we would then need to update all references to this mft
1476		// record appropriately.  This is rather complicated...
1477		ntfs_error(vol->mp, "Not enough space in this mft record to "
1478				"accomodate extended mft data attribute "
1479				"extent.  Cannot handle this yet.");
1480		err = ENOTSUP;
1481		goto undo_alloc;
1482	}
1483	mp_rebuilt = TRUE;
1484	/* Generate the mapping pairs array directly into the attr record. */
1485	err = ntfs_mapping_pairs_build(vol, (s8*)a +
1486			le16_to_cpu(a->mapping_pairs_offset), mp_size, rl,
1487			lowest_vcn, -1, NULL);
1488	if (err) {
1489		ntfs_error(vol->mp, "Failed to build mapping pairs array of "
1490				"mft data attribute (error %d).", err);
1491		err = EIO;
1492		goto dirty_undo_alloc;
1493	}
1494	/* Update the highest_vcn. */
1495	a->highest_vcn = cpu_to_sle64(vcn + nr);
1496	/*
1497	 * We now have extended the mft data allocated_size by @nr clusters.
1498	 * Reflect this in the ntfs_inode structure and the attribute record.
1499	 */
1500	if (a->lowest_vcn) {
1501		/*
1502		 * We are not in the first attribute extent, switch to it, but
1503		 * first ensure the changes will make it to disk later.
1504		 */
1505		NInoSetMrecNeedsDirtying(ctx->ni);
1506		/*
1507		 * The reinitialization will preserve the is_mft_locked flag in
1508		 * the search context thus we do not need to set it again.
1509		 */
1510		ntfs_attr_search_ctx_reinit(ctx);
1511		err = ntfs_attr_lookup(mft_ni->type, mft_ni->name,
1512				mft_ni->name_len, 0, NULL, 0, ctx);
1513		if (err)
1514			goto restore_undo_alloc;
1515		/* @m is not used any more so no need to set it. */
1516		a = ctx->a;
1517	}
1518	a->allocated_size = cpu_to_sle64(mft_ni->allocated_size);
1519	/* Ensure the changes make it to disk. */
1520	NInoSetMrecNeedsDirtying(ctx->ni);
1521	ntfs_attr_search_ctx_put(ctx);
1522	ntfs_mft_record_unmap(mft_ni);
1523	/*
1524	 * We have modified the size of the base inode, cause the sizes to be
1525	 * written to all the directory index entries pointing to the base
1526	 * inode when the inode is written to disk.
1527	 */
1528	NInoSetDirtySizes(mft_ni);
1529	ntfs_debug("Done.");
1530	return 0;
1531restore_undo_alloc:
1532	ntfs_error(vol->mp, "Failed to find first attribute extent of mft "
1533			"data attribute.");
1534	if (err == ENOENT)
1535		err = EIO;
1536	/*
1537	 * The reinitialization will preserve the is_mft_locked flag in the
1538	 * search context thus we do not need to set it again.
1539	 */
1540	ntfs_attr_search_ctx_reinit(ctx);
1541	err2 = ntfs_attr_lookup(mft_ni->type, mft_ni->name, mft_ni->name_len,
1542			vcn, NULL, 0, ctx);
1543	if (err2) {
1544		ntfs_error(vol->mp, "Failed to find last attribute extent of "
1545				"mft data attribute (error %d).%s", err2, es);
1546		ntfs_attr_search_ctx_put(ctx);
1547		ntfs_mft_record_unmap(mft_ni);
1548		/*
1549		 * The only thing that is now wrong is the allocated size of the
1550		 * base attribute extent which chkdsk should be able to fix.
1551		 */
1552		NVolSetErrors(vol);
1553		return err;
1554	}
1555	ctx->a->highest_vcn = cpu_to_sle64(vcn);
1556dirty_undo_alloc:
1557	/*
1558	 * Need to mark the mft record for dirtying because ntfs_cluster_free()
1559	 * may drop the mft record on the floor otherwise.
1560	 */
1561	NInoSetMrecNeedsDirtying(ctx->ni);
1562undo_alloc:
1563	err2 = ntfs_cluster_free(mft_ni, vcn + 1, -1, ctx, NULL);
1564	if (err2 || ctx->is_error) {
1565		ntfs_error(vol->mp, "Failed to release allocated cluster(s) "
1566				"in error code path (error %d).%s",
1567				ctx->is_error ? ctx->error : err2, es);
1568		NVolSetErrors(vol);
1569	}
1570	/*
1571	 * If the runlist truncation fails and/or the search context is no
1572	 * longer valid, we cannot resize the attribute record or build the
1573	 * mapping pairs array thus we mark the volume dirty and tell the user
1574	 * to run chkdsk.
1575	 *
1576	 * As before, we are going to update the runlist now so we need to take
1577	 * the runlist lock for writing.
1578	 */
1579	lck_rw_lock_exclusive(&mft_ni->rl.lock);
1580	lck_spin_lock(&mft_ni->size_lock);
1581	mft_ni->allocated_size -= nr << vol->cluster_size_shift;
1582	lck_spin_unlock(&mft_ni->size_lock);
1583	err2 = ntfs_rl_truncate_nolock(vol, &mft_ni->rl, vcn + 1);
1584	lck_rw_unlock_exclusive(&mft_ni->rl.lock);
1585	if (err2) {
1586		ntfs_error(vol->mp, "Failed to truncate attribute runlist s "
1587				"in error code path (error %d).%s", err2, es);
1588		NVolSetErrors(vol);
1589	} else if (mp_rebuilt) {
1590		a = ctx->a;
1591		err2 = ntfs_attr_record_resize(ctx->m, a, attr_len);
1592		if (err2) {
1593			ntfs_error(vol->mp, "Failed to restore attribute "
1594					"record in error code path (error "
1595					"%d).%s", err2, es);
1596			NVolSetErrors(vol);
1597		} else /* if (!err2) */ {
1598			u16 mp_ofs = le16_to_cpu(a->mapping_pairs_offset);
1599			err2 = ntfs_mapping_pairs_build(vol, (s8*)a + mp_ofs,
1600					attr_len - mp_ofs, mft_ni->rl.rl,
1601					lowest_vcn, -1, NULL);
1602			if (err2) {
1603				ntfs_error(vol->mp, "Failed to restore "
1604						"mapping pairs array in error "
1605						"code path (error %d).%s",
1606						err2, es);
1607				NVolSetErrors(vol);
1608			}
1609			NInoSetMrecNeedsDirtying(ctx->ni);
1610		}
1611	}
1612	if (ctx)
1613		ntfs_attr_search_ctx_put(ctx);
1614	if (m)
1615		ntfs_mft_record_unmap(mft_ni);
1616	return err;
1617}
1618
1619/**
1620 * ntfs_mft_record_lay_out - lay out an mft record into a memory buffer
1621 * @vol:	volume to which the mft record will belong
1622 * @mft_no:	mft record number of record to lay out
1623 * @m:		destination buffer of size >= @vol->mft_record_size bytes
1624 *
1625 * Lay out an empty, unused mft record with the mft record number @mft_no into
1626 * the buffer @m.  The volume @vol is needed because the mft record structure
1627 * was modified in NTFS 3.1 so we need to know which volume version this mft
1628 * record will be used on and also we need to know the size of an mft record.
1629 *
1630 * Return 0 on success and errno on error.
1631 */
1632static errno_t ntfs_mft_record_lay_out(const ntfs_volume *vol,
1633		const s64 mft_no, MFT_RECORD *m)
1634{
1635	ATTR_RECORD *a;
1636
1637	ntfs_debug("Entering for mft record 0x%llx.",
1638			(unsigned long long)mft_no);
1639	if (mft_no >= (1LL << 32)) {
1640		ntfs_error(vol->mp, "Mft record number 0x%llx exceeds "
1641				"maximum of 2^32.",
1642				(unsigned long long)mft_no);
1643		return ERANGE;
1644	}
1645	if (vol->mft_record_size < NTFS_BLOCK_SIZE)
1646		panic("%s(): vol->mft_record_size < NTFS_BLOCK_SIZE\n",
1647				__FUNCTION__);
1648	/* Start by clearing the whole mft record to give us a clean slate. */
1649	bzero(m, vol->mft_record_size);
1650	/* Aligned to 2-byte boundary. */
1651	if (vol->major_ver < 3 || (vol->major_ver == 3 && !vol->minor_ver))
1652		m->usa_ofs = cpu_to_le16((sizeof(MFT_RECORD_OLD) + 1) & ~1);
1653	else {
1654		m->usa_ofs = cpu_to_le16((sizeof(MFT_RECORD) + 1) & ~1);
1655		/*
1656		 * Set the NTFS 3.1+ specific fields while we know that the
1657		 * volume version is 3.1+.
1658		 */
1659		/* m->reserved = 0; */
1660		m->mft_record_number = cpu_to_le32((u32)mft_no);
1661	}
1662	m->magic = magic_FILE;
1663	m->usa_count = cpu_to_le16(1 + vol->mft_record_size / NTFS_BLOCK_SIZE);
1664	/* Set the update sequence number to 1. */
1665	*(le16*)((u8*)m + le16_to_cpu(m->usa_ofs)) = cpu_to_le16(1);
1666	/* m->lsn = 0; */
1667	m->sequence_number = cpu_to_le16(1);
1668	/* m->link_count = 0; */
1669	/*
1670	 * Place the attributes straight after the update sequence array,
1671	 * aligned to 8-byte boundary.
1672	 */
1673	m->attrs_offset = cpu_to_le16((le16_to_cpu(m->usa_ofs) +
1674			(le16_to_cpu(m->usa_count) << 1) + 7) & ~7);
1675	/* m->flags = 0; */
1676	/*
1677	 * Using attrs_offset plus eight bytes (for the termination attribute).
1678	 * attrs_offset is already aligned to 8-byte boundary, so no need to
1679	 * align again.
1680	 */
1681	m->bytes_in_use = cpu_to_le32(le16_to_cpu(m->attrs_offset) + 8);
1682	m->bytes_allocated = cpu_to_le32(vol->mft_record_size);
1683	/* m->base_mft_record = 0; */
1684	/* m->next_attr_instance = 0; */
1685	/* Add the termination attribute. */
1686	a = (ATTR_RECORD*)((u8*)m + le16_to_cpu(m->attrs_offset));
1687	a->type = AT_END;
1688	/* a->length = 0; */
1689	ntfs_debug("Done.");
1690	return 0;
1691}
1692
1693/**
1694 * ntfs_mft_record_format - format an mft record on an ntfs volume
1695 * @vol:			volume on which to format the mft record
1696 * @mft_no:			mft record number to format
1697 * @new_initialized_size:	new initialized size to assign to @vol->mft_ni
1698 *
1699 * Format the mft record @mft_no in $MFT/$DATA, i.e. lay out an empty, unused
1700 * mft record into the appropriate place of the mft data attribute.  This is
1701 * used when extending the mft data attribute.
1702 *
1703 * Once the mft record is layed out the initialized size of @vol->mft_ni is
1704 * updated to @new_initalized_size.  This must be bigger or equal to the old
1705 * initialized size and smaller or equal to the data size.
1706 *
1707 * Return 0 on success and errno on error.
1708 *
1709 * Locking: Caller must hold @vol->mft_ni->lock.
1710 */
1711static errno_t ntfs_mft_record_format(ntfs_volume *vol, const s64 mft_no,
1712		const s64 new_initialized_size)
1713{
1714	ntfs_inode *mft_ni;
1715	buf_t buf;
1716	MFT_RECORD *m;
1717	errno_t err, err2;
1718
1719	ntfs_debug("Entering for mft record 0x%llx.",
1720			(unsigned long long)mft_no);
1721	mft_ni = vol->mft_ni;
1722	/* The maximum valid offset into the VM page cache for $MFT's data. */
1723	if ((mft_no << vol->mft_record_size_shift) + vol->mft_record_size >
1724			ubc_getsize(mft_ni->vn)) {
1725		ntfs_error(vol->mp, "Tried to format non-existing mft "
1726				"record 0x%llx.", (unsigned long long)mft_no);
1727		return ENOENT;
1728	}
1729	/* Read and map the buffer containing the mft record. */
1730	err = buf_meta_bread(mft_ni->vn, mft_no, vol->mft_record_size, NOCRED,
1731			&buf);
1732	if (err) {
1733		ntfs_error(vol->mp, "Failed to read buffer of mft record "
1734				"0x%llx (error %d).",
1735				(unsigned long long)mft_no, err);
1736		goto brelse;
1737	}
1738	err = buf_map(buf, (caddr_t*)&m);
1739	if (err) {
1740		ntfs_error(vol->mp, "Failed to map buffer of mft record "
1741				"0x%llx (error %d).",
1742				(unsigned long long)mft_no, err);
1743		goto brelse;
1744	}
1745	err = ntfs_mft_record_lay_out(vol, mft_no, m);
1746	if (err) {
1747		ntfs_error(vol->mp, "Failed to lay out mft record 0x%llx "
1748				"(error %d).", (unsigned long long)mft_no, err);
1749		goto unmap;
1750	}
1751	err = buf_unmap(buf);
1752	if (err) {
1753		ntfs_error(vol->mp, "Failed to unmap buffer of mft record "
1754				"0x%llx (error %d).",
1755				(unsigned long long)mft_no, err);
1756		goto brelse;
1757	}
1758	lck_spin_lock(&mft_ni->size_lock);
1759	if (new_initialized_size < mft_ni->initialized_size ||
1760			new_initialized_size > mft_ni->data_size)
1761		panic("%s(): new_initialized_size < mft_ni->initialized_size "
1762				"|| new_initialized_size > mft_ni->data_size\n",
1763				__FUNCTION__);
1764	mft_ni->initialized_size = new_initialized_size;
1765	lck_spin_unlock(&mft_ni->size_lock);
1766	err = buf_bdwrite(buf);
1767	if (!err) {
1768		ntfs_debug("Done.");
1769		return 0;
1770	}
1771	ntfs_error(vol->mp, "Failed to write buffer of mft record 0x%llx "
1772			"(error %d).  Run chkdsk.", (unsigned long long)mft_no,
1773			err);
1774	NVolSetErrors(vol);
1775	return err;
1776unmap:
1777	err2 = buf_unmap(buf);
1778	if (err2)
1779		ntfs_error(vol->mp, "Failed to unmap buffer of mft record "
1780				"0x%llx in error code path (error %d).",
1781				(unsigned long long)mft_no, err2);
1782brelse:
1783	buf_brelse(buf);
1784	return err;
1785}
1786
1787/**
1788 * ntfs_standard_info_attribute_insert - add the standard information attribute
1789 * @m:			mft record in which to insert the attribute
1790 * @a:			attribute in front of which to insert the new attribute
1791 * @file_attrs:		file attribute flags to set in the attribute
1792 * @security_id:	security_id to set in the attribute
1793 * @create_time:	time to use for the times in the attribute
1794 *
1795 * Insert the standard information attribute into the mft record @m in front of
1796 * the attribute record @a.
1797 *
1798 * If @security_id is not zero, insert a Win2k+ style standard information
1799 * attribute and if it is zero, insert an NT4 style one.
1800 *
1801 * This function cannot fail.
1802 */
1803static void ntfs_standard_info_attribute_insert(MFT_RECORD *m, ATTR_RECORD *a,
1804		const FILE_ATTR_FLAGS file_attrs, const le32 security_id,
1805		struct timespec *create_time)
1806{
1807	STANDARD_INFORMATION *si;
1808	u32 size;
1809
1810	ntfs_debug("Entering.");
1811	size = sizeof(STANDARD_INFORMATION);
1812	if (!security_id)
1813		size = offsetof(STANDARD_INFORMATION, reserved12) +
1814			sizeof(si->reserved12);
1815	/*
1816	 * Insert the attribute and initialize the value to zero.  This cannot
1817	 * fail as we are only called with an empty mft record so there must be
1818	 * enough space for the standard information attribute.
1819	 */
1820	if (ntfs_resident_attr_record_insert_internal(m, a,
1821			AT_STANDARD_INFORMATION, NULL, 0, size))
1822		panic("%s(): Failed to insert standard information "
1823				"attribute.\n", __FUNCTION__);
1824	/* Set up the attribute value. */
1825	si = (STANDARD_INFORMATION*)((u8*)a + le16_to_cpu(a->value_offset));
1826	si->last_access_time = si->last_mft_change_time =
1827			si->last_data_change_time = si->creation_time =
1828			utc2ntfs(*create_time);
1829	si->file_attributes = file_attrs;
1830	if (security_id)
1831		si->security_id = security_id;
1832	ntfs_debug("Done (used %s style standard information attribute).",
1833			security_id ? "Win2k+" : "NT4");
1834}
1835
1836/**
1837 * ntfs_sd_attribute_insert - add the security descriptor attribute
1838 * @vol:	volume to which the mft record belongs
1839 * @m:		mft record in which to insert the attribute
1840 * @a:		attribute in front of which to insert the new attribute
1841 * @va:		vnode attributes
1842 *
1843 * Insert the security descriptor attribute into the mft record @m in front of
1844 * the attribute record @a.
1845 *
1846 * @vol is the volume the mft record @m belongs to and is used to determine
1847 * whether an NT4 security descriptor is needed (NTFS 1.x) or a Win2k+ security
1848 * descriptor is needed (NTFS 3.0+).
1849 *
1850 * @va are the vnode attributes to assign to the create inode and allows us to
1851 * distinguish whether we need to insert a directory security descriptor or a
1852 * file one.
1853 *
1854 * This function cannot fail.
1855 */
1856static void ntfs_sd_attribute_insert(ntfs_volume *vol, MFT_RECORD *m,
1857		ATTR_RECORD *a, const struct vnode_attr *va)
1858{
1859	SDS_ENTRY *sds;
1860	u32 sd_size;
1861
1862	ntfs_debug("Entering.");
1863	if (vol->major_ver > 1) {
1864		if (va->va_type == VDIR)
1865			sds = ntfs_dir_sds_entry;
1866		else
1867			sds = ntfs_file_sds_entry;
1868	} else {
1869		if (va->va_type == VDIR)
1870			sds = ntfs_dir_sds_entry_old;
1871		else
1872			sds = ntfs_file_sds_entry_old;
1873	}
1874	sd_size = le32_to_cpu(sds->length) - sizeof(SDS_ENTRY_HEADER);
1875	/*
1876	 * Insert the attribute.  This cannot fail as we are only called with
1877	 * an empty mft record so there must be enough space for our default
1878	 * security descriptor attribute which is tiny.
1879	 */
1880	if (ntfs_resident_attr_record_insert_internal(m, a,
1881			AT_SECURITY_DESCRIPTOR, NULL, 0, sd_size))
1882		panic("%s(): Failed to insert security descriptor "
1883				"attribute.\n", __FUNCTION__);
1884	/* Copy the chosen security descriptor into place. */
1885	memcpy((u8*)a + le16_to_cpu(a->value_offset), &sds->sd, sd_size);
1886	ntfs_debug("Done.");
1887}
1888
1889/**
1890 * ntfs_index_root_attribute_insert - add the empty, $I30 index root attribute
1891 * @vol:	volume to which the mft record belongs
1892 * @m:		mft record in which to insert the attribute
1893 * @a:		attribute in front of which to insert the new attribute
1894 *
1895 * Insert the empty, $I30 index root attribute into the mft record @m in front
1896 * of the attribute record @a.
1897 *
1898 * @vol is the volume the mft record @m belongs to and is used to determine the
1899 * the index block size as well as the number of clusters per index block.
1900 *
1901 * This function cannot fail.
1902 */
1903static void ntfs_index_root_attribute_insert(ntfs_volume *vol, MFT_RECORD *m,
1904		ATTR_RECORD *a)
1905{
1906	INDEX_ROOT *ir;
1907	INDEX_ENTRY_HEADER *ieh;
1908
1909	ntfs_debug("Entering.");
1910	/*
1911	 * Insert the attribute and initialize the value to zero.  This cannot
1912	 * fail as we are only called with an empty mft record so there must be
1913	 * enough space for the empty index root attribute.
1914	 */
1915	if (ntfs_resident_attr_record_insert_internal(m, a, AT_INDEX_ROOT, I30,
1916			4, sizeof(INDEX_ROOT) + sizeof(INDEX_ENTRY_HEADER)))
1917		panic("%s(): Failed to insert index root attribute.\n",
1918				__FUNCTION__);
1919	/* Set up the attribute value. */
1920	ir = (INDEX_ROOT*)((u8*)a + le16_to_cpu(a->value_offset));
1921	ir->type = AT_FILENAME;
1922	ir->collation_rule = COLLATION_FILENAME;
1923	ir->index_block_size = cpu_to_le32(vol->index_block_size);
1924	ir->blocks_per_index_block = vol->blocks_per_index_block;
1925	ir->index.entries_offset = const_cpu_to_le32(sizeof(INDEX_HEADER));
1926	ir->index.allocated_size = ir->index.index_length = const_cpu_to_le32(
1927			sizeof(INDEX_HEADER) + sizeof(INDEX_ENTRY_HEADER));
1928	/* SMALL_INDEX is zero and the attribute value is already zeroed. */
1929	/* ir->index.flags = SMALL_INDEX; */
1930	ieh = (INDEX_ENTRY_HEADER*)((u8*)ir + sizeof(INDEX_ROOT));
1931	ieh->length = const_cpu_to_le16(sizeof(INDEX_ENTRY_HEADER));
1932	ieh->flags = INDEX_ENTRY_END;
1933	ntfs_debug("Done.");
1934}
1935
1936/**
1937 * ntfs_mft_record_alloc - allocate an mft record on an ntfs volume
1938 * @vol:	[IN]  volume on which to allocate the mft record
1939 * @va:		[IN/OUT] vnode attributes to assign to the new inode or NULL
1940 * @cn:		[IN]  name of new inode (@va != NULL) or NULL (@va == NULL)
1941 * @base_ni:	[IN]  base inode (@va == NULL) or parent directory (@va != NULL)
1942 * @new_ni:	[OUT] on success this is the ntfs inode of the created inode
1943 * @new_m:	[OUT] on success this is the mapped mft record
1944 * @new_a:	[OUT] on success this is the attribute at which to insert
1945 *
1946 * Allocate an mft record in $MFT/$DATA of an open ntfs volume @vol and return
1947 * the ntfs inode of the created inode in *@new_ni, its mft record in *@new_m,
1948 * and *@new_a points to the attribute record in front of which the filename
1949 * attribute needs be inserted (if @va was not NULL, i.e. we allocated a base
1950 * mft record for a file or directory) or to the position at which the first
1951 * attribute in this mft record needs to be inserted (if @va is NULL, i.e. we
1952 * allocate an extent mft record).
1953 *
1954 * If @va is not NULL make the mft record a base mft record, i.e. a file or
1955 * directory inode, and allocate it at the default allocator position.  In this
1956 * case @va are the vnode attributes as given to us by the caller, @base_ni
1957 * is the ntfs inode of the parent directory, and @cn is the name of the new
1958 * inode.
1959 *
1960 * When allocating a base mft record the caller needs to do an
1961 *	ntfs_inode_unlock_alloc(*@new_ni);
1962 * to make the inode a full member of society by unlocking it and waking up any
1963 * waiters.  We do not do it here as the caller is likely to want to do more
1964 * work before unlocking the inode.
1965 *
1966 * Note that we only support some of the attributes that can be specified in
1967 * @va and we update @va to reflect the values we actually end up using.
1968 *
1969 * We in particular use @va to distinguish what type of inode is being created
1970 * (@va->va_type == VREG, VDIR, VLNK, VSOCK, VFIFO, VBLK, or VCHR,
1971 * respectively).  @va also gives us the creation_time to use
1972 * (@va->va_create_time) as well as the mode (@va->va_mode) and the file
1973 * attributes (@va->va_flags).  And for block and character device special file
1974 * nodes @va->va_rdev specifies the device.
1975 *
1976 * If @va is NULL, make the allocated mft record an extent record, allocate it
1977 * starting at the mft record after the base mft record and attach the
1978 * allocated and opened ntfs inode to the base inode @base_ni.  @cn is NULL.
1979 *
1980 * When allocating a base mft record, add the standard information attribute,
1981 * the security descriptor attribute (if needed) as well as the empty data
1982 * attribute (@va->va_type == VREG or VLNK), the empty index root attribute
1983 * (@va->va_type == VDIR) or the special flags and attributes for special
1984 * inodes (@va->va_type == VSOCK, VFIFO, VBLK, or VCHR).
1985 *
1986 * Return 0 on success and errno on error.  On error *@new_ni, *@new_m, and
1987 * *@new_a are not defined.
1988 *
1989 * Allocation strategy:
1990 *
1991 * To find a free mft record, we scan the mft bitmap for a zero bit.  To
1992 * optimize this we start scanning at the place specified by @base_ni or if
1993 * @base_ni is NULL we start where we last stopped and we perform wrap around
1994 * when we reach the end.  Note, we do not try to allocate mft records below
1995 * number 24 because numbers 0 to 15 are the defined system files anyway and 16
1996 * to 24 are special in that they are used for storing extension mft records
1997 * for the $DATA attribute of $MFT.  This is required to avoid the possibility
1998 * of creating a runlist with a circular dependency which once written to disk
1999 * can never be read in again.  Windows will only use records 16 to 24 for
2000 * normal files if the volume is completely out of space.  We never use them
2001 * which means that when the volume is really out of space we cannot create any
2002 * more files while Windows can still create up to 8 small files.  We can start
2003 * doing this at some later time, it does not matter much for now.
2004 *
2005 * When scanning the mft bitmap, we only search up to the last allocated mft
2006 * record.  If there are no free records left in the range 24 to number of
2007 * allocated mft records, then we extend the $MFT/$DATA attribute in order to
2008 * create free mft records.  We extend the allocated size of $MFT/$DATA by 16
2009 * records at a time or one cluster, if cluster size is above 16kiB.  If there
2010 * is not sufficient space to do this, we try to extend by a single mft record
2011 * or one cluster, if cluster size is above the mft record size.
2012 *
2013 * No matter how many mft records we allocate, we initialize only the first
2014 * allocated mft record, incrementing mft data size and initialized size
2015 * accordingly, open an ntfs_inode for it and return it to the caller, unless
2016 * there are less than 24 mft records, in which case we allocate and initialize
2017 * mft records until we reach record 24 which we consider as the first free mft
2018 * record for use by normal files.
2019 *
2020 * If during any stage we overflow the initialized data in the mft bitmap, we
2021 * extend the initialized size (and data size) by 8 bytes, allocating another
2022 * cluster if required.  The bitmap data size has to be at least equal to the
2023 * number of mft records in the mft, but it can be bigger, in which case the
2024 * superfluous bits are padded with zeroes.
2025 *
2026 * Thus, when we return success (i.e. zero), we will have:
2027 *	- initialized / extended the mft bitmap if necessary,
2028 *	- initialized / extended the mft data if necessary,
2029 *	- set the bit corresponding to the mft record being allocated in the
2030 *	  mft bitmap,
2031 *	- opened an ntfs_inode for the allocated mft record, and we will have
2032 *	- returned the ntfs_inode as well as the allocated and mapped mft
2033 *	  record.
2034 *
2035 * On error, the volume will be left in a consistent state and no record will
2036 * be allocated.  If rolling back a partial operation fails, we may leave some
2037 * inconsistent metadata in which case we set NVolErrors() so the volume is
2038 * left dirty when unmounted.
2039 *
2040 * Note, this function cannot make use of most of the normal functions, like
2041 * for example for attribute resizing, etc, because when the run list overflows
2042 * the base mft record and an attribute list is used, it is very important that
2043 * the extension mft records used to store the $DATA attribute of $MFT can be
2044 * reached without having to read the information contained inside them, as
2045 * this would make it impossible to find them in the first place after the
2046 * volume is unmounted.  $MFT/$BITMAP probably does not need to follow this
2047 * rule because the bitmap is not essential for finding the mft records, but on
2048 * the other hand, handling the bitmap in this special way would make life
2049 * easier because otherwise there might be circular invocations of functions
2050 * when reading the bitmap.
2051 */
2052errno_t ntfs_mft_record_alloc(ntfs_volume *vol, struct vnode_attr *va,
2053		struct componentname *cn, ntfs_inode *base_ni,
2054		ntfs_inode **new_ni, MFT_RECORD **new_m,
2055		ATTR_RECORD **new_a)
2056{
2057	s64 bit, ll, old_data_initialized, old_data_size, old_mft_data_pos;
2058	s64 nr_mft_records_added;
2059	ntfs_inode *mft_ni, *mftbmp_ni, *ni;
2060	MFT_RECORD *m;
2061	ntfs_attr_search_ctx *ctx;
2062	ATTR_RECORD *a;
2063	buf_t buf;
2064	errno_t err, err2;
2065	le16 seq_no, usn;
2066	BOOL record_formatted, mark_sizes_dirty, dirty_buf;
2067	BOOL mft_ni_write_locked;
2068
2069	ntfs_debug("Entering (allocating a%s mft record, %s 0x%llx).",
2070			va ? " base" : "n extent",
2071			va ? "parent directory" : "base mft record",
2072			(unsigned long long)base_ni->mft_no);
2073	if (!new_ni || !new_m || !new_a)
2074		panic("%s(): !new_ni || !new_m || !new_a\n", __FUNCTION__);
2075	if (!base_ni)
2076		panic("%s(): !base_ni\n", __FUNCTION__);
2077	lck_rw_lock_exclusive(&vol->mftbmp_lock);
2078	/*
2079	 * Get an iocount reference on the mft and mftbmp vnodes.
2080	 *
2081	 * We do not bother with the iocount reference on the mft if @va is
2082	 * NULL, i.e. we are allocating an extent mft record, because in that
2083	 * case the base mft record @ni is already mapped thus an iocount
2084	 * reference is already held on the mft.
2085	 */
2086	mft_ni = vol->mft_ni;
2087	if (va) {
2088		err = vnode_get(mft_ni->vn);
2089		if (err) {
2090			ntfs_error(vol->mp, "Failed to get vnode for $MFT.");
2091			lck_rw_unlock_exclusive(&vol->mftbmp_lock);
2092			return err;
2093		}
2094	}
2095	mftbmp_ni = vol->mftbmp_ni;
2096	err = vnode_get(mftbmp_ni->vn);
2097	if (err) {
2098		ntfs_error(vol->mp, "Failed to get vnode for $MFT/$Bitmap.");
2099		if (va)
2100			(void)vnode_put(mft_ni->vn);
2101		lck_rw_unlock_exclusive(&vol->mftbmp_lock);
2102		return err;
2103	}
2104retry_mftbmp_alloc:
2105	record_formatted = mark_sizes_dirty = dirty_buf = FALSE;
2106	lck_rw_lock_exclusive(&mftbmp_ni->lock);
2107	err = ntfs_mft_bitmap_find_and_alloc_free_rec_nolock(vol,
2108			va ? NULL : base_ni, &bit);
2109	if (!err) {
2110		ntfs_debug("Found and allocated free record (#1), bit 0x%llx.",
2111				(unsigned long long)bit);
2112		goto have_alloc_rec;
2113	}
2114	if (err != ENOSPC)
2115		goto unl_err;
2116	/*
2117	 * No free mft records left.  If the mft bitmap already covers more
2118	 * than the currently used mft records, the next records are all free,
2119	 * so we can simply allocate the first unused mft record.
2120	 *
2121	 * Note: We also have to make sure that the mft bitmap at least covers
2122	 * the first 24 mft records as they are special and whilst they may not
2123	 * be in use, we do not allocate from them.
2124	 */
2125	lck_spin_lock(&mft_ni->size_lock);
2126	ll = mft_ni->initialized_size >> vol->mft_record_size_shift;
2127	lck_spin_unlock(&mft_ni->size_lock);
2128	lck_spin_lock(&mftbmp_ni->size_lock);
2129	old_data_initialized = mftbmp_ni->initialized_size;
2130	lck_spin_unlock(&mftbmp_ni->size_lock);
2131	if (old_data_initialized << 3 > ll && old_data_initialized > 3) {
2132		bit = ll;
2133		if (bit < 24)
2134			bit = 24;
2135		/*
2136		 * To be in line with what Windows allows we restrict the total
2137		 * number of mft records to 2^32.
2138		 */
2139		if (bit >= (1LL << 32))
2140			goto max_err;
2141		ntfs_debug("Found free record (#2), bit 0x%llx.",
2142				(unsigned long long)bit);
2143		goto found_free_rec;
2144	}
2145	/*
2146	 * The mft bitmap needs to be extended until it covers the first unused
2147	 * mft record that we can allocate.
2148	 *
2149	 * Note: The smallest mft record we allocate is mft record 24.
2150	 */
2151	bit = old_data_initialized << 3;
2152	/*
2153	 * To be in line with what Windows allows we restrict the total number
2154	 * of mft records to 2^32.
2155	 */
2156	if (bit >= (1LL << 32))
2157		goto max_err;
2158	lck_spin_lock(&mftbmp_ni->size_lock);
2159	old_data_size = mftbmp_ni->allocated_size;
2160	ntfs_debug("Status of mftbmp before extension: allocated_size 0x%llx, "
2161			"data_size 0x%llx, initialized_size 0x%llx.",
2162			(unsigned long long)old_data_size,
2163			(unsigned long long)mftbmp_ni->data_size,
2164			(unsigned long long)old_data_initialized);
2165	lck_spin_unlock(&mftbmp_ni->size_lock);
2166	if (old_data_initialized + 8 > old_data_size) {
2167		/* Need to extend bitmap by one more cluster. */
2168		ntfs_debug("mftbmp: initialized_size + 8 > allocated_size.");
2169		err = ntfs_mft_bitmap_extend_allocation_nolock(vol);
2170		if (err)
2171			goto unl_err;
2172#ifdef DEBUG
2173		lck_spin_lock(&mftbmp_ni->size_lock);
2174		ntfs_debug("Status of mftbmp after allocation extension: "
2175				"allocated_size 0x%llx, data_size 0x%llx, "
2176				"initialized_size 0x%llx.",
2177				(unsigned long long)mftbmp_ni->allocated_size,
2178				(unsigned long long)mftbmp_ni->data_size,
2179				(unsigned long long)
2180				mftbmp_ni->initialized_size);
2181		lck_spin_unlock(&mftbmp_ni->size_lock);
2182#endif /* DEBUG */
2183	}
2184	/*
2185	 * We now have sufficient allocated space, extend the initialized_size
2186	 * as well as the data_size if necessary and fill the new space with
2187	 * zeroes.
2188	 */
2189	err = ntfs_mft_bitmap_extend_initialized_nolock(vol);
2190	if (err)
2191		goto unl_err;
2192#ifdef DEBUG
2193	lck_spin_lock(&mftbmp_ni->size_lock);
2194	ntfs_debug("Status of mftbmp after initialized extension: "
2195			"allocated_size 0x%llx, data_size 0x%llx, "
2196			"initialized_size 0x%llx.",
2197			(unsigned long long)mftbmp_ni->allocated_size,
2198			(unsigned long long)mftbmp_ni->data_size,
2199			(unsigned long long)mftbmp_ni->initialized_size);
2200	lck_spin_unlock(&mftbmp_ni->size_lock);
2201#endif /* DEBUG */
2202	ntfs_debug("Found free record (#3), bit 0x%llx.",
2203			(unsigned long long)bit);
2204found_free_rec:
2205	/* @bit is the found free mft record, allocate it in the mft bitmap. */
2206	ntfs_debug("At found_free_rec.");
2207	err = ntfs_bitmap_set_bit(mftbmp_ni, bit);
2208	if (err) {
2209		ntfs_error(vol->mp, "Failed to allocate bit in mft bitmap.");
2210		goto unl_err;
2211	}
2212	ntfs_debug("Set bit 0x%llx in mft bitmap.", (unsigned long long)bit);
2213have_alloc_rec:
2214	lck_rw_unlock_exclusive(&mftbmp_ni->lock);
2215	/*
2216	 * The mft bitmap is now uptodate.  Deal with mft data attribute now.
2217	 * Note, we keep hold of the mft bitmap lock for writing until all
2218	 * modifications to the mft data attribute are complete, too, as they
2219	 * will impact decisions for mft bitmap and mft record allocation done
2220	 * by a parallel allocation and if the lock is not maintained a
2221	 * parallel allocation could decide to allocate the same mft record as
2222	 * this one.
2223	 */
2224	lck_rw_lock_shared(&mft_ni->lock);
2225	mft_ni_write_locked = FALSE;
2226mft_relocked:
2227	ll = (bit + 1) << vol->mft_record_size_shift;
2228	lck_spin_lock(&mft_ni->size_lock);
2229	old_data_initialized = mft_ni->initialized_size;
2230	lck_spin_unlock(&mft_ni->size_lock);
2231	if (ll <= old_data_initialized) {
2232		ntfs_debug("Allocated mft record already initialized.");
2233		goto mft_rec_already_initialized;
2234	}
2235	if (!mft_ni_write_locked) {
2236		mft_ni_write_locked = TRUE;
2237		if (!lck_rw_lock_shared_to_exclusive(&mft_ni->lock)) {
2238			lck_rw_lock_exclusive(&mft_ni->lock);
2239			goto mft_relocked;
2240		}
2241	}
2242	ntfs_debug("Initializing allocated mft record.");
2243	/*
2244	 * The mft record is outside the initialized data.  Extend the mft data
2245	 * attribute until it covers the allocated record.  The loop is only
2246	 * actually traversed more than once when a freshly formatted volume is
2247	 * first written to so it optimizes away nicely in the common case.
2248	 */
2249	lck_spin_lock(&mft_ni->size_lock);
2250	ntfs_debug("Status of mft data before extension: "
2251			"allocated_size 0x%llx, data_size 0x%llx, "
2252			"initialized_size 0x%llx.",
2253			(unsigned long long)mft_ni->allocated_size,
2254			(unsigned long long)mft_ni->data_size,
2255			(unsigned long long)mft_ni->initialized_size);
2256	while (ll > mft_ni->allocated_size) {
2257		lck_spin_unlock(&mft_ni->size_lock);
2258		err = ntfs_mft_data_extend_allocation_nolock(vol);
2259		if (err) {
2260			ntfs_error(vol->mp, "Failed to extend mft data "
2261					"allocation.");
2262			lck_rw_unlock_exclusive(&mft_ni->lock);
2263			goto undo_mftbmp_alloc_locked;
2264		}
2265		lck_spin_lock(&mft_ni->size_lock);
2266		ntfs_debug("Status of mft data after allocation extension: "
2267				"allocated_size 0x%llx, data_size 0x%llx, "
2268				"initialized_size 0x%llx.",
2269				(unsigned long long)mft_ni->allocated_size,
2270				(unsigned long long)mft_ni->data_size,
2271				(unsigned long long)mft_ni->initialized_size);
2272	}
2273	lck_spin_unlock(&mft_ni->size_lock);
2274	/*
2275	 * Extend mft data initialized size (and data size of course) to reach
2276	 * the allocated mft record, formatting the mft records along the way.
2277	 *
2278	 * Note: We only modify the ntfs_inode structure as that is all that is
2279	 * needed by ntfs_mft_record_format().  We will update the attribute
2280	 * record itself in one fell swoop later on.
2281	 */
2282	lck_spin_lock(&mft_ni->size_lock);
2283	old_data_initialized = mft_ni->initialized_size;
2284	old_data_size = mft_ni->data_size;
2285	nr_mft_records_added = 0;
2286	if (old_data_size != ubc_getsize(mft_ni->vn))
2287		panic("%s(): old_data_size != ubc_getsize(mft_ni->vn)\n",
2288				__FUNCTION__);
2289	while (ll > mft_ni->initialized_size) {
2290		s64 new_initialized_size, mft_no;
2291
2292		new_initialized_size = mft_ni->initialized_size +
2293				vol->mft_record_size;
2294		mft_no = mft_ni->initialized_size >> vol->mft_record_size_shift;
2295		ntfs_debug("mft_no 0x%llx, new_initialized_size 0x%llx, "
2296				"initialized_size 0x%llx, data_size 0x%llx.",
2297				(unsigned long long)mft_no,
2298				(unsigned long long)new_initialized_size,
2299				(unsigned long long)mft_ni->initialized_size,
2300				(unsigned long long)mft_ni->data_size);
2301		if (new_initialized_size > mft_ni->data_size) {
2302			/* Increment the number of newly added mft records. */
2303			nr_mft_records_added += (new_initialized_size -
2304					mft_ni->data_size) >>
2305					vol->mft_record_size_shift;
2306			ntfs_debug("Updating data size and ubc size, "
2307					"nr_mft_records_added %lld.",
2308					(long long)nr_mft_records_added);
2309			mft_ni->data_size = new_initialized_size;
2310			lck_spin_unlock(&mft_ni->size_lock);
2311			if (!ubc_setsize(mft_ni->vn, new_initialized_size))
2312				panic("%s(): ubc_setsize() failed.\n",
2313						__FUNCTION__);
2314			mark_sizes_dirty = TRUE;
2315		} else
2316			lck_spin_unlock(&mft_ni->size_lock);
2317		ntfs_debug("Initializing mft record 0x%llx.",
2318				(unsigned long long)mft_no);
2319		/*
2320		 * ntfs_mft_record_format() updates the initialized size in
2321		 * @mft_ni.
2322		 */
2323		err = ntfs_mft_record_format(vol, mft_no, new_initialized_size);
2324		if (err) {
2325			ntfs_error(vol->mp, "Failed to format mft record.");
2326			goto undo_data_init;
2327		}
2328		lck_spin_lock(&mft_ni->size_lock);
2329	}
2330	lck_spin_unlock(&mft_ni->size_lock);
2331	record_formatted = TRUE;
2332	/*
2333	 * Update the mft data attribute record to reflect the new sizes.
2334	 *
2335	 * When mapping the mft record for the mft we communicate the fact that
2336	 * we hold the lock on the mft inode @mft_ni->lock for writing so it
2337	 * does not try to take the lock.
2338	 */
2339	err = ntfs_mft_record_map_ext(mft_ni, &m, TRUE);
2340	if (err) {
2341		ntfs_error(vol->mp, "Failed to map mft record.");
2342		goto undo_data_init;
2343	}
2344	ctx = ntfs_attr_search_ctx_get(mft_ni, m);
2345	if (!ctx) {
2346		ntfs_error(vol->mp, "Failed to get search context.");
2347		err = ENOMEM;
2348		ntfs_mft_record_unmap(mft_ni);
2349		goto undo_data_init;
2350	}
2351	/*
2352	 * We have the mft lock taken for write.  Communicate this fact to
2353	 * ntfs_attr_lookup() and hence to ntfs_extent_mft_record_map_ext() and
2354	 * ntfs_mft_record_map_ext() so that they know not to try to take the
2355	 * same lock.
2356	 */
2357	ctx->is_mft_locked = 1;
2358	err = ntfs_attr_lookup(mft_ni->type, mft_ni->name, mft_ni->name_len,
2359			0, NULL, 0, ctx);
2360	if (err) {
2361		ntfs_error(vol->mp, "Failed to find first attribute extent of "
2362				"mft data attribute.");
2363		ntfs_attr_search_ctx_put(ctx);
2364		ntfs_mft_record_unmap(mft_ni);
2365		goto undo_data_init;
2366	}
2367	a = ctx->a;
2368	lck_spin_lock(&mft_ni->size_lock);
2369	a->initialized_size = cpu_to_sle64(mft_ni->initialized_size);
2370	a->data_size = cpu_to_sle64(mft_ni->data_size);
2371	/*
2372	 * We have created new mft records thus update the cached numbers of
2373	 * total and free mft records to reflect this.
2374	 */
2375	vol->nr_mft_records = mft_ni->data_size >> vol->mft_record_size_shift;
2376	vol->nr_free_mft_records += nr_mft_records_added;
2377	if (vol->nr_free_mft_records >= vol->nr_mft_records)
2378		panic("%s(): vol->nr_free_mft_records > vol->nr_mft_records\n",
2379				__FUNCTION__);
2380	lck_spin_unlock(&mft_ni->size_lock);
2381	/* Ensure the changes make it to disk. */
2382	NInoSetMrecNeedsDirtying(ctx->ni);
2383	ntfs_attr_search_ctx_put(ctx);
2384	ntfs_mft_record_unmap(mft_ni);
2385	/*
2386	 * If we have modified the size of the base inode, cause the sizes to
2387	 * be written to all the directory index entries pointing to the base
2388	 * inode when the inode is written to disk.
2389	 */
2390	if (mark_sizes_dirty)
2391		NInoSetDirtySizes(mft_ni);
2392	lck_spin_lock(&mft_ni->size_lock);
2393	ntfs_debug("Status of mft data after mft record initialization: "
2394			"allocated_size 0x%llx, data_size 0x%llx, "
2395			"initialized_size 0x%llx.",
2396			(unsigned long long)mft_ni->allocated_size,
2397			(unsigned long long)mft_ni->data_size,
2398			(unsigned long long)mft_ni->initialized_size);
2399	if (mft_ni->data_size != ubc_getsize(mft_ni->vn))
2400		panic("%s(): mft_ni->data_size != ubc_getsize(mft_ni->vn)\n",
2401				__FUNCTION__);
2402	if (mft_ni->data_size > mft_ni->allocated_size)
2403		panic("%s(): mft_ni->data_size > mft_ni->allocated_size\n",
2404				__FUNCTION__);
2405	if (mft_ni->initialized_size > mft_ni->data_size)
2406		panic("%s(): mft_ni->initialized_size > mft_ni->data_size\n",
2407				__FUNCTION__);
2408	lck_spin_unlock(&mft_ni->size_lock);
2409	lck_rw_lock_exclusive_to_shared(&mft_ni->lock);
2410mft_rec_already_initialized:
2411	/*
2412	 * Update the default mft allocation position.  We have to do this now
2413	 * even if we fail later and deallocate the mft record because we are
2414	 * about to drop the mftbmp_lock so we cannot touch vol->mft_data_pos
2415	 * later on.  We save the old value so we can restore it on error.
2416	 */
2417	old_mft_data_pos = vol->mft_data_pos;
2418	vol->mft_data_pos = bit + 1;
2419	/*
2420	 * We have allocated an mft record thus decrement the cached number of
2421	 * free mft records to reflect this.
2422	 */
2423	vol->nr_free_mft_records--;
2424	if (vol->nr_free_mft_records < 0)
2425		vol->nr_free_mft_records = 0;
2426	/*
2427	 * We can finally drop the mft bitmap lock as the mft data attribute
2428	 * has been fully updated.  The only disparity left is that the
2429	 * allocated mft record still needs to be marked as in use to match the
2430	 * set bit in the mft bitmap but this is actually not a problem since
2431	 * this mft record is not referenced from anywhere yet and the fact
2432	 * that it is allocated in the mft bitmap means that no-one will try to
2433	 * allocate it either.
2434	 */
2435	lck_rw_unlock_exclusive(&vol->mftbmp_lock);
2436	/*
2437	 * We now have allocated and initialized the mft record.
2438	 *
2439	 * Read and map the buffer containing the mft record.
2440	 */
2441	err = buf_meta_bread(mft_ni->vn, bit, vol->mft_record_size, NOCRED,
2442			&buf);
2443	if (err) {
2444		ntfs_error(vol->mp, "Failed to read buffer of mft record "
2445				"0x%llx (error %d).", (unsigned long long)bit,
2446				err);
2447		goto undo_mftbmp_alloc;
2448	}
2449	err = buf_map(buf, (caddr_t*)&m);
2450	if (err) {
2451		ntfs_error(vol->mp, "Failed to map buffer of mft record "
2452				"0x%llx (error %d).", (unsigned long long)bit,
2453				err);
2454		goto undo_mftbmp_alloc;
2455	}
2456	/* If we just formatted the mft record no need to do it again. */
2457	if (!record_formatted) {
2458		/*
2459		 * Sanity check that the mft record is really not in use.  If
2460		 * it is in use then warn the user about this inconsistency,
2461		 * mark the volume as dirty to force chkdsk to run, and try to
2462		 * allocate another mft record.  As we have already set the mft
2463		 * bitmap bit this means we have "repaired" the inconsistency.
2464		 * Of course we may now have an mft record that is marked in
2465		 * use correctly but that is not referenced from anywhere at
2466		 * all but chkdsk should hopefully fix this case by either
2467		 * recovering the mft record by linking it somewhere or by
2468		 * properly freeing the mft record.
2469		 *
2470		 * TODO: Need to test what chkdsk does exactly.  For example if
2471		 * it only clears the bit in the mft bitmap but leaves the mft
2472		 * record marked in use we would detect this here as corruption
2473		 * again and set the bitmap bit back to one and thus end up
2474		 * with a vicious circle.  So we need to figure out what chkdsk
2475		 * does and adjust our handling here appropriately.
2476		 */
2477		if (ntfs_is_file_record(m->magic) &&
2478				m->flags & MFT_RECORD_IN_USE) {
2479			ntfs_warning(vol->mp, "Mft record 0x%llx was marked "
2480					"free in mft bitmap but is marked "
2481					"used itself.  Marking it used in mft "
2482					"bitmap.  This indicates a corrupt "
2483					"file system.  Unmount and run "
2484					"chkdsk.", (unsigned long long)bit);
2485			err = buf_unmap(buf);
2486			if (err)
2487				ntfs_error(vol->mp, "Failed to unmap buffer "
2488						"of mft record 0x%llx (error "
2489						"%d).",
2490						(unsigned long long)bit, err);
2491			buf_brelse(buf);
2492			lck_rw_unlock_shared(&mft_ni->lock);
2493			lck_rw_lock_exclusive(&vol->mftbmp_lock);
2494			NVolSetErrors(vol);
2495			goto retry_mftbmp_alloc;
2496		}
2497		/*
2498		 * We need to (re-)format the mft record, preserving the
2499		 * sequence number if it is not zero as well as the update
2500		 * sequence number if it is not zero or -1 (0xffff).  This
2501		 * means we do not need to care whether or not something went
2502		 * wrong with the previous mft record.
2503		 */
2504		seq_no = m->sequence_number;
2505		usn = 0;
2506		if (le16_to_cpu(m->usa_ofs) < NTFS_BLOCK_SIZE - sizeof(u16))
2507			usn = *(le16*)((u8*)m + le16_to_cpu(m->usa_ofs));
2508		err = ntfs_mft_record_lay_out(vol, bit, m);
2509		if (err) {
2510			ntfs_error(vol->mp, "Failed to lay out allocated mft "
2511					"record 0x%llx.",
2512					(unsigned long long)bit);
2513			goto unmap_undo_mftbmp_alloc;
2514		}
2515		if (seq_no)
2516			m->sequence_number = seq_no;
2517		if (usn && usn != 0xffff)
2518			*(le16*)((u8*)m + le16_to_cpu(m->usa_ofs)) = usn;
2519	}
2520	/* Set the mft record itself in use. */
2521	m->flags |= MFT_RECORD_IN_USE;
2522	if (!va) {
2523		/*
2524		 * Record the sequence number so we can supply it as part of
2525		 * the mft reference when mapping the extent mft record below
2526		 * which ensures that we get back the same mft record we
2527		 * expected.
2528		 */
2529		seq_no = m->sequence_number;
2530		/*
2531		 * Setup the base mft record in the extent mft record.  This
2532		 * completes initialization of the allocated extent mft record
2533		 * and we can simply use it with ntfs_extent_mft_record_map().
2534		 */
2535		m->base_mft_record = MK_LE_MREF(base_ni->mft_no,
2536				base_ni->seq_no);
2537		/*
2538		 * Need to release the page so that we can call
2539		 * ntfs_extent_mft_record_map().  We also set the page dirty to
2540		 * ensure that it does not get thrown out under VM pressure
2541		 * before we get it with the ntfs_extent_mft_record_map() call.
2542		 *
2543		 * FIXME: This could be optimized by modifying
2544		 * ntfs_extent_mft_record_map() to take an optional mft record,
2545		 * i.e. @m, and if supplied using this instead of trying to map
2546		 * the extent mft record.   Alternatively we could unlock the
2547		 * page but not release it but this cannot be done in OS X
2548		 * (yet).
2549		 *
2550		 * Allocate an extent inode structure for the new mft record,
2551		 * attach it to the base inode @base_ni and map its, i.e. the
2552		 * allocated, mft record.
2553		 */
2554		err = buf_unmap(buf);
2555		if (err)
2556			ntfs_error(vol->mp, "Failed to unmap buffer of mft "
2557					"record 0x%llx (error %d).",
2558					(unsigned long long)bit, err);
2559		err = buf_bdwrite(buf);
2560		if (err) {
2561			ntfs_error(vol->mp, "Failed to write buffer of mft "
2562					"record 0x%llx (error %d).  Run "
2563					"chkdsk.", (unsigned long long)bit,
2564					err);
2565			NVolSetErrors(vol);
2566			lck_rw_unlock_shared(&mft_ni->lock);
2567			goto free_undo_mftbmp_alloc;
2568		}
2569		err = ntfs_extent_mft_record_map_ext(base_ni, MK_MREF(bit,
2570				le16_to_cpu(seq_no)), &ni, &m, TRUE);
2571		lck_rw_unlock_shared(&mft_ni->lock);
2572		if (err) {
2573			ntfs_error(vol->mp, "Failed to map allocated mft "
2574					"record 0x%llx (error %d).",
2575					(unsigned long long)bit, err);
2576			goto free_undo_mftbmp_alloc;
2577		}
2578		/* This is where the first attribute needs to be inserted. */
2579		*new_a = (ATTR_RECORD*)((u8*)m + le16_to_cpu(m->attrs_offset));
2580	} else {
2581		FILE_ATTR_FLAGS file_attrs;
2582		le32 security_id;
2583		ntfs_attr na;
2584
2585		/*
2586		 * Mirror the file attribute flags we want to inherit from the
2587		 * parent directory.
2588		 */
2589		file_attrs = base_ni->file_attributes & (FILE_ATTR_ENCRYPTED |
2590				FILE_ATTR_NOT_CONTENT_INDEXED |
2591				FILE_ATTR_COMPRESSED | FILE_ATTR_SPARSE_FILE);
2592		switch (va->va_type) {
2593		case VDIR:
2594			m->flags |= MFT_RECORD_IS_DIRECTORY;
2595			break;
2596		case VSOCK:
2597		case VFIFO:
2598		case VBLK:
2599		case VCHR:
2600			/*
2601			 * We use the same way of implementing special inodes
2602			 * as Services For Unix uses on Windows thus we set the
2603			 * FILE_ATTR_SYSTEM file attribute.
2604			 */
2605			file_attrs |= FILE_ATTR_SYSTEM;
2606			/*
2607			 * It makes no sense for a special inode to be
2608			 * encrypted or compressed so clear those flags.
2609			 */
2610			file_attrs &= ~(FILE_ATTR_ENCRYPTED |
2611					FILE_ATTR_COMPRESSED);
2612		default:
2613			file_attrs |= FILE_ATTR_ARCHIVE;
2614			/*
2615			 * FIXME: We do not implement writing to compressed or
2616			 * encrypted files yet, so we clear the corresponding
2617			 * bits in the file attribute flags for now.
2618			 */
2619			file_attrs &= ~(FILE_ATTR_ENCRYPTED |
2620					FILE_ATTR_COMPRESSED);
2621		}
2622		/*
2623		 * Determine whether we need to insert a Win2k+ style standard
2624		 * information attribute or an NT4 style one.  For NTFS 1.x
2625		 * volumes, we always insert NT4 style standard information
2626		 * attributes whilst for newer volumes we decide depending on
2627		 * the value of NVolUseSDAttr().  If NVolUseSDAttr() is set, we
2628		 * are to specify security descriptors by creating security
2629		 * descriptor attributes and in this case we have to use the
2630		 * NT4 style standard information attribute.  If it is clear,
2631		 * we are to specify security descriptors by security_id
2632		 * reference into $Secure system file and in this case we have
2633		 * to use the Win2k+ style standard information attribute.
2634		 *
2635		 * To make things simpler, if this is an NTFS 1.x volume,
2636		 * NVolUseSDAttr() has been set so we only need to test for it.
2637		 */
2638		if (NVolUseSDAttr(vol))
2639			security_id = 0;
2640		else {
2641			BOOL is_retry = FALSE;
2642retry:
2643			lck_spin_lock(&vol->security_id_lock);
2644			if (va->va_type == VDIR)
2645				security_id = vol->default_dir_security_id;
2646			else
2647				security_id = vol->default_file_security_id;
2648			lck_spin_unlock(&vol->security_id_lock);
2649			/*
2650			 * If the default security_id is not initialized, try
2651			 * to initialize it now and should the initialization
2652			 * fail, use a security descriptor attribute and hence
2653			 * an NT4 style standard information attribute.
2654			 */
2655			if (!security_id && !is_retry) {
2656				if (!ntfs_default_security_id_init(vol, va)) {
2657					is_retry = TRUE;
2658					goto retry;
2659				}
2660			}
2661		}
2662		a = (ATTR_RECORD*)((u8*)m + le16_to_cpu(m->attrs_offset));
2663		/* Add the standard information attribute. */
2664		ntfs_standard_info_attribute_insert(m, a, file_attrs,
2665				security_id, &va->va_create_time);
2666		a = (ATTR_RECORD*)((u8*)a + le32_to_cpu(a->length));
2667		/*
2668		 * If @security_id is zero, add the security descriptor
2669		 * attribute.  If it is not zero, we have already set the
2670		 * security_id in the standard information attribute to
2671		 * reference our security descriptor in $Secure.
2672		 */
2673		if (!security_id) {
2674			/* Add the security descriptor attribute. */
2675			ntfs_sd_attribute_insert(vol, m, a, va);
2676			a = (ATTR_RECORD*)((u8*)a + le32_to_cpu(a->length));
2677		}
2678		if (va->va_type == VDIR) {
2679			/* Add the empty, $I30 index root attribute. */
2680			ntfs_index_root_attribute_insert(vol, m, a);
2681		} else {
2682			INTX_FILE *ix;
2683			u32 data_len;
2684
2685			/*
2686			 * FIXME: For encrypted files, we need to add an empty,
2687			 * non-resident $DATA attribute and we need to add the
2688			 * $EFS attribute.  For now, we should never get here
2689			 * as we clear the encrypted bit above because we do
2690			 * not support creating encrypted files.
2691			 */
2692			if (file_attrs & FILE_ATTR_ENCRYPTED)
2693				panic("%s(): file_attrs & "
2694						"FILE_ATTR_ENCRYPTED\n",
2695						__FUNCTION__);
2696			switch (va->va_type) {
2697			case VBLK:
2698			case VCHR:
2699				/*
2700				 * In Services for Unix on Windows, a device
2701				 * special file is a system file whose $DATA
2702				 * attribute contains the INTX_FILE structure.
2703				 */
2704				data_len = offsetof(INTX_FILE, device) +
2705						sizeof(ix->device);
2706				break;
2707			case VSOCK:
2708				/*
2709				 * In Services for Unix on Windows, a socket is
2710				 * a system file with a $DATA attribute of
2711				 * length 1.
2712				 */
2713				data_len = 1;
2714				break;
2715			case VFIFO:
2716				/*
2717				 * On Services for Unix on Windows, a fifo is a
2718				 * system file with a zero-length $DATA
2719				 * attribute so fall through to the default
2720				 * case.
2721				 */
2722			default:
2723				data_len = 0;
2724				break;
2725			}
2726			/*
2727			 * Insert the empty, resident $DATA attribute.  This
2728			 * cannot fail as we are dealing with an empty mft
2729			 * record so there must be enough space for an empty
2730			 * $DATA attribute.
2731			 */
2732			if (ntfs_resident_attr_record_insert_internal(m, a,
2733					AT_DATA, NULL, 0, data_len))
2734				panic("%s(): Failed to insert resident data "
2735						"attribute.\n", __FUNCTION__);
2736			/*
2737			 * If this is a device special inode then set up the
2738			 * INTX_FILE structure inside the created $DATA
2739			 * attribute.
2740			 */
2741			if (va->va_type == VBLK || va->va_type == VCHR) {
2742				ix = (INTX_FILE*)((u8*)a +
2743						le16_to_cpu(a->value_offset));
2744				if (va->va_type == VBLK)
2745					ix->magic = INTX_BLOCK_DEVICE;
2746				else
2747					ix->magic = INTX_CHAR_DEVICE;
2748				ix->device.major = cpu_to_le64(
2749						major(va->va_rdev));
2750				ix->device.minor = cpu_to_le64(
2751						minor(va->va_rdev));
2752			}
2753		}
2754		/* Allocate a new ntfs inode and set it up. */
2755		na = (ntfs_attr) {
2756			.mft_no = bit,
2757			.type = AT_UNUSED,
2758			.raw = FALSE,
2759		};
2760		ni = ntfs_inode_hash_get(vol, &na);
2761		if (!ni) {
2762			ntfs_error(vol->mp, "Failed to allocate ntfs inode "
2763					"(ENOMEM).");
2764			err = ENOMEM;
2765			/* Set the mft record itself not in use. */
2766			m->flags &= ~MFT_RECORD_IN_USE;
2767			dirty_buf = TRUE;
2768			goto unmap_undo_mftbmp_alloc;
2769		}
2770		/*
2771		 * This inode cannot still be in the inode cache as we would
2772		 * have removed it when it was deleted last time.
2773		 */
2774		if (!NInoAlloc(ni))
2775			panic("%s(): !NInoAlloc(ni)\n", __FUNCTION__);
2776		ni->seq_no = le16_to_cpu(m->sequence_number);
2777		/*
2778		 * Set the appropriate mode, attribute type, and name.  For
2779		 * directories, also set up the index values to the defaults.
2780		 */
2781		ni->mode |= ACCESSPERMS;
2782		if (va->va_type == VDIR) {
2783			ni->mode |= S_IFDIR;
2784			ni->mode &= ~vol->dmask;
2785			NInoSetMstProtected(ni);
2786			ni->type = AT_INDEX_ALLOCATION;
2787			ni->name = I30;
2788			ni->name_len = 4;
2789			ni->vcn_size = 0;
2790			ni->collation_rule = 0;
2791			ni->vcn_size_shift = 0;
2792		} else /* if (va->va_type == VREG || va->va_type == VLNK) */ {
2793			switch (va->va_type) {
2794			case VREG:
2795				ni->mode |= S_IFREG;
2796				break;
2797			case VLNK:
2798				ni->mode |= S_IFLNK;
2799				break;
2800			case VSOCK:
2801				ni->mode |= S_IFSOCK;
2802				break;
2803			case VFIFO:
2804				ni->mode |= S_IFIFO;
2805				break;
2806			case VBLK:
2807				ni->mode |= S_IFBLK;
2808				ni->rdev = va->va_rdev;
2809				break;
2810			case VCHR:
2811				ni->mode |= S_IFCHR;
2812				ni->rdev = va->va_rdev;
2813				break;
2814			default:
2815				panic("%s(): Should never have gotten here "
2816						"for va->va_type 0x%x.\n",
2817						__FUNCTION__, va->va_type);
2818			}
2819			if (!S_ISLNK(ni->mode))
2820				ni->mode &= ~vol->fmask;
2821			ni->type = AT_DATA;
2822			/* ni->name = NULL; */
2823			/* ni->name_len = 0; */
2824			if (file_attrs & FILE_ATTR_COMPRESSED) {
2825				// TODO: Set up all the @ni->compress* fields...
2826				// For now it does not matter as we do not
2827				// allow creation of compressed files.
2828				panic("%s(): file_attrs & "
2829						"FILE_ATTR_COMPRESSED\n",
2830						__FUNCTION__);
2831			}
2832		}
2833		ni->file_attributes = file_attrs;
2834		if (file_attrs & FILE_ATTR_COMPRESSED)
2835			NInoSetCompressed(ni);
2836		if (file_attrs & FILE_ATTR_ENCRYPTED)
2837			NInoSetEncrypted(ni);
2838		if (file_attrs & FILE_ATTR_SPARSE_FILE)
2839			NInoSetSparse(ni);
2840		ni->last_access_time = ni->last_mft_change_time =
2841				ni->last_data_change_time = ni->creation_time =
2842				va->va_create_time;
2843		/* Initialize the backup time and Finder info cache. */
2844		ntfs_inode_afpinfo_cache(ni, NULL, 0);
2845		/*
2846		 * If it is a symbolic link set the Finder info type and
2847		 * creator appropriately and mark it dirty.  We will create the
2848		 * AFP_AfpInfo attribute later when the inode is ready for it.
2849		 */
2850		if (va->va_type == VLNK) {
2851			ni->finder_info.type = FINDER_TYPE_SYMBOLIC_LINK;
2852			ni->finder_info.creator = FINDER_CREATOR_SYMBOLIC_LINK;
2853			NInoSetDirtyFinderInfo(ni);
2854		}
2855		/* Tell the caller what mode and flags we actually used. */
2856		va->va_mode = ni->mode;
2857		va->va_flags = 0;
2858		if (file_attrs & FILE_ATTR_READONLY)
2859			va->va_flags |= UF_IMMUTABLE;
2860		if (file_attrs & FILE_ATTR_HIDDEN)
2861			va->va_flags |= UF_HIDDEN;
2862		if (!(file_attrs & FILE_ATTR_ARCHIVE))
2863			va->va_flags |= SF_ARCHIVED;
2864		/* The ntfs inode is now fully setup so we now add the vnode. */
2865		err = ntfs_inode_add_vnode(ni, FALSE, base_ni->vn, cn);
2866		if (err) {
2867			/* Destroy the allocated ntfs inode. */
2868			ntfs_inode_reclaim(ni);
2869			/* Set the mft record itself not in use. */
2870			m->flags &= ~MFT_RECORD_IN_USE;
2871			dirty_buf = TRUE;
2872			goto unmap_undo_mftbmp_alloc;
2873		}
2874		/*
2875		 * Need to release the buffer so that we can call
2876		 * ntfs_mft_record_map().
2877		 *
2878		 * FIXME: This could be optimized by modifying
2879		 * ntfs_mft_record_map() to take an optional mft record, i.e.
2880		 * @m, and if supplied using this instead of trying to map the
2881		 * extent mft record.
2882		 */
2883		err = buf_unmap(buf);
2884		if (err)
2885			ntfs_error(vol->mp, "Failed to unmap buffer of mft "
2886					"record 0x%llx (error %d).",
2887					(unsigned long long)bit, err);
2888		err = buf_bdwrite(buf);
2889		if (err) {
2890			ntfs_error(vol->mp, "Failed to write buffer of mft "
2891					"record 0x%llx (error %d).  Run "
2892					"chkdsk.", (unsigned long long)bit,
2893					err);
2894			NVolSetErrors(vol);
2895			lck_rw_unlock_shared(&mft_ni->lock);
2896			ntfs_inode_unlock_alloc(ni);
2897			(void)vnode_recycle(ni->vn);
2898			(void)vnode_put(ni->vn);
2899			goto free_undo_mftbmp_alloc;
2900		}
2901		err = ntfs_mft_record_map_ext(ni, &m, TRUE);
2902		lck_rw_unlock_shared(&mft_ni->lock);
2903		if (err) {
2904			ntfs_inode_unlock_alloc(ni);
2905			(void)vnode_recycle(ni->vn);
2906			(void)vnode_put(ni->vn);
2907			goto free_undo_mftbmp_alloc;
2908		}
2909		a = (ATTR_RECORD*)((u8*)m + le16_to_cpu(m->attrs_offset));
2910		if (a->type != AT_STANDARD_INFORMATION)
2911			panic("%s(): a->type != AT_STANDARD_INFORMATION\n",
2912					__FUNCTION__);
2913		a = (ATTR_RECORD*)((u8*)a + le32_to_cpu(a->length));
2914		if (le32_to_cpu(a->type) <= const_le32_to_cpu(AT_FILENAME))
2915			panic("%s(): a->type <= AT_FILENAME\n", __FUNCTION__);
2916		/* This is where the filename attribute needs to be inserted. */
2917		*new_a = a;
2918	}
2919	/* Make sure the (extent) inode is written out to disk. */
2920	NInoSetMrecNeedsDirtying(ni);
2921	/*
2922	 * Drop the taken iocount references on the mft and mftbmp vnodes.
2923	 *
2924	 * Note we still retain an iocount reference on the mft vnode due to
2925	 * the above call to ntfs_{,extent_}mft_record_map().
2926	 */
2927	(void)vnode_put(mftbmp_ni->vn);
2928	if (va)
2929		(void)vnode_put(mft_ni->vn);
2930	/*
2931	 * Return the opened, allocated inode of the allocated mft record as
2932	 * well as the mapped mft record.
2933	 */
2934	ntfs_debug("Returning allocated %sntfs inode (mft_no 0x%llx).",
2935			va ? "" : "extent ", (unsigned long long)bit);
2936	*new_ni = ni;
2937	*new_m = m;
2938	return err;
2939undo_data_init:
2940	lck_spin_lock(&mft_ni->size_lock);
2941	mft_ni->initialized_size = old_data_initialized;
2942	lck_spin_unlock(&mft_ni->size_lock);
2943	if (!ubc_setsize(mft_ni->vn, old_data_size))
2944		panic("%s(): !ubc_setsize(mft_ni->vn, old_data_size)\n",
2945				__FUNCTION__);
2946	lck_spin_lock(&mft_ni->size_lock);
2947	mft_ni->data_size = old_data_size;
2948	lck_spin_unlock(&mft_ni->size_lock);
2949	lck_rw_unlock_exclusive(&mft_ni->lock);
2950	goto undo_mftbmp_alloc_locked;
2951free_undo_mftbmp_alloc:
2952	lck_rw_lock_shared(&mft_ni->lock);
2953	err2 = buf_meta_bread(mft_ni->vn, bit, vol->mft_record_size, NOCRED,
2954			&buf);
2955	if (err2) {
2956		ntfs_error(vol->mp, "Failed to re-read buffer of mft record "
2957				"0x%llx in error code path (error %d).%s",
2958				(unsigned long long)bit, err2, es);
2959		NVolSetErrors(vol);
2960		goto undo_mftbmp_alloc;
2961	}
2962	err2 = buf_map(buf, (caddr_t*)&m);
2963	if (err2) {
2964		ntfs_error(vol->mp, "Failed to re-map buffer of mft record "
2965				"0x%llx in error code path (error %d).%s",
2966				(unsigned long long)bit, err2, es);
2967		NVolSetErrors(vol);
2968		goto undo_mftbmp_alloc;
2969	}
2970	/* Set the mft record itself not in use. */
2971	m->flags &= ~MFT_RECORD_IN_USE;
2972	dirty_buf = TRUE;
2973unmap_undo_mftbmp_alloc:
2974	err2 = buf_unmap(buf);
2975	if (err2)
2976		ntfs_error(vol->mp, "Failed to unmap buffer of mft record "
2977				"0x%llx (error %d).", (unsigned long long)bit,
2978				err2);
2979undo_mftbmp_alloc:
2980	if (dirty_buf) {
2981		err2 = buf_bdwrite(buf);
2982		if (err2)
2983			ntfs_error(vol->mp, "Failed to write buffer of mft "
2984					"record 0x%llx in error code path "
2985					"(error %d).", (unsigned long long)bit,
2986					err2);
2987	} else
2988		buf_brelse(buf);
2989	lck_rw_unlock_shared(&mft_ni->lock);
2990	lck_rw_lock_exclusive(&vol->mftbmp_lock);
2991	/*
2992	 * We decremented the cached number of free mft records thus we need to
2993	 * increment it again here now that we are not allocating the mft
2994	 * record after all.
2995	 */
2996	vol->nr_free_mft_records++;
2997	/*
2998	 * Restore the previous mft data position but only if no-one else has
2999	 * restored it to something even older whilst we had dropped the lock.
3000	 */
3001	if (old_mft_data_pos < vol->mft_data_pos)
3002		vol->mft_data_pos = old_mft_data_pos;
3003undo_mftbmp_alloc_locked:
3004	lck_rw_lock_shared(&mftbmp_ni->lock);
3005	if (ntfs_bitmap_clear_bit(mftbmp_ni, bit)) {
3006		ntfs_error(vol->mp, "Failed to clear bit in mft bitmap.%s", es);
3007		NVolSetErrors(vol);
3008		/*
3009		 * We failed to clear the bit thus we are wasting an mft record
3010		 * and since its bit is set in the mft bitmap it is effectively
3011		 * in use thus it is not free.  So decrement the number of free
3012		 * mft records again.
3013		 */
3014		vol->nr_free_mft_records--;
3015		if (vol->nr_free_mft_records < 0)
3016			vol->nr_free_mft_records = 0;
3017	}
3018	lck_rw_unlock_shared(&mftbmp_ni->lock);
3019err:
3020	lck_rw_unlock_exclusive(&vol->mftbmp_lock);
3021	(void)vnode_put(mftbmp_ni->vn);
3022	if (va)
3023		(void)vnode_put(mft_ni->vn);
3024	return err;
3025max_err:
3026	ntfs_warning(vol->mp, "Cannot allocate mft record because the maximum "
3027			"number of inodes (2^32) has already been reached.");
3028	err = ENOSPC;
3029unl_err:
3030	lck_rw_unlock_exclusive(&mftbmp_ni->lock);
3031	goto err;
3032}
3033
3034/**
3035 * ntfs_extent_mft_record_free - free an extent mft record on an ntfs volume
3036 * @base_ni:	base ntfs inode to which the extent inode to be freed belongs
3037 * @ni:		ntfs inode of the mapped extent mft record to free
3038 * @m:		mapped extent mft record of the ntfs inode @ni
3039 *
3040 * Free the mapped extent mft record @m of the extent ntfs inode @ni belonging
3041 * to the base ntfs inode @base_ni.
3042 *
3043 * Note that this function unmaps the mft record and closes and destroys @ni
3044 * internally and hence you cannot use either the inode nor its mft record any
3045 * more after this function returns success.
3046 *
3047 * Return 0 on success and errno on error.  In the error case @ni and @m are
3048 * still valid and have not been freed.
3049 *
3050 * For some errors an error message is displayed and the success code 0 is
3051 * returned and the volume is then left dirty on umount.  This makes sense in
3052 * case we could not rollback the changes that were already done since the
3053 * caller no longer wants to reference this mft record so it does not matter to
3054 * the caller if something is wrong with it as long as it is properly detached
3055 * from the base inode.
3056 */
3057errno_t ntfs_extent_mft_record_free(ntfs_inode *base_ni, ntfs_inode *ni,
3058		MFT_RECORD *m)
3059{
3060	ino64_t mft_no = ni->mft_no;
3061	ntfs_volume *vol = ni->vol;
3062	ntfs_inode **extent_nis;
3063	int i;
3064	errno_t err;
3065	u16 seq_no;
3066
3067	ntfs_debug("Entering for extent mft_no 0x%llx, base mft_no 0x%llx.\n",
3068			(unsigned long long)mft_no,
3069			(unsigned long long)base_ni->mft_no);
3070	if (NInoAttr(ni))
3071		panic("%s(): NInoAttr(ni)\n", __FUNCTION__);
3072	if (ni->nr_extents != -1)
3073		panic("%s(): ni->nr_extents != -1\n", __FUNCTION__);
3074	if (base_ni->nr_extents <= 0)
3075		panic("%s(): base_ni->nr_extents <= 0\n", __FUNCTION__);
3076	lck_mtx_lock(&base_ni->extent_lock);
3077	/* Dissociate the ntfs inode from the base inode. */
3078	extent_nis = base_ni->extent_nis;
3079	err = ENOENT;
3080	for (i = 0; i < base_ni->nr_extents; i++) {
3081		if (ni != extent_nis[i])
3082			continue;
3083		extent_nis += i;
3084		base_ni->nr_extents--;
3085		if (base_ni->nr_extents > 0) {
3086			/*
3087			 * We do not bother reallocating memory for the array
3088			 * to shrink it as in the worst case we are wasting a
3089			 * bit of memory until the inode is thrown out of the
3090			 * cache or until all extent mft records are removed in
3091			 * which case we will free the whole array below.
3092			 */
3093			memmove(extent_nis, extent_nis + 1,
3094					(base_ni->nr_extents - i) *
3095					sizeof(ntfs_inode*));
3096		} else {
3097			if (base_ni->nr_extents < 0)
3098				panic("%s(): base_ni->nr_extents < 0\n",
3099						__FUNCTION__);
3100			OSFree(base_ni->extent_nis, base_ni->extent_alloc,
3101					ntfs_malloc_tag);
3102			base_ni->extent_alloc = 0;
3103		}
3104		err = 0;
3105		break;
3106	}
3107	lck_mtx_unlock(&base_ni->extent_lock);
3108	if (err)
3109		panic("%s(): Extent mft_no 0x%llx is not attached to "
3110				"its base mft_no 0x%llx.\n", __FUNCTION__,
3111				(unsigned long long)mft_no,
3112				(unsigned long long)base_ni->mft_no);
3113	/*
3114	 * The extent inode is no longer attached to the base inode so we can
3115	 * proceed to free it as no one can get a reference to it now because
3116	 * we still hold the base mft record mapped.
3117	 *
3118	 * Begin by setting the mft record itself not in use and then increment
3119	 * the sequence number, skipping zero, if it is not zero.
3120	 */
3121	m->flags &= ~MFT_RECORD_IN_USE;
3122	seq_no = le16_to_cpu(m->sequence_number);
3123	if (seq_no == 0xffff)
3124		seq_no = 1;
3125	else if (seq_no)
3126		seq_no++;
3127	m->sequence_number = cpu_to_le16(seq_no);
3128	/* Make sure the mft record is written out to disk. */
3129	NInoSetMrecNeedsDirtying(ni);
3130	/*
3131	 * Unmap and throw away the now freed extent inode.  The mft record
3132	 * will be written out later by the VM due to its page being marked
3133	 * dirty.
3134	 */
3135	ntfs_extent_mft_record_unmap(ni);
3136	ntfs_inode_reclaim(ni);
3137	/*
3138	 * Clear the bit in the $MFT/$BITMAP corresponding to this record thus
3139	 * making it available for someone else to allocate it.
3140	 */
3141	lck_rw_lock_exclusive(&vol->mftbmp_lock);
3142	err = vnode_get(vol->mftbmp_ni->vn);
3143	if (err)
3144		ntfs_error(vol->mp, "Failed to get vnode for $MFT/$BITMAP.");
3145	else {
3146		lck_rw_lock_shared(&vol->mftbmp_ni->lock);
3147		err = ntfs_bitmap_clear_bit(vol->mftbmp_ni, mft_no);
3148		lck_rw_unlock_shared(&vol->mftbmp_ni->lock);
3149		(void)vnode_put(vol->mftbmp_ni->vn);
3150		if (!err) {
3151			/*
3152			 * We cleared a bit in the mft bitmap thus we need to
3153			 * reflect this in the cached number of free mft
3154			 * records.
3155			 */
3156			vol->nr_free_mft_records++;
3157			if (vol->nr_free_mft_records >= vol->nr_mft_records)
3158				panic("%s(): vol->nr_free_mft_records > "
3159						"vol->nr_mft_records\n",
3160						__FUNCTION__);
3161		}
3162	}
3163	lck_rw_unlock_exclusive(&vol->mftbmp_lock);
3164	if (err) {
3165		/*
3166		 * The extent inode is gone but we failed to deallocate it in
3167		 * the mft bitmap.  Just emit a warning and leave the volume
3168		 * dirty on umount.
3169		 */
3170		ntfs_error(vol->mp, "Failed to mark extent mft record as "
3171				"unused in mft bitmap.%s", es);
3172		NVolSetErrors(vol);
3173	}
3174	return 0;
3175}
3176