1/**
2 * mft.c - NTFS kernel mft record operations. Part of the Linux-NTFS project.
3 *
4 * Copyright (c) 2001-2006 Anton Altaparmakov
5 * Copyright (c) 2002 Richard Russon
6 *
7 * This program/include file is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License as published
9 * by the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program/include file is distributed in the hope that it will be
13 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program (in the main directory of the Linux-NTFS
19 * distribution in the file COPYING); if not, write to the Free Software
20 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
21 */
22
23#include <linux/buffer_head.h>
24#include <linux/swap.h>
25
26#include "attrib.h"
27#include "aops.h"
28#include "bitmap.h"
29#include "debug.h"
30#include "dir.h"
31#include "lcnalloc.h"
32#include "malloc.h"
33#include "mft.h"
34#include "ntfs.h"
35
36/**
37 * map_mft_record_page - map the page in which a specific mft record resides
38 * @ni:		ntfs inode whose mft record page to map
39 *
40 * This maps the page in which the mft record of the ntfs inode @ni is situated
41 * and returns a pointer to the mft record within the mapped page.
42 *
43 * Return value needs to be checked with IS_ERR() and if that is true PTR_ERR()
44 * contains the negative error code returned.
45 */
46static inline MFT_RECORD *map_mft_record_page(ntfs_inode *ni)
47{
48	loff_t i_size;
49	ntfs_volume *vol = ni->vol;
50	struct inode *mft_vi = vol->mft_ino;
51	struct page *page;
52	unsigned long index, end_index;
53	unsigned ofs;
54
55	BUG_ON(ni->page);
56	index = (u64)ni->mft_no << vol->mft_record_size_bits >>
57			PAGE_CACHE_SHIFT;
58	ofs = (ni->mft_no << vol->mft_record_size_bits) & ~PAGE_CACHE_MASK;
59
60	i_size = i_size_read(mft_vi);
61	/* The maximum valid index into the page cache for $MFT's data. */
62	end_index = i_size >> PAGE_CACHE_SHIFT;
63
64	/* If the wanted index is out of bounds the mft record doesn't exist. */
65	if (unlikely(index >= end_index)) {
66		if (index > end_index || (i_size & ~PAGE_CACHE_MASK) < ofs +
67				vol->mft_record_size) {
68			page = ERR_PTR(-ENOENT);
69			ntfs_error(vol->sb, "Attemt to read mft record 0x%lx, "
70					"which is beyond the end of the mft.  "
71					"This is probably a bug in the ntfs "
72					"driver.", ni->mft_no);
73			goto err_out;
74		}
75	}
76	/* Read, map, and pin the page. */
77	page = ntfs_map_page(mft_vi->i_mapping, index);
78	if (likely(!IS_ERR(page))) {
79		/* Catch multi sector transfer fixup errors. */
80		if (likely(ntfs_is_mft_recordp((le32*)(page_address(page) +
81				ofs)))) {
82			ni->page = page;
83			ni->page_ofs = ofs;
84			return page_address(page) + ofs;
85		}
86		ntfs_error(vol->sb, "Mft record 0x%lx is corrupt.  "
87				"Run chkdsk.", ni->mft_no);
88		ntfs_unmap_page(page);
89		page = ERR_PTR(-EIO);
90		NVolSetErrors(vol);
91	}
92err_out:
93	ni->page = NULL;
94	ni->page_ofs = 0;
95	return (void*)page;
96}
97
98/**
99 * map_mft_record - map, pin and lock an mft record
100 * @ni:		ntfs inode whose MFT record to map
101 *
102 * First, take the mrec_lock mutex.  We might now be sleeping, while waiting
103 * for the mutex if it was already locked by someone else.
104 *
105 * The page of the record is mapped using map_mft_record_page() before being
106 * returned to the caller.
107 *
108 * This in turn uses ntfs_map_page() to get the page containing the wanted mft
109 * record (it in turn calls read_cache_page() which reads it in from disk if
110 * necessary, increments the use count on the page so that it cannot disappear
111 * under us and returns a reference to the page cache page).
112 *
113 * If read_cache_page() invokes ntfs_readpage() to load the page from disk, it
114 * sets PG_locked and clears PG_uptodate on the page. Once I/O has completed
115 * and the post-read mst fixups on each mft record in the page have been
116 * performed, the page gets PG_uptodate set and PG_locked cleared (this is done
117 * in our asynchronous I/O completion handler end_buffer_read_mft_async()).
118 * ntfs_map_page() waits for PG_locked to become clear and checks if
119 * PG_uptodate is set and returns an error code if not. This provides
120 * sufficient protection against races when reading/using the page.
121 *
122 * However there is the write mapping to think about. Doing the above described
123 * checking here will be fine, because when initiating the write we will set
124 * PG_locked and clear PG_uptodate making sure nobody is touching the page
125 * contents. Doing the locking this way means that the commit to disk code in
126 * the page cache code paths is automatically sufficiently locked with us as
127 * we will not touch a page that has been locked or is not uptodate. The only
128 * locking problem then is them locking the page while we are accessing it.
129 *
130 * So that code will end up having to own the mrec_lock of all mft
131 * records/inodes present in the page before I/O can proceed. In that case we
132 * wouldn't need to bother with PG_locked and PG_uptodate as nobody will be
133 * accessing anything without owning the mrec_lock mutex.  But we do need to
134 * use them because of the read_cache_page() invocation and the code becomes so
135 * much simpler this way that it is well worth it.
136 *
137 * The mft record is now ours and we return a pointer to it. You need to check
138 * the returned pointer with IS_ERR() and if that is true, PTR_ERR() will return
139 * the error code.
140 *
141 * NOTE: Caller is responsible for setting the mft record dirty before calling
142 * unmap_mft_record(). This is obviously only necessary if the caller really
143 * modified the mft record...
144 * Q: Do we want to recycle one of the VFS inode state bits instead?
145 * A: No, the inode ones mean we want to change the mft record, not we want to
146 * write it out.
147 */
148MFT_RECORD *map_mft_record(ntfs_inode *ni)
149{
150	MFT_RECORD *m;
151
152	ntfs_debug("Entering for mft_no 0x%lx.", ni->mft_no);
153
154	/* Make sure the ntfs inode doesn't go away. */
155	atomic_inc(&ni->count);
156
157	/* Serialize access to this mft record. */
158	mutex_lock(&ni->mrec_lock);
159
160	m = map_mft_record_page(ni);
161	if (likely(!IS_ERR(m)))
162		return m;
163
164	mutex_unlock(&ni->mrec_lock);
165	atomic_dec(&ni->count);
166	ntfs_error(ni->vol->sb, "Failed with error code %lu.", -PTR_ERR(m));
167	return m;
168}
169
170/**
171 * unmap_mft_record_page - unmap the page in which a specific mft record resides
172 * @ni:		ntfs inode whose mft record page to unmap
173 *
174 * This unmaps the page in which the mft record of the ntfs inode @ni is
175 * situated and returns. This is a NOOP if highmem is not configured.
176 *
177 * The unmap happens via ntfs_unmap_page() which in turn decrements the use
178 * count on the page thus releasing it from the pinned state.
179 *
180 * We do not actually unmap the page from memory of course, as that will be
181 * done by the page cache code itself when memory pressure increases or
182 * whatever.
183 */
184static inline void unmap_mft_record_page(ntfs_inode *ni)
185{
186	BUG_ON(!ni->page);
187
188	// TODO: If dirty, blah...
189	ntfs_unmap_page(ni->page);
190	ni->page = NULL;
191	ni->page_ofs = 0;
192	return;
193}
194
195/**
196 * unmap_mft_record - release a mapped mft record
197 * @ni:		ntfs inode whose MFT record to unmap
198 *
199 * We release the page mapping and the mrec_lock mutex which unmaps the mft
200 * record and releases it for others to get hold of. We also release the ntfs
201 * inode by decrementing the ntfs inode reference count.
202 *
203 * NOTE: If caller has modified the mft record, it is imperative to set the mft
204 * record dirty BEFORE calling unmap_mft_record().
205 */
206void unmap_mft_record(ntfs_inode *ni)
207{
208	struct page *page = ni->page;
209
210	BUG_ON(!page);
211
212	ntfs_debug("Entering for mft_no 0x%lx.", ni->mft_no);
213
214	unmap_mft_record_page(ni);
215	mutex_unlock(&ni->mrec_lock);
216	atomic_dec(&ni->count);
217	/*
218	 * If pure ntfs_inode, i.e. no vfs inode attached, we leave it to
219	 * ntfs_clear_extent_inode() in the extent inode case, and to the
220	 * caller in the non-extent, yet pure ntfs inode case, to do the actual
221	 * tear down of all structures and freeing of all allocated memory.
222	 */
223	return;
224}
225
226/**
227 * map_extent_mft_record - load an extent inode and attach it to its base
228 * @base_ni:	base ntfs inode
229 * @mref:	mft reference of the extent inode to load
230 * @ntfs_ino:	on successful return, pointer to the ntfs_inode structure
231 *
232 * Load the extent mft record @mref and attach it to its base inode @base_ni.
233 * Return the mapped extent mft record if IS_ERR(result) is false.  Otherwise
234 * PTR_ERR(result) gives the negative error code.
235 *
236 * On successful return, @ntfs_ino contains a pointer to the ntfs_inode
237 * structure of the mapped extent inode.
238 */
239MFT_RECORD *map_extent_mft_record(ntfs_inode *base_ni, MFT_REF mref,
240		ntfs_inode **ntfs_ino)
241{
242	MFT_RECORD *m;
243	ntfs_inode *ni = NULL;
244	ntfs_inode **extent_nis = NULL;
245	int i;
246	unsigned long mft_no = MREF(mref);
247	u16 seq_no = MSEQNO(mref);
248	bool destroy_ni = false;
249
250	ntfs_debug("Mapping extent mft record 0x%lx (base mft record 0x%lx).",
251			mft_no, base_ni->mft_no);
252	/* Make sure the base ntfs inode doesn't go away. */
253	atomic_inc(&base_ni->count);
254	/*
255	 * Check if this extent inode has already been added to the base inode,
256	 * in which case just return it. If not found, add it to the base
257	 * inode before returning it.
258	 */
259	mutex_lock(&base_ni->extent_lock);
260	if (base_ni->nr_extents > 0) {
261		extent_nis = base_ni->ext.extent_ntfs_inos;
262		for (i = 0; i < base_ni->nr_extents; i++) {
263			if (mft_no != extent_nis[i]->mft_no)
264				continue;
265			ni = extent_nis[i];
266			/* Make sure the ntfs inode doesn't go away. */
267			atomic_inc(&ni->count);
268			break;
269		}
270	}
271	if (likely(ni != NULL)) {
272		mutex_unlock(&base_ni->extent_lock);
273		atomic_dec(&base_ni->count);
274		/* We found the record; just have to map and return it. */
275		m = map_mft_record(ni);
276		/* map_mft_record() has incremented this on success. */
277		atomic_dec(&ni->count);
278		if (likely(!IS_ERR(m))) {
279			/* Verify the sequence number. */
280			if (likely(le16_to_cpu(m->sequence_number) == seq_no)) {
281				ntfs_debug("Done 1.");
282				*ntfs_ino = ni;
283				return m;
284			}
285			unmap_mft_record(ni);
286			ntfs_error(base_ni->vol->sb, "Found stale extent mft "
287					"reference! Corrupt filesystem. "
288					"Run chkdsk.");
289			return ERR_PTR(-EIO);
290		}
291map_err_out:
292		ntfs_error(base_ni->vol->sb, "Failed to map extent "
293				"mft record, error code %ld.", -PTR_ERR(m));
294		return m;
295	}
296	/* Record wasn't there. Get a new ntfs inode and initialize it. */
297	ni = ntfs_new_extent_inode(base_ni->vol->sb, mft_no);
298	if (unlikely(!ni)) {
299		mutex_unlock(&base_ni->extent_lock);
300		atomic_dec(&base_ni->count);
301		return ERR_PTR(-ENOMEM);
302	}
303	ni->vol = base_ni->vol;
304	ni->seq_no = seq_no;
305	ni->nr_extents = -1;
306	ni->ext.base_ntfs_ino = base_ni;
307	/* Now map the record. */
308	m = map_mft_record(ni);
309	if (IS_ERR(m)) {
310		mutex_unlock(&base_ni->extent_lock);
311		atomic_dec(&base_ni->count);
312		ntfs_clear_extent_inode(ni);
313		goto map_err_out;
314	}
315	/* Verify the sequence number if it is present. */
316	if (seq_no && (le16_to_cpu(m->sequence_number) != seq_no)) {
317		ntfs_error(base_ni->vol->sb, "Found stale extent mft "
318				"reference! Corrupt filesystem. Run chkdsk.");
319		destroy_ni = true;
320		m = ERR_PTR(-EIO);
321		goto unm_err_out;
322	}
323	/* Attach extent inode to base inode, reallocating memory if needed. */
324	if (!(base_ni->nr_extents & 3)) {
325		ntfs_inode **tmp;
326		int new_size = (base_ni->nr_extents + 4) * sizeof(ntfs_inode *);
327
328		tmp = kmalloc(new_size, GFP_NOFS);
329		if (unlikely(!tmp)) {
330			ntfs_error(base_ni->vol->sb, "Failed to allocate "
331					"internal buffer.");
332			destroy_ni = true;
333			m = ERR_PTR(-ENOMEM);
334			goto unm_err_out;
335		}
336		if (base_ni->nr_extents) {
337			BUG_ON(!base_ni->ext.extent_ntfs_inos);
338			memcpy(tmp, base_ni->ext.extent_ntfs_inos, new_size -
339					4 * sizeof(ntfs_inode *));
340			kfree(base_ni->ext.extent_ntfs_inos);
341		}
342		base_ni->ext.extent_ntfs_inos = tmp;
343	}
344	base_ni->ext.extent_ntfs_inos[base_ni->nr_extents++] = ni;
345	mutex_unlock(&base_ni->extent_lock);
346	atomic_dec(&base_ni->count);
347	ntfs_debug("Done 2.");
348	*ntfs_ino = ni;
349	return m;
350unm_err_out:
351	unmap_mft_record(ni);
352	mutex_unlock(&base_ni->extent_lock);
353	atomic_dec(&base_ni->count);
354	/*
355	 * If the extent inode was not attached to the base inode we need to
356	 * release it or we will leak memory.
357	 */
358	if (destroy_ni)
359		ntfs_clear_extent_inode(ni);
360	return m;
361}
362
363#ifdef NTFS_RW
364
365/**
366 * __mark_mft_record_dirty - set the mft record and the page containing it dirty
367 * @ni:		ntfs inode describing the mapped mft record
368 *
369 * Internal function.  Users should call mark_mft_record_dirty() instead.
370 *
371 * Set the mapped (extent) mft record of the (base or extent) ntfs inode @ni,
372 * as well as the page containing the mft record, dirty.  Also, mark the base
373 * vfs inode dirty.  This ensures that any changes to the mft record are
374 * written out to disk.
375 *
376 * NOTE:  We only set I_DIRTY_SYNC and I_DIRTY_DATASYNC (and not I_DIRTY_PAGES)
377 * on the base vfs inode, because even though file data may have been modified,
378 * it is dirty in the inode meta data rather than the data page cache of the
379 * inode, and thus there are no data pages that need writing out.  Therefore, a
380 * full mark_inode_dirty() is overkill.  A mark_inode_dirty_sync(), on the
381 * other hand, is not sufficient, because I_DIRTY_DATASYNC needs to be set to
382 * ensure ->write_inode is called from generic_osync_inode() and this needs to
383 * happen or the file data would not necessarily hit the device synchronously,
384 * even though the vfs inode has the O_SYNC flag set.  Also, I_DIRTY_DATASYNC
385 * simply "feels" better than just I_DIRTY_SYNC, since the file data has not
386 * actually hit the block device yet, which is not what I_DIRTY_SYNC on its own
387 * would suggest.
388 */
389void __mark_mft_record_dirty(ntfs_inode *ni)
390{
391	ntfs_inode *base_ni;
392
393	ntfs_debug("Entering for inode 0x%lx.", ni->mft_no);
394	BUG_ON(NInoAttr(ni));
395	mark_ntfs_record_dirty(ni->page, ni->page_ofs);
396	/* Determine the base vfs inode and mark it dirty, too. */
397	mutex_lock(&ni->extent_lock);
398	if (likely(ni->nr_extents >= 0))
399		base_ni = ni;
400	else
401		base_ni = ni->ext.base_ntfs_ino;
402	mutex_unlock(&ni->extent_lock);
403	__mark_inode_dirty(VFS_I(base_ni), I_DIRTY_SYNC | I_DIRTY_DATASYNC);
404}
405
406static const char *ntfs_please_email = "Please email "
407		"linux-ntfs-dev@lists.sourceforge.net and say that you saw "
408		"this message.  Thank you.";
409
410/**
411 * ntfs_sync_mft_mirror_umount - synchronise an mft record to the mft mirror
412 * @vol:	ntfs volume on which the mft record to synchronize resides
413 * @mft_no:	mft record number of mft record to synchronize
414 * @m:		mapped, mst protected (extent) mft record to synchronize
415 *
416 * Write the mapped, mst protected (extent) mft record @m with mft record
417 * number @mft_no to the mft mirror ($MFTMirr) of the ntfs volume @vol,
418 * bypassing the page cache and the $MFTMirr inode itself.
419 *
420 * This function is only for use at umount time when the mft mirror inode has
421 * already been disposed off.  We BUG() if we are called while the mft mirror
422 * inode is still attached to the volume.
423 *
424 * On success return 0.  On error return -errno.
425 *
426 * NOTE:  This function is not implemented yet as I am not convinced it can
427 * actually be triggered considering the sequence of commits we do in super.c::
428 * ntfs_put_super().  But just in case we provide this place holder as the
429 * alternative would be either to BUG() or to get a NULL pointer dereference
430 * and Oops.
431 */
432static int ntfs_sync_mft_mirror_umount(ntfs_volume *vol,
433		const unsigned long mft_no, MFT_RECORD *m)
434{
435	BUG_ON(vol->mftmirr_ino);
436	ntfs_error(vol->sb, "Umount time mft mirror syncing is not "
437			"implemented yet.  %s", ntfs_please_email);
438	return -EOPNOTSUPP;
439}
440
441/**
442 * ntfs_sync_mft_mirror - synchronize an mft record to the mft mirror
443 * @vol:	ntfs volume on which the mft record to synchronize resides
444 * @mft_no:	mft record number of mft record to synchronize
445 * @m:		mapped, mst protected (extent) mft record to synchronize
446 * @sync:	if true, wait for i/o completion
447 *
448 * Write the mapped, mst protected (extent) mft record @m with mft record
449 * number @mft_no to the mft mirror ($MFTMirr) of the ntfs volume @vol.
450 *
451 * On success return 0.  On error return -errno and set the volume errors flag
452 * in the ntfs volume @vol.
453 *
454 * NOTE:  We always perform synchronous i/o and ignore the @sync parameter.
455 *
456 * TODO:  If @sync is false, want to do truly asynchronous i/o, i.e. just
457 * schedule i/o via ->writepage or do it via kntfsd or whatever.
458 */
459int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no,
460		MFT_RECORD *m, int sync)
461{
462	struct page *page;
463	unsigned int blocksize = vol->sb->s_blocksize;
464	int max_bhs = vol->mft_record_size / blocksize;
465	struct buffer_head *bhs[max_bhs];
466	struct buffer_head *bh, *head;
467	u8 *kmirr;
468	runlist_element *rl;
469	unsigned int block_start, block_end, m_start, m_end, page_ofs;
470	int i_bhs, nr_bhs, err = 0;
471	unsigned char blocksize_bits = vol->sb->s_blocksize_bits;
472
473	ntfs_debug("Entering for inode 0x%lx.", mft_no);
474	BUG_ON(!max_bhs);
475	if (unlikely(!vol->mftmirr_ino)) {
476		/* This could happen during umount... */
477		err = ntfs_sync_mft_mirror_umount(vol, mft_no, m);
478		if (likely(!err))
479			return err;
480		goto err_out;
481	}
482	/* Get the page containing the mirror copy of the mft record @m. */
483	page = ntfs_map_page(vol->mftmirr_ino->i_mapping, mft_no >>
484			(PAGE_CACHE_SHIFT - vol->mft_record_size_bits));
485	if (IS_ERR(page)) {
486		ntfs_error(vol->sb, "Failed to map mft mirror page.");
487		err = PTR_ERR(page);
488		goto err_out;
489	}
490	lock_page(page);
491	BUG_ON(!PageUptodate(page));
492	ClearPageUptodate(page);
493	/* Offset of the mft mirror record inside the page. */
494	page_ofs = (mft_no << vol->mft_record_size_bits) & ~PAGE_CACHE_MASK;
495	/* The address in the page of the mirror copy of the mft record @m. */
496	kmirr = page_address(page) + page_ofs;
497	/* Copy the mst protected mft record to the mirror. */
498	memcpy(kmirr, m, vol->mft_record_size);
499	/* Create uptodate buffers if not present. */
500	if (unlikely(!page_has_buffers(page))) {
501		struct buffer_head *tail;
502
503		bh = head = alloc_page_buffers(page, blocksize, 1);
504		do {
505			set_buffer_uptodate(bh);
506			tail = bh;
507			bh = bh->b_this_page;
508		} while (bh);
509		tail->b_this_page = head;
510		attach_page_buffers(page, head);
511	}
512	bh = head = page_buffers(page);
513	BUG_ON(!bh);
514	rl = NULL;
515	nr_bhs = 0;
516	block_start = 0;
517	m_start = kmirr - (u8*)page_address(page);
518	m_end = m_start + vol->mft_record_size;
519	do {
520		block_end = block_start + blocksize;
521		/* If the buffer is outside the mft record, skip it. */
522		if (block_end <= m_start)
523			continue;
524		if (unlikely(block_start >= m_end))
525			break;
526		/* Need to map the buffer if it is not mapped already. */
527		if (unlikely(!buffer_mapped(bh))) {
528			VCN vcn;
529			LCN lcn;
530			unsigned int vcn_ofs;
531
532			bh->b_bdev = vol->sb->s_bdev;
533			/* Obtain the vcn and offset of the current block. */
534			vcn = ((VCN)mft_no << vol->mft_record_size_bits) +
535					(block_start - m_start);
536			vcn_ofs = vcn & vol->cluster_size_mask;
537			vcn >>= vol->cluster_size_bits;
538			if (!rl) {
539				down_read(&NTFS_I(vol->mftmirr_ino)->
540						runlist.lock);
541				rl = NTFS_I(vol->mftmirr_ino)->runlist.rl;
542				/*
543				 * $MFTMirr always has the whole of its runlist
544				 * in memory.
545				 */
546				BUG_ON(!rl);
547			}
548			/* Seek to element containing target vcn. */
549			while (rl->length && rl[1].vcn <= vcn)
550				rl++;
551			lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
552			/* For $MFTMirr, only lcn >= 0 is a successful remap. */
553			if (likely(lcn >= 0)) {
554				/* Setup buffer head to correct block. */
555				bh->b_blocknr = ((lcn <<
556						vol->cluster_size_bits) +
557						vcn_ofs) >> blocksize_bits;
558				set_buffer_mapped(bh);
559			} else {
560				bh->b_blocknr = -1;
561				ntfs_error(vol->sb, "Cannot write mft mirror "
562						"record 0x%lx because its "
563						"location on disk could not "
564						"be determined (error code "
565						"%lli).", mft_no,
566						(long long)lcn);
567				err = -EIO;
568			}
569		}
570		BUG_ON(!buffer_uptodate(bh));
571		BUG_ON(!nr_bhs && (m_start != block_start));
572		BUG_ON(nr_bhs >= max_bhs);
573		bhs[nr_bhs++] = bh;
574		BUG_ON((nr_bhs >= max_bhs) && (m_end != block_end));
575	} while (block_start = block_end, (bh = bh->b_this_page) != head);
576	if (unlikely(rl))
577		up_read(&NTFS_I(vol->mftmirr_ino)->runlist.lock);
578	if (likely(!err)) {
579		/* Lock buffers and start synchronous write i/o on them. */
580		for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) {
581			struct buffer_head *tbh = bhs[i_bhs];
582
583			if (unlikely(test_set_buffer_locked(tbh)))
584				BUG();
585			BUG_ON(!buffer_uptodate(tbh));
586			clear_buffer_dirty(tbh);
587			get_bh(tbh);
588			tbh->b_end_io = end_buffer_write_sync;
589			submit_bh(WRITE, tbh);
590		}
591		/* Wait on i/o completion of buffers. */
592		for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) {
593			struct buffer_head *tbh = bhs[i_bhs];
594
595			wait_on_buffer(tbh);
596			if (unlikely(!buffer_uptodate(tbh))) {
597				err = -EIO;
598				/*
599				 * Set the buffer uptodate so the page and
600				 * buffer states do not become out of sync.
601				 */
602				set_buffer_uptodate(tbh);
603			}
604		}
605	} else /* if (unlikely(err)) */ {
606		/* Clean the buffers. */
607		for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++)
608			clear_buffer_dirty(bhs[i_bhs]);
609	}
610	/* Current state: all buffers are clean, unlocked, and uptodate. */
611	/* Remove the mst protection fixups again. */
612	post_write_mst_fixup((NTFS_RECORD*)kmirr);
613	flush_dcache_page(page);
614	SetPageUptodate(page);
615	unlock_page(page);
616	ntfs_unmap_page(page);
617	if (likely(!err)) {
618		ntfs_debug("Done.");
619	} else {
620		ntfs_error(vol->sb, "I/O error while writing mft mirror "
621				"record 0x%lx!", mft_no);
622err_out:
623		ntfs_error(vol->sb, "Failed to synchronize $MFTMirr (error "
624				"code %i).  Volume will be left marked dirty "
625				"on umount.  Run ntfsfix on the partition "
626				"after umounting to correct this.", -err);
627		NVolSetErrors(vol);
628	}
629	return err;
630}
631
632/**
633 * write_mft_record_nolock - write out a mapped (extent) mft record
634 * @ni:		ntfs inode describing the mapped (extent) mft record
635 * @m:		mapped (extent) mft record to write
636 * @sync:	if true, wait for i/o completion
637 *
638 * Write the mapped (extent) mft record @m described by the (regular or extent)
639 * ntfs inode @ni to backing store.  If the mft record @m has a counterpart in
640 * the mft mirror, that is also updated.
641 *
642 * We only write the mft record if the ntfs inode @ni is dirty and the first
643 * buffer belonging to its mft record is dirty, too.  We ignore the dirty state
644 * of subsequent buffers because we could have raced with
645 * fs/ntfs/aops.c::mark_ntfs_record_dirty().
646 *
647 * On success, clean the mft record and return 0.  On error, leave the mft
648 * record dirty and return -errno.
649 *
650 * NOTE:  We always perform synchronous i/o and ignore the @sync parameter.
651 * However, if the mft record has a counterpart in the mft mirror and @sync is
652 * true, we write the mft record, wait for i/o completion, and only then write
653 * the mft mirror copy.  This ensures that if the system crashes either the mft
654 * or the mft mirror will contain a self-consistent mft record @m.  If @sync is
655 * false on the other hand, we start i/o on both and then wait for completion
656 * on them.  This provides a speedup but no longer guarantees that you will end
657 * up with a self-consistent mft record in the case of a crash but if you asked
658 * for asynchronous writing you probably do not care about that anyway.
659 *
660 * TODO:  If @sync is false, want to do truly asynchronous i/o, i.e. just
661 * schedule i/o via ->writepage or do it via kntfsd or whatever.
662 */
663int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync)
664{
665	ntfs_volume *vol = ni->vol;
666	struct page *page = ni->page;
667	unsigned int blocksize = vol->sb->s_blocksize;
668	unsigned char blocksize_bits = vol->sb->s_blocksize_bits;
669	int max_bhs = vol->mft_record_size / blocksize;
670	struct buffer_head *bhs[max_bhs];
671	struct buffer_head *bh, *head;
672	runlist_element *rl;
673	unsigned int block_start, block_end, m_start, m_end;
674	int i_bhs, nr_bhs, err = 0;
675
676	ntfs_debug("Entering for inode 0x%lx.", ni->mft_no);
677	BUG_ON(NInoAttr(ni));
678	BUG_ON(!max_bhs);
679	BUG_ON(!PageLocked(page));
680	/*
681	 * If the ntfs_inode is clean no need to do anything.  If it is dirty,
682	 * mark it as clean now so that it can be redirtied later on if needed.
683	 * There is no danger of races since the caller is holding the locks
684	 * for the mft record @m and the page it is in.
685	 */
686	if (!NInoTestClearDirty(ni))
687		goto done;
688	bh = head = page_buffers(page);
689	BUG_ON(!bh);
690	rl = NULL;
691	nr_bhs = 0;
692	block_start = 0;
693	m_start = ni->page_ofs;
694	m_end = m_start + vol->mft_record_size;
695	do {
696		block_end = block_start + blocksize;
697		/* If the buffer is outside the mft record, skip it. */
698		if (block_end <= m_start)
699			continue;
700		if (unlikely(block_start >= m_end))
701			break;
702		/*
703		 * If this block is not the first one in the record, we ignore
704		 * the buffer's dirty state because we could have raced with a
705		 * parallel mark_ntfs_record_dirty().
706		 */
707		if (block_start == m_start) {
708			/* This block is the first one in the record. */
709			if (!buffer_dirty(bh)) {
710				BUG_ON(nr_bhs);
711				/* Clean records are not written out. */
712				break;
713			}
714		}
715		/* Need to map the buffer if it is not mapped already. */
716		if (unlikely(!buffer_mapped(bh))) {
717			VCN vcn;
718			LCN lcn;
719			unsigned int vcn_ofs;
720
721			bh->b_bdev = vol->sb->s_bdev;
722			/* Obtain the vcn and offset of the current block. */
723			vcn = ((VCN)ni->mft_no << vol->mft_record_size_bits) +
724					(block_start - m_start);
725			vcn_ofs = vcn & vol->cluster_size_mask;
726			vcn >>= vol->cluster_size_bits;
727			if (!rl) {
728				down_read(&NTFS_I(vol->mft_ino)->runlist.lock);
729				rl = NTFS_I(vol->mft_ino)->runlist.rl;
730				BUG_ON(!rl);
731			}
732			/* Seek to element containing target vcn. */
733			while (rl->length && rl[1].vcn <= vcn)
734				rl++;
735			lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
736			/* For $MFT, only lcn >= 0 is a successful remap. */
737			if (likely(lcn >= 0)) {
738				/* Setup buffer head to correct block. */
739				bh->b_blocknr = ((lcn <<
740						vol->cluster_size_bits) +
741						vcn_ofs) >> blocksize_bits;
742				set_buffer_mapped(bh);
743			} else {
744				bh->b_blocknr = -1;
745				ntfs_error(vol->sb, "Cannot write mft record "
746						"0x%lx because its location "
747						"on disk could not be "
748						"determined (error code %lli).",
749						ni->mft_no, (long long)lcn);
750				err = -EIO;
751			}
752		}
753		BUG_ON(!buffer_uptodate(bh));
754		BUG_ON(!nr_bhs && (m_start != block_start));
755		BUG_ON(nr_bhs >= max_bhs);
756		bhs[nr_bhs++] = bh;
757		BUG_ON((nr_bhs >= max_bhs) && (m_end != block_end));
758	} while (block_start = block_end, (bh = bh->b_this_page) != head);
759	if (unlikely(rl))
760		up_read(&NTFS_I(vol->mft_ino)->runlist.lock);
761	if (!nr_bhs)
762		goto done;
763	if (unlikely(err))
764		goto cleanup_out;
765	/* Apply the mst protection fixups. */
766	err = pre_write_mst_fixup((NTFS_RECORD*)m, vol->mft_record_size);
767	if (err) {
768		ntfs_error(vol->sb, "Failed to apply mst fixups!");
769		goto cleanup_out;
770	}
771	flush_dcache_mft_record_page(ni);
772	/* Lock buffers and start synchronous write i/o on them. */
773	for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) {
774		struct buffer_head *tbh = bhs[i_bhs];
775
776		if (unlikely(test_set_buffer_locked(tbh)))
777			BUG();
778		BUG_ON(!buffer_uptodate(tbh));
779		clear_buffer_dirty(tbh);
780		get_bh(tbh);
781		tbh->b_end_io = end_buffer_write_sync;
782		submit_bh(WRITE, tbh);
783	}
784	/* Synchronize the mft mirror now if not @sync. */
785	if (!sync && ni->mft_no < vol->mftmirr_size)
786		ntfs_sync_mft_mirror(vol, ni->mft_no, m, sync);
787	/* Wait on i/o completion of buffers. */
788	for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) {
789		struct buffer_head *tbh = bhs[i_bhs];
790
791		wait_on_buffer(tbh);
792		if (unlikely(!buffer_uptodate(tbh))) {
793			err = -EIO;
794			/*
795			 * Set the buffer uptodate so the page and buffer
796			 * states do not become out of sync.
797			 */
798			if (PageUptodate(page))
799				set_buffer_uptodate(tbh);
800		}
801	}
802	/* If @sync, now synchronize the mft mirror. */
803	if (sync && ni->mft_no < vol->mftmirr_size)
804		ntfs_sync_mft_mirror(vol, ni->mft_no, m, sync);
805	/* Remove the mst protection fixups again. */
806	post_write_mst_fixup((NTFS_RECORD*)m);
807	flush_dcache_mft_record_page(ni);
808	if (unlikely(err)) {
809		/* I/O error during writing.  This is really bad! */
810		ntfs_error(vol->sb, "I/O error while writing mft record "
811				"0x%lx!  Marking base inode as bad.  You "
812				"should unmount the volume and run chkdsk.",
813				ni->mft_no);
814		goto err_out;
815	}
816done:
817	ntfs_debug("Done.");
818	return 0;
819cleanup_out:
820	/* Clean the buffers. */
821	for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++)
822		clear_buffer_dirty(bhs[i_bhs]);
823err_out:
824	/*
825	 * Current state: all buffers are clean, unlocked, and uptodate.
826	 * The caller should mark the base inode as bad so that no more i/o
827	 * happens.  ->clear_inode() will still be invoked so all extent inodes
828	 * and other allocated memory will be freed.
829	 */
830	if (err == -ENOMEM) {
831		ntfs_error(vol->sb, "Not enough memory to write mft record.  "
832				"Redirtying so the write is retried later.");
833		mark_mft_record_dirty(ni);
834		err = 0;
835	} else
836		NVolSetErrors(vol);
837	return err;
838}
839
840/**
841 * ntfs_may_write_mft_record - check if an mft record may be written out
842 * @vol:	[IN]  ntfs volume on which the mft record to check resides
843 * @mft_no:	[IN]  mft record number of the mft record to check
844 * @m:		[IN]  mapped mft record to check
845 * @locked_ni:	[OUT] caller has to unlock this ntfs inode if one is returned
846 *
847 * Check if the mapped (base or extent) mft record @m with mft record number
848 * @mft_no belonging to the ntfs volume @vol may be written out.  If necessary
849 * and possible the ntfs inode of the mft record is locked and the base vfs
850 * inode is pinned.  The locked ntfs inode is then returned in @locked_ni.  The
851 * caller is responsible for unlocking the ntfs inode and unpinning the base
852 * vfs inode.
853 *
854 * Return 'true' if the mft record may be written out and 'false' if not.
855 *
 * The caller has locked the page and cleared the uptodate flag on it which
 * means that we can safely write out any dirty mft records that do not have
 * their inodes in icache as determined by ilookup5_nowait() as anyone
 * opening/creating such an inode would block when attempting to map the mft
 * record in read_cache_page() until we are finished with the write out.
861 *
862 * Here is a description of the tests we perform:
863 *
864 * If the inode is found in icache we know the mft record must be a base mft
865 * record.  If it is dirty, we do not write it and return 'false' as the vfs
866 * inode write paths will result in the access times being updated which would
867 * cause the base mft record to be redirtied and written out again.  (We know
868 * the access time update will modify the base mft record because Windows
869 * chkdsk complains if the standard information attribute is not in the base
870 * mft record.)
871 *
872 * If the inode is in icache and not dirty, we attempt to lock the mft record
873 * and if we find the lock was already taken, it is not safe to write the mft
874 * record and we return 'false'.
875 *
876 * If we manage to obtain the lock we have exclusive access to the mft record,
877 * which also allows us safe writeout of the mft record.  We then set
878 * @locked_ni to the locked ntfs inode and return 'true'.
879 *
880 * Note we cannot just lock the mft record and sleep while waiting for the lock
881 * because this would deadlock due to lock reversal (normally the mft record is
882 * locked before the page is locked but we already have the page locked here
883 * when we try to lock the mft record).
884 *
885 * If the inode is not in icache we need to perform further checks.
886 *
887 * If the mft record is not a FILE record or it is a base mft record, we can
888 * safely write it and return 'true'.
889 *
890 * We now know the mft record is an extent mft record.  We check if the inode
891 * corresponding to its base mft record is in icache and obtain a reference to
892 * it if it is.  If it is not, we can safely write it and return 'true'.
893 *
894 * We now have the base inode for the extent mft record.  We check if it has an
895 * ntfs inode for the extent mft record attached and if not it is safe to write
896 * the extent mft record and we return 'true'.
897 *
898 * The ntfs inode for the extent mft record is attached to the base inode so we
899 * attempt to lock the extent mft record and if we find the lock was already
900 * taken, it is not safe to write the extent mft record and we return 'false'.
901 *
902 * If we manage to obtain the lock we have exclusive access to the extent mft
903 * record, which also allows us safe writeout of the extent mft record.  We
904 * set the ntfs inode of the extent mft record clean and then set @locked_ni to
905 * the now locked ntfs inode and return 'true'.
906 *
907 * Note, the reason for actually writing dirty mft records here and not just
908 * relying on the vfs inode dirty code paths is that we can have mft records
909 * modified without them ever having actual inodes in memory.  Also we can have
910 * dirty mft records with clean ntfs inodes in memory.  None of the described
911 * cases would result in the dirty mft records being written out if we only
912 * relied on the vfs inode dirty code paths.  And these cases can really occur
913 * during allocation of new mft records and in particular when the
914 * initialized_size of the $MFT/$DATA attribute is extended and the new space
915 * is initialized using ntfs_mft_record_format().  The clean inode can then
916 * appear if the mft record is reused for a new inode before it got written
917 * out.
918 */
919bool ntfs_may_write_mft_record(ntfs_volume *vol, const unsigned long mft_no,
920		const MFT_RECORD *m, ntfs_inode **locked_ni)
921{
922	struct super_block *sb = vol->sb;
923	struct inode *mft_vi = vol->mft_ino;
924	struct inode *vi;
925	ntfs_inode *ni, *eni, **extent_nis;
926	int i;
927	ntfs_attr na;
928
929	ntfs_debug("Entering for inode 0x%lx.", mft_no);
930	/*
931	 * Normally we do not return a locked inode so set @locked_ni to NULL.
932	 */
933	BUG_ON(!locked_ni);
934	*locked_ni = NULL;
935	/*
936	 * Check if the inode corresponding to this mft record is in the VFS
937	 * inode cache and obtain a reference to it if it is.
938	 */
939	ntfs_debug("Looking for inode 0x%lx in icache.", mft_no);
940	na.mft_no = mft_no;
941	na.name = NULL;
942	na.name_len = 0;
943	na.type = AT_UNUSED;
944	/*
945	 * Optimize inode 0, i.e. $MFT itself, since we have it in memory and
946	 * we get here for it rather often.
947	 */
948	if (!mft_no) {
949		/* Balance the below iput(). */
950		vi = igrab(mft_vi);
951		BUG_ON(vi != mft_vi);
952	} else {
953		/*
954		 * Have to use ilookup5_nowait() since ilookup5() waits for the
955		 * inode lock which causes ntfs to deadlock when a concurrent
956		 * inode write via the inode dirty code paths and the page
957		 * dirty code path of the inode dirty code path when writing
958		 * $MFT occurs.
959		 */
960		vi = ilookup5_nowait(sb, mft_no, (test_t)ntfs_test_inode, &na);
961	}
962	if (vi) {
963		ntfs_debug("Base inode 0x%lx is in icache.", mft_no);
964		/* The inode is in icache. */
965		ni = NTFS_I(vi);
966		/* Take a reference to the ntfs inode. */
967		atomic_inc(&ni->count);
968		/* If the inode is dirty, do not write this record. */
969		if (NInoDirty(ni)) {
970			ntfs_debug("Inode 0x%lx is dirty, do not write it.",
971					mft_no);
972			atomic_dec(&ni->count);
973			iput(vi);
974			return false;
975		}
976		ntfs_debug("Inode 0x%lx is not dirty.", mft_no);
977		/* The inode is not dirty, try to take the mft record lock. */
978		if (unlikely(!mutex_trylock(&ni->mrec_lock))) {
979			ntfs_debug("Mft record 0x%lx is already locked, do "
980					"not write it.", mft_no);
981			atomic_dec(&ni->count);
982			iput(vi);
983			return false;
984		}
985		ntfs_debug("Managed to lock mft record 0x%lx, write it.",
986				mft_no);
987		/*
988		 * The write has to occur while we hold the mft record lock so
989		 * return the locked ntfs inode.
990		 */
991		*locked_ni = ni;
992		return true;
993	}
994	ntfs_debug("Inode 0x%lx is not in icache.", mft_no);
995	/* The inode is not in icache. */
996	/* Write the record if it is not a mft record (type "FILE"). */
997	if (!ntfs_is_mft_record(m->magic)) {
998		ntfs_debug("Mft record 0x%lx is not a FILE record, write it.",
999				mft_no);
1000		return true;
1001	}
1002	/* Write the mft record if it is a base inode. */
1003	if (!m->base_mft_record) {
1004		ntfs_debug("Mft record 0x%lx is a base record, write it.",
1005				mft_no);
1006		return true;
1007	}
1008	/*
1009	 * This is an extent mft record.  Check if the inode corresponding to
1010	 * its base mft record is in icache and obtain a reference to it if it
1011	 * is.
1012	 */
1013	na.mft_no = MREF_LE(m->base_mft_record);
1014	ntfs_debug("Mft record 0x%lx is an extent record.  Looking for base "
1015			"inode 0x%lx in icache.", mft_no, na.mft_no);
1016	if (!na.mft_no) {
1017		/* Balance the below iput(). */
1018		vi = igrab(mft_vi);
1019		BUG_ON(vi != mft_vi);
1020	} else
1021		vi = ilookup5_nowait(sb, na.mft_no, (test_t)ntfs_test_inode,
1022				&na);
1023	if (!vi) {
1024		/*
1025		 * The base inode is not in icache, write this extent mft
1026		 * record.
1027		 */
1028		ntfs_debug("Base inode 0x%lx is not in icache, write the "
1029				"extent record.", na.mft_no);
1030		return true;
1031	}
1032	ntfs_debug("Base inode 0x%lx is in icache.", na.mft_no);
1033	/*
1034	 * The base inode is in icache.  Check if it has the extent inode
1035	 * corresponding to this extent mft record attached.
1036	 */
1037	ni = NTFS_I(vi);
1038	mutex_lock(&ni->extent_lock);
1039	if (ni->nr_extents <= 0) {
1040		/*
1041		 * The base inode has no attached extent inodes, write this
1042		 * extent mft record.
1043		 */
1044		mutex_unlock(&ni->extent_lock);
1045		iput(vi);
1046		ntfs_debug("Base inode 0x%lx has no attached extent inodes, "
1047				"write the extent record.", na.mft_no);
1048		return true;
1049	}
1050	/* Iterate over the attached extent inodes. */
1051	extent_nis = ni->ext.extent_ntfs_inos;
1052	for (eni = NULL, i = 0; i < ni->nr_extents; ++i) {
1053		if (mft_no == extent_nis[i]->mft_no) {
1054			/*
1055			 * Found the extent inode corresponding to this extent
1056			 * mft record.
1057			 */
1058			eni = extent_nis[i];
1059			break;
1060		}
1061	}
1062	/*
1063	 * If the extent inode was not attached to the base inode, write this
1064	 * extent mft record.
1065	 */
1066	if (!eni) {
1067		mutex_unlock(&ni->extent_lock);
1068		iput(vi);
1069		ntfs_debug("Extent inode 0x%lx is not attached to its base "
1070				"inode 0x%lx, write the extent record.",
1071				mft_no, na.mft_no);
1072		return true;
1073	}
1074	ntfs_debug("Extent inode 0x%lx is attached to its base inode 0x%lx.",
1075			mft_no, na.mft_no);
1076	/* Take a reference to the extent ntfs inode. */
1077	atomic_inc(&eni->count);
1078	mutex_unlock(&ni->extent_lock);
1079	/*
1080	 * Found the extent inode coresponding to this extent mft record.
1081	 * Try to take the mft record lock.
1082	 */
1083	if (unlikely(!mutex_trylock(&eni->mrec_lock))) {
1084		atomic_dec(&eni->count);
1085		iput(vi);
1086		ntfs_debug("Extent mft record 0x%lx is already locked, do "
1087				"not write it.", mft_no);
1088		return false;
1089	}
1090	ntfs_debug("Managed to lock extent mft record 0x%lx, write it.",
1091			mft_no);
1092	if (NInoTestClearDirty(eni))
1093		ntfs_debug("Extent inode 0x%lx is dirty, marking it clean.",
1094				mft_no);
1095	/*
1096	 * The write has to occur while we hold the mft record lock so return
1097	 * the locked extent ntfs inode.
1098	 */
1099	*locked_ni = eni;
1100	return true;
1101}
1102
/*
 * Common suffix for error messages in this file.  It is appended through a
 * trailing "%s" in the format string when a failure leaves the on-disk
 * metadata in an inconsistent state, and tells the user how to repair it.
 */
static const char *es = "  Leaving inconsistent metadata.  Unmount and run "
		"chkdsk.";
1105
1106/**
1107 * ntfs_mft_bitmap_find_and_alloc_free_rec_nolock - see name
1108 * @vol:	volume on which to search for a free mft record
1109 * @base_ni:	open base inode if allocating an extent mft record or NULL
1110 *
1111 * Search for a free mft record in the mft bitmap attribute on the ntfs volume
1112 * @vol.
1113 *
1114 * If @base_ni is NULL start the search at the default allocator position.
1115 *
1116 * If @base_ni is not NULL start the search at the mft record after the base
1117 * mft record @base_ni.
1118 *
1119 * Return the free mft record on success and -errno on error.  An error code of
1120 * -ENOSPC means that there are no free mft records in the currently
1121 * initialized mft bitmap.
1122 *
1123 * Locking: Caller must hold vol->mftbmp_lock for writing.
1124 */
1125static int ntfs_mft_bitmap_find_and_alloc_free_rec_nolock(ntfs_volume *vol,
1126		ntfs_inode *base_ni)
1127{
1128	s64 pass_end, ll, data_pos, pass_start, ofs, bit;
1129	unsigned long flags;
1130	struct address_space *mftbmp_mapping;
1131	u8 *buf, *byte;
1132	struct page *page;
1133	unsigned int page_ofs, size;
1134	u8 pass, b;
1135
1136	ntfs_debug("Searching for free mft record in the currently "
1137			"initialized mft bitmap.");
1138	mftbmp_mapping = vol->mftbmp_ino->i_mapping;
1139	/*
1140	 * Set the end of the pass making sure we do not overflow the mft
1141	 * bitmap.
1142	 */
1143	read_lock_irqsave(&NTFS_I(vol->mft_ino)->size_lock, flags);
1144	pass_end = NTFS_I(vol->mft_ino)->allocated_size >>
1145			vol->mft_record_size_bits;
1146	read_unlock_irqrestore(&NTFS_I(vol->mft_ino)->size_lock, flags);
1147	read_lock_irqsave(&NTFS_I(vol->mftbmp_ino)->size_lock, flags);
1148	ll = NTFS_I(vol->mftbmp_ino)->initialized_size << 3;
1149	read_unlock_irqrestore(&NTFS_I(vol->mftbmp_ino)->size_lock, flags);
1150	if (pass_end > ll)
1151		pass_end = ll;
1152	pass = 1;
1153	if (!base_ni)
1154		data_pos = vol->mft_data_pos;
1155	else
1156		data_pos = base_ni->mft_no + 1;
1157	if (data_pos < 24)
1158		data_pos = 24;
1159	if (data_pos >= pass_end) {
1160		data_pos = 24;
1161		pass = 2;
1162		/* This happens on a freshly formatted volume. */
1163		if (data_pos >= pass_end)
1164			return -ENOSPC;
1165	}
1166	pass_start = data_pos;
1167	ntfs_debug("Starting bitmap search: pass %u, pass_start 0x%llx, "
1168			"pass_end 0x%llx, data_pos 0x%llx.", pass,
1169			(long long)pass_start, (long long)pass_end,
1170			(long long)data_pos);
1171	/* Loop until a free mft record is found. */
1172	for (; pass <= 2;) {
1173		/* Cap size to pass_end. */
1174		ofs = data_pos >> 3;
1175		page_ofs = ofs & ~PAGE_CACHE_MASK;
1176		size = PAGE_CACHE_SIZE - page_ofs;
1177		ll = ((pass_end + 7) >> 3) - ofs;
1178		if (size > ll)
1179			size = ll;
1180		size <<= 3;
1181		/*
1182		 * If we are still within the active pass, search the next page
1183		 * for a zero bit.
1184		 */
1185		if (size) {
1186			page = ntfs_map_page(mftbmp_mapping,
1187					ofs >> PAGE_CACHE_SHIFT);
1188			if (unlikely(IS_ERR(page))) {
1189				ntfs_error(vol->sb, "Failed to read mft "
1190						"bitmap, aborting.");
1191				return PTR_ERR(page);
1192			}
1193			buf = (u8*)page_address(page) + page_ofs;
1194			bit = data_pos & 7;
1195			data_pos &= ~7ull;
1196			ntfs_debug("Before inner for loop: size 0x%x, "
1197					"data_pos 0x%llx, bit 0x%llx", size,
1198					(long long)data_pos, (long long)bit);
1199			for (; bit < size && data_pos + bit < pass_end;
1200					bit &= ~7ull, bit += 8) {
1201				byte = buf + (bit >> 3);
1202				if (*byte == 0xff)
1203					continue;
1204				b = ffz((unsigned long)*byte);
1205				if (b < 8 && b >= (bit & 7)) {
1206					ll = data_pos + (bit & ~7ull) + b;
1207					if (unlikely(ll > (1ll << 32))) {
1208						ntfs_unmap_page(page);
1209						return -ENOSPC;
1210					}
1211					*byte |= 1 << b;
1212					flush_dcache_page(page);
1213					set_page_dirty(page);
1214					ntfs_unmap_page(page);
1215					ntfs_debug("Done.  (Found and "
1216							"allocated mft record "
1217							"0x%llx.)",
1218							(long long)ll);
1219					return ll;
1220				}
1221			}
1222			ntfs_debug("After inner for loop: size 0x%x, "
1223					"data_pos 0x%llx, bit 0x%llx", size,
1224					(long long)data_pos, (long long)bit);
1225			data_pos += size;
1226			ntfs_unmap_page(page);
1227			/*
1228			 * If the end of the pass has not been reached yet,
1229			 * continue searching the mft bitmap for a zero bit.
1230			 */
1231			if (data_pos < pass_end)
1232				continue;
1233		}
1234		/* Do the next pass. */
1235		if (++pass == 2) {
1236			/*
1237			 * Starting the second pass, in which we scan the first
1238			 * part of the zone which we omitted earlier.
1239			 */
1240			pass_end = pass_start;
1241			data_pos = pass_start = 24;
1242			ntfs_debug("pass %i, pass_start 0x%llx, pass_end "
1243					"0x%llx.", pass, (long long)pass_start,
1244					(long long)pass_end);
1245			if (data_pos >= pass_end)
1246				break;
1247		}
1248	}
1249	/* No free mft records in currently initialized mft bitmap. */
1250	ntfs_debug("Done.  (No free mft records left in currently initialized "
1251			"mft bitmap.)");
1252	return -ENOSPC;
1253}
1254
/**
 * ntfs_mft_bitmap_extend_allocation_nolock - extend mft bitmap by a cluster
 * @vol:	volume on which to extend the mft bitmap attribute
 *
 * Extend the mft bitmap attribute on the ntfs volume @vol by one cluster.
 *
 * The cluster directly following the last allocated cluster of the mft bitmap
 * is tried first by setting its bit in the lcn bitmap by hand.  If that
 * cluster is already in use, a cluster is obtained from the cluster allocator
 * and merged into the runlist as a new run.  Afterwards the mapping pairs
 * array, highest_vcn and allocated_size in the attribute record(s) are brought
 * up to date.  On failure the cluster and the runlist/attribute record changes
 * are undone again as far as possible.
 *
 * Note: Only changes allocated_size, i.e. does not touch initialized_size or
 * data_size.
 *
 * Return 0 on success and -errno on error.
 *
 * Locking: - Caller must hold vol->mftbmp_lock for writing.
 *	    - This function takes NTFS_I(vol->mftbmp_ino)->runlist.lock for
 *	      writing and releases it before returning.
 *	    - This function takes vol->lcnbmp_lock for writing and releases it
 *	      before returning.
 */
static int ntfs_mft_bitmap_extend_allocation_nolock(ntfs_volume *vol)
{
	LCN lcn;
	s64 ll;
	unsigned long flags;
	struct page *page;
	ntfs_inode *mft_ni, *mftbmp_ni;
	runlist_element *rl, *rl2 = NULL;
	ntfs_attr_search_ctx *ctx = NULL;
	MFT_RECORD *mrec;
	ATTR_RECORD *a = NULL;
	int ret, mp_size;
	u32 old_alen = 0;
	u8 *b, tb;
	/*
	 * Record of what has been modified so far so that the undo_alloc error
	 * path knows what it has to revert:
	 *   added_cluster - the last run was lengthened by one cluster,
	 *   added_run     - a new run was merged into the runlist,
	 *   mp_rebuilt    - the attribute record was resized for the new
	 *                   mapping pairs array.
	 */
	struct {
		u8 added_cluster:1;
		u8 added_run:1;
		u8 mp_rebuilt:1;
	} status = { 0, 0, 0 };

	ntfs_debug("Extending mft bitmap allocation.");
	mft_ni = NTFS_I(vol->mft_ino);
	mftbmp_ni = NTFS_I(vol->mftbmp_ino);
	/*
	 * Determine the last lcn of the mft bitmap.  The allocated size of the
	 * mft bitmap cannot be zero so we are ok to do this.
	 */
	down_write(&mftbmp_ni->runlist.lock);
	read_lock_irqsave(&mftbmp_ni->size_lock, flags);
	ll = mftbmp_ni->allocated_size;
	read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
	rl = ntfs_attr_find_vcn_nolock(mftbmp_ni,
			(ll - 1) >> vol->cluster_size_bits, NULL);
	if (unlikely(IS_ERR(rl) || !rl->length || rl->lcn < 0)) {
		up_write(&mftbmp_ni->runlist.lock);
		ntfs_error(vol->sb, "Failed to determine last allocated "
				"cluster of mft bitmap attribute.");
		/* A usable run was not found even without an error code. */
		if (!IS_ERR(rl))
			ret = -EIO;
		else
			ret = PTR_ERR(rl);
		return ret;
	}
	/* The lcn immediately following the last run of the mft bitmap. */
	lcn = rl->lcn + rl->length;
	ntfs_debug("Last lcn of mft bitmap attribute is 0x%llx.",
			(long long)lcn);
	/*
	 * Attempt to get the cluster following the last allocated cluster by
	 * hand as it may be in the MFT zone so the allocator would not give it
	 * to us.
	 */
	/* Byte offset into the lcn bitmap of the byte holding the bit of @lcn. */
	ll = lcn >> 3;
	page = ntfs_map_page(vol->lcnbmp_ino->i_mapping,
			ll >> PAGE_CACHE_SHIFT);
	if (IS_ERR(page)) {
		up_write(&mftbmp_ni->runlist.lock);
		ntfs_error(vol->sb, "Failed to read from lcn bitmap.");
		return PTR_ERR(page);
	}
	/* @b points at that byte, @tb is the mask of the bit for @lcn in it. */
	b = (u8*)page_address(page) + (ll & ~PAGE_CACHE_MASK);
	tb = 1 << (lcn & 7ull);
	down_write(&vol->lcnbmp_lock);
	if (*b != 0xff && !(*b & tb)) {
		/* Next cluster is free, allocate it. */
		*b |= tb;
		flush_dcache_page(page);
		set_page_dirty(page);
		up_write(&vol->lcnbmp_lock);
		ntfs_unmap_page(page);
		/* Update the mft bitmap runlist. */
		rl->length++;
		rl[1].vcn++;
		status.added_cluster = 1;
		ntfs_debug("Appending one cluster to mft bitmap.");
	} else {
		up_write(&vol->lcnbmp_lock);
		ntfs_unmap_page(page);
		/* Allocate a cluster from the DATA_ZONE. */
		rl2 = ntfs_cluster_alloc(vol, rl[1].vcn, 1, lcn, DATA_ZONE,
				true);
		if (IS_ERR(rl2)) {
			up_write(&mftbmp_ni->runlist.lock);
			ntfs_error(vol->sb, "Failed to allocate a cluster for "
					"the mft bitmap.");
			return PTR_ERR(rl2);
		}
		rl = ntfs_runlists_merge(mftbmp_ni->runlist.rl, rl2);
		if (IS_ERR(rl)) {
			up_write(&mftbmp_ni->runlist.lock);
			ntfs_error(vol->sb, "Failed to merge runlists for mft "
					"bitmap.");
			/* Give the freshly allocated cluster back. */
			if (ntfs_cluster_free_from_rl(vol, rl2)) {
				ntfs_error(vol->sb, "Failed to dealocate "
						"allocated cluster.%s", es);
				NVolSetErrors(vol);
			}
			ntfs_free(rl2);
			return PTR_ERR(rl);
		}
		mftbmp_ni->runlist.rl = rl;
		status.added_run = 1;
		ntfs_debug("Adding one run to mft bitmap.");
		/* Find the last run in the new runlist. */
		for (; rl[1].length; rl++)
			;
	}
	/*
	 * Update the attribute record as well.  Note: @rl is the last
	 * (non-terminator) runlist element of mft bitmap.
	 */
	mrec = map_mft_record(mft_ni);
	if (IS_ERR(mrec)) {
		ntfs_error(vol->sb, "Failed to map mft record.");
		ret = PTR_ERR(mrec);
		goto undo_alloc;
	}
	ctx = ntfs_attr_get_search_ctx(mft_ni, mrec);
	if (unlikely(!ctx)) {
		ntfs_error(vol->sb, "Failed to get search context.");
		ret = -ENOMEM;
		goto undo_alloc;
	}
	/* Look up the attribute extent by the vcn stored in rl[1]. */
	ret = ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name,
			mftbmp_ni->name_len, CASE_SENSITIVE, rl[1].vcn, NULL,
			0, ctx);
	if (unlikely(ret)) {
		ntfs_error(vol->sb, "Failed to find last attribute extent of "
				"mft bitmap attribute.");
		if (ret == -ENOENT)
			ret = -EIO;
		goto undo_alloc;
	}
	a = ctx->attr;
	/* From here on @ll is the lowest vcn of the found attribute extent. */
	ll = sle64_to_cpu(a->data.non_resident.lowest_vcn);
	/* Search back for the previous last allocated cluster of mft bitmap. */
	for (rl2 = rl; rl2 > mftbmp_ni->runlist.rl; rl2--) {
		if (ll >= rl2->vcn)
			break;
	}
	/* @rl2 must now be the run containing vcn @ll. */
	BUG_ON(ll < rl2->vcn);
	BUG_ON(ll >= rl2->vcn + rl2->length);
	/* Get the size for the new mapping pairs array for this extent. */
	mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, ll, -1);
	if (unlikely(mp_size <= 0)) {
		ntfs_error(vol->sb, "Get size for mapping pairs failed for "
				"mft bitmap attribute extent.");
		ret = mp_size;
		if (!ret)
			ret = -EIO;
		goto undo_alloc;
	}
	/* Expand the attribute record if necessary. */
	/* Remember the old record length so the undo path can restore it. */
	old_alen = le32_to_cpu(a->length);
	ret = ntfs_attr_record_resize(ctx->mrec, a, mp_size +
			le16_to_cpu(a->data.non_resident.mapping_pairs_offset));
	if (unlikely(ret)) {
		if (ret != -ENOSPC) {
			ntfs_error(vol->sb, "Failed to resize attribute "
					"record for mft bitmap attribute.");
			goto undo_alloc;
		}
		// TODO: Deal with this by moving this extent to a new mft
		// record or by starting a new extent in a new mft record or by
		// moving other attributes out of this mft record.
		// Note: It will need to be a special mft record and if none of
		// those are available it gets rather complicated...
		ntfs_error(vol->sb, "Not enough space in this mft record to "
				"accomodate extended mft bitmap attribute "
				"extent.  Cannot handle this yet.");
		ret = -EOPNOTSUPP;
		goto undo_alloc;
	}
	status.mp_rebuilt = 1;
	/* Generate the mapping pairs array directly into the attr record. */
	ret = ntfs_mapping_pairs_build(vol, (u8*)a +
			le16_to_cpu(a->data.non_resident.mapping_pairs_offset),
			mp_size, rl2, ll, -1, NULL);
	if (unlikely(ret)) {
		ntfs_error(vol->sb, "Failed to build mapping pairs array for "
				"mft bitmap attribute.");
		goto undo_alloc;
	}
	/* Update the highest_vcn. */
	a->data.non_resident.highest_vcn = cpu_to_sle64(rl[1].vcn - 1);
	/*
	 * We now have extended the mft bitmap allocated_size by one cluster.
	 * Reflect this in the ntfs_inode structure and the attribute record.
	 */
	if (a->data.non_resident.lowest_vcn) {
		/*
		 * We are not in the first attribute extent, switch to it, but
		 * first ensure the changes will make it to disk later.
		 */
		flush_dcache_mft_record_page(ctx->ntfs_ino);
		mark_mft_record_dirty(ctx->ntfs_ino);
		ntfs_attr_reinit_search_ctx(ctx);
		ret = ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name,
				mftbmp_ni->name_len, CASE_SENSITIVE, 0, NULL,
				0, ctx);
		if (unlikely(ret)) {
			ntfs_error(vol->sb, "Failed to find first attribute "
					"extent of mft bitmap attribute.");
			goto restore_undo_alloc;
		}
		a = ctx->attr;
	}
	/* Update the in-memory and the on-disk allocated size together. */
	write_lock_irqsave(&mftbmp_ni->size_lock, flags);
	mftbmp_ni->allocated_size += vol->cluster_size;
	a->data.non_resident.allocated_size =
			cpu_to_sle64(mftbmp_ni->allocated_size);
	write_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
	/* Ensure the changes make it to disk. */
	flush_dcache_mft_record_page(ctx->ntfs_ino);
	mark_mft_record_dirty(ctx->ntfs_ino);
	ntfs_attr_put_search_ctx(ctx);
	unmap_mft_record(mft_ni);
	up_write(&mftbmp_ni->runlist.lock);
	ntfs_debug("Done.");
	return 0;
restore_undo_alloc:
	/*
	 * The last attribute extent was already updated but the first extent
	 * could not be found.  Look the last extent up again so that the
	 * changes made to it can be reverted.
	 */
	ntfs_attr_reinit_search_ctx(ctx);
	if (ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name,
			mftbmp_ni->name_len, CASE_SENSITIVE, rl[1].vcn, NULL,
			0, ctx)) {
		ntfs_error(vol->sb, "Failed to find last attribute extent of "
				"mft bitmap attribute.%s", es);
		/*
		 * Nothing can be reverted, so the new cluster is kept and the
		 * in-memory allocated size is made to include it.
		 */
		write_lock_irqsave(&mftbmp_ni->size_lock, flags);
		mftbmp_ni->allocated_size += vol->cluster_size;
		write_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
		ntfs_attr_put_search_ctx(ctx);
		unmap_mft_record(mft_ni);
		up_write(&mftbmp_ni->runlist.lock);
		/*
		 * The only thing that is now wrong is ->allocated_size of the
		 * base attribute extent which chkdsk should be able to fix.
		 */
		NVolSetErrors(vol);
		return ret;
	}
	a = ctx->attr;
	/*
	 * highest_vcn was set to rl[1].vcn - 1 above and the runlist has not
	 * been reverted yet, hence the previous value is rl[1].vcn - 2.
	 */
	a->data.non_resident.highest_vcn = cpu_to_sle64(rl[1].vcn - 2);
undo_alloc:
	/* Revert the runlist change according to how the cluster was added. */
	if (status.added_cluster) {
		/* Truncate the last run in the runlist by one cluster. */
		rl->length--;
		rl[1].vcn--;
	} else if (status.added_run) {
		/* Remember which cluster the added run occupies. */
		lcn = rl->lcn;
		/* Remove the last run from the runlist. */
		rl->lcn = rl[1].lcn;
		rl->length = 0;
	}
	/* Deallocate the cluster. */
	down_write(&vol->lcnbmp_lock);
	if (ntfs_bitmap_clear_bit(vol->lcnbmp_ino, lcn)) {
		ntfs_error(vol->sb, "Failed to free allocated cluster.%s", es);
		NVolSetErrors(vol);
	}
	up_write(&vol->lcnbmp_lock);
	if (status.mp_rebuilt) {
		/*
		 * Rebuild the mapping pairs array from the reverted runlist
		 * into the space the old array had, then shrink the attribute
		 * record back to its old length.
		 */
		if (ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu(
				a->data.non_resident.mapping_pairs_offset),
				old_alen - le16_to_cpu(
				a->data.non_resident.mapping_pairs_offset),
				rl2, ll, -1, NULL)) {
			ntfs_error(vol->sb, "Failed to restore mapping pairs "
					"array.%s", es);
			NVolSetErrors(vol);
		}
		if (ntfs_attr_record_resize(ctx->mrec, a, old_alen)) {
			ntfs_error(vol->sb, "Failed to restore attribute "
					"record.%s", es);
			NVolSetErrors(vol);
		}
		flush_dcache_mft_record_page(ctx->ntfs_ino);
		mark_mft_record_dirty(ctx->ntfs_ino);
	}
	/* Release only what had actually been obtained before the failure. */
	if (ctx)
		ntfs_attr_put_search_ctx(ctx);
	if (!IS_ERR(mrec))
		unmap_mft_record(mft_ni);
	up_write(&mftbmp_ni->runlist.lock);
	return ret;
}
1556
1557/**
1558 * ntfs_mft_bitmap_extend_initialized_nolock - extend mftbmp initialized data
1559 * @vol:	volume on which to extend the mft bitmap attribute
1560 *
1561 * Extend the initialized portion of the mft bitmap attribute on the ntfs
1562 * volume @vol by 8 bytes.
1563 *
1564 * Note:  Only changes initialized_size and data_size, i.e. requires that
1565 * allocated_size is big enough to fit the new initialized_size.
1566 *
1567 * Return 0 on success and -error on error.
1568 *
1569 * Locking: Caller must hold vol->mftbmp_lock for writing.
1570 */
1571static int ntfs_mft_bitmap_extend_initialized_nolock(ntfs_volume *vol)
1572{
1573	s64 old_data_size, old_initialized_size;
1574	unsigned long flags;
1575	struct inode *mftbmp_vi;
1576	ntfs_inode *mft_ni, *mftbmp_ni;
1577	ntfs_attr_search_ctx *ctx;
1578	MFT_RECORD *mrec;
1579	ATTR_RECORD *a;
1580	int ret;
1581
1582	ntfs_debug("Extending mft bitmap initiailized (and data) size.");
1583	mft_ni = NTFS_I(vol->mft_ino);
1584	mftbmp_vi = vol->mftbmp_ino;
1585	mftbmp_ni = NTFS_I(mftbmp_vi);
1586	/* Get the attribute record. */
1587	mrec = map_mft_record(mft_ni);
1588	if (IS_ERR(mrec)) {
1589		ntfs_error(vol->sb, "Failed to map mft record.");
1590		return PTR_ERR(mrec);
1591	}
1592	ctx = ntfs_attr_get_search_ctx(mft_ni, mrec);
1593	if (unlikely(!ctx)) {
1594		ntfs_error(vol->sb, "Failed to get search context.");
1595		ret = -ENOMEM;
1596		goto unm_err_out;
1597	}
1598	ret = ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name,
1599			mftbmp_ni->name_len, CASE_SENSITIVE, 0, NULL, 0, ctx);
1600	if (unlikely(ret)) {
1601		ntfs_error(vol->sb, "Failed to find first attribute extent of "
1602				"mft bitmap attribute.");
1603		if (ret == -ENOENT)
1604			ret = -EIO;
1605		goto put_err_out;
1606	}
1607	a = ctx->attr;
1608	write_lock_irqsave(&mftbmp_ni->size_lock, flags);
1609	old_data_size = i_size_read(mftbmp_vi);
1610	old_initialized_size = mftbmp_ni->initialized_size;
1611	/*
1612	 * We can simply update the initialized_size before filling the space
1613	 * with zeroes because the caller is holding the mft bitmap lock for
1614	 * writing which ensures that no one else is trying to access the data.
1615	 */
1616	mftbmp_ni->initialized_size += 8;
1617	a->data.non_resident.initialized_size =
1618			cpu_to_sle64(mftbmp_ni->initialized_size);
1619	if (mftbmp_ni->initialized_size > old_data_size) {
1620		i_size_write(mftbmp_vi, mftbmp_ni->initialized_size);
1621		a->data.non_resident.data_size =
1622				cpu_to_sle64(mftbmp_ni->initialized_size);
1623	}
1624	write_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
1625	/* Ensure the changes make it to disk. */
1626	flush_dcache_mft_record_page(ctx->ntfs_ino);
1627	mark_mft_record_dirty(ctx->ntfs_ino);
1628	ntfs_attr_put_search_ctx(ctx);
1629	unmap_mft_record(mft_ni);
1630	/* Initialize the mft bitmap attribute value with zeroes. */
1631	ret = ntfs_attr_set(mftbmp_ni, old_initialized_size, 8, 0);
1632	if (likely(!ret)) {
1633		ntfs_debug("Done.  (Wrote eight initialized bytes to mft "
1634				"bitmap.");
1635		return 0;
1636	}
1637	ntfs_error(vol->sb, "Failed to write to mft bitmap.");
1638	/* Try to recover from the error. */
1639	mrec = map_mft_record(mft_ni);
1640	if (IS_ERR(mrec)) {
1641		ntfs_error(vol->sb, "Failed to map mft record.%s", es);
1642		NVolSetErrors(vol);
1643		return ret;
1644	}
1645	ctx = ntfs_attr_get_search_ctx(mft_ni, mrec);
1646	if (unlikely(!ctx)) {
1647		ntfs_error(vol->sb, "Failed to get search context.%s", es);
1648		NVolSetErrors(vol);
1649		goto unm_err_out;
1650	}
1651	if (ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name,
1652			mftbmp_ni->name_len, CASE_SENSITIVE, 0, NULL, 0, ctx)) {
1653		ntfs_error(vol->sb, "Failed to find first attribute extent of "
1654				"mft bitmap attribute.%s", es);
1655		NVolSetErrors(vol);
1656put_err_out:
1657		ntfs_attr_put_search_ctx(ctx);
1658unm_err_out:
1659		unmap_mft_record(mft_ni);
1660		goto err_out;
1661	}
1662	a = ctx->attr;
1663	write_lock_irqsave(&mftbmp_ni->size_lock, flags);
1664	mftbmp_ni->initialized_size = old_initialized_size;
1665	a->data.non_resident.initialized_size =
1666			cpu_to_sle64(old_initialized_size);
1667	if (i_size_read(mftbmp_vi) != old_data_size) {
1668		i_size_write(mftbmp_vi, old_data_size);
1669		a->data.non_resident.data_size = cpu_to_sle64(old_data_size);
1670	}
1671	write_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
1672	flush_dcache_mft_record_page(ctx->ntfs_ino);
1673	mark_mft_record_dirty(ctx->ntfs_ino);
1674	ntfs_attr_put_search_ctx(ctx);
1675	unmap_mft_record(mft_ni);
1676#ifdef DEBUG
1677	read_lock_irqsave(&mftbmp_ni->size_lock, flags);
1678	ntfs_debug("Restored status of mftbmp: allocated_size 0x%llx, "
1679			"data_size 0x%llx, initialized_size 0x%llx.",
1680			(long long)mftbmp_ni->allocated_size,
1681			(long long)i_size_read(mftbmp_vi),
1682			(long long)mftbmp_ni->initialized_size);
1683	read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
1684#endif /* DEBUG */
1685err_out:
1686	return ret;
1687}
1688
1689/**
1690 * ntfs_mft_data_extend_allocation_nolock - extend mft data attribute
1691 * @vol:	volume on which to extend the mft data attribute
1692 *
1693 * Extend the mft data attribute on the ntfs volume @vol by 16 mft records
1694 * worth of clusters or if not enough space for this by one mft record worth
1695 * of clusters.
1696 *
1697 * Note:  Only changes allocated_size, i.e. does not touch initialized_size or
1698 * data_size.
1699 *
1700 * Return 0 on success and -errno on error.
1701 *
1702 * Locking: - Caller must hold vol->mftbmp_lock for writing.
1703 *	    - This function takes NTFS_I(vol->mft_ino)->runlist.lock for
1704 *	      writing and releases it before returning.
1705 *	    - This function calls functions which take vol->lcnbmp_lock for
1706 *	      writing and release it before returning.
1707 */
1708static int ntfs_mft_data_extend_allocation_nolock(ntfs_volume *vol)
1709{
1710	LCN lcn;
1711	VCN old_last_vcn;
1712	s64 min_nr, nr, ll;
1713	unsigned long flags;
1714	ntfs_inode *mft_ni;
1715	runlist_element *rl, *rl2;
1716	ntfs_attr_search_ctx *ctx = NULL;
1717	MFT_RECORD *mrec;
1718	ATTR_RECORD *a = NULL;
1719	int ret, mp_size;
1720	u32 old_alen = 0;
1721	bool mp_rebuilt = false;
1722
1723	ntfs_debug("Extending mft data allocation.");
1724	mft_ni = NTFS_I(vol->mft_ino);
1725	/*
1726	 * Determine the preferred allocation location, i.e. the last lcn of
1727	 * the mft data attribute.  The allocated size of the mft data
1728	 * attribute cannot be zero so we are ok to do this.
1729	 */
1730	down_write(&mft_ni->runlist.lock);
1731	read_lock_irqsave(&mft_ni->size_lock, flags);
1732	ll = mft_ni->allocated_size;
1733	read_unlock_irqrestore(&mft_ni->size_lock, flags);
1734	rl = ntfs_attr_find_vcn_nolock(mft_ni,
1735			(ll - 1) >> vol->cluster_size_bits, NULL);
1736	if (unlikely(IS_ERR(rl) || !rl->length || rl->lcn < 0)) {
1737		up_write(&mft_ni->runlist.lock);
1738		ntfs_error(vol->sb, "Failed to determine last allocated "
1739				"cluster of mft data attribute.");
1740		if (!IS_ERR(rl))
1741			ret = -EIO;
1742		else
1743			ret = PTR_ERR(rl);
1744		return ret;
1745	}
1746	lcn = rl->lcn + rl->length;
1747	ntfs_debug("Last lcn of mft data attribute is 0x%llx.", (long long)lcn);
1748	/* Minimum allocation is one mft record worth of clusters. */
1749	min_nr = vol->mft_record_size >> vol->cluster_size_bits;
1750	if (!min_nr)
1751		min_nr = 1;
1752	/* Want to allocate 16 mft records worth of clusters. */
1753	nr = vol->mft_record_size << 4 >> vol->cluster_size_bits;
1754	if (!nr)
1755		nr = min_nr;
1756	/* Ensure we do not go above 2^32-1 mft records. */
1757	read_lock_irqsave(&mft_ni->size_lock, flags);
1758	ll = mft_ni->allocated_size;
1759	read_unlock_irqrestore(&mft_ni->size_lock, flags);
1760	if (unlikely((ll + (nr << vol->cluster_size_bits)) >>
1761			vol->mft_record_size_bits >= (1ll << 32))) {
1762		nr = min_nr;
1763		if (unlikely((ll + (nr << vol->cluster_size_bits)) >>
1764				vol->mft_record_size_bits >= (1ll << 32))) {
1765			ntfs_warning(vol->sb, "Cannot allocate mft record "
1766					"because the maximum number of inodes "
1767					"(2^32) has already been reached.");
1768			up_write(&mft_ni->runlist.lock);
1769			return -ENOSPC;
1770		}
1771	}
1772	ntfs_debug("Trying mft data allocation with %s cluster count %lli.",
1773			nr > min_nr ? "default" : "minimal", (long long)nr);
1774	old_last_vcn = rl[1].vcn;
1775	do {
1776		rl2 = ntfs_cluster_alloc(vol, old_last_vcn, nr, lcn, MFT_ZONE,
1777				true);
1778		if (likely(!IS_ERR(rl2)))
1779			break;
1780		if (PTR_ERR(rl2) != -ENOSPC || nr == min_nr) {
1781			ntfs_error(vol->sb, "Failed to allocate the minimal "
1782					"number of clusters (%lli) for the "
1783					"mft data attribute.", (long long)nr);
1784			up_write(&mft_ni->runlist.lock);
1785			return PTR_ERR(rl2);
1786		}
1787		/*
1788		 * There is not enough space to do the allocation, but there
1789		 * might be enough space to do a minimal allocation so try that
1790		 * before failing.
1791		 */
1792		nr = min_nr;
1793		ntfs_debug("Retrying mft data allocation with minimal cluster "
1794				"count %lli.", (long long)nr);
1795	} while (1);
1796	rl = ntfs_runlists_merge(mft_ni->runlist.rl, rl2);
1797	if (IS_ERR(rl)) {
1798		up_write(&mft_ni->runlist.lock);
1799		ntfs_error(vol->sb, "Failed to merge runlists for mft data "
1800				"attribute.");
1801		if (ntfs_cluster_free_from_rl(vol, rl2)) {
1802			ntfs_error(vol->sb, "Failed to dealocate clusters "
1803					"from the mft data attribute.%s", es);
1804			NVolSetErrors(vol);
1805		}
1806		ntfs_free(rl2);
1807		return PTR_ERR(rl);
1808	}
1809	mft_ni->runlist.rl = rl;
1810	ntfs_debug("Allocated %lli clusters.", (long long)nr);
1811	/* Find the last run in the new runlist. */
1812	for (; rl[1].length; rl++)
1813		;
1814	/* Update the attribute record as well. */
1815	mrec = map_mft_record(mft_ni);
1816	if (IS_ERR(mrec)) {
1817		ntfs_error(vol->sb, "Failed to map mft record.");
1818		ret = PTR_ERR(mrec);
1819		goto undo_alloc;
1820	}
1821	ctx = ntfs_attr_get_search_ctx(mft_ni, mrec);
1822	if (unlikely(!ctx)) {
1823		ntfs_error(vol->sb, "Failed to get search context.");
1824		ret = -ENOMEM;
1825		goto undo_alloc;
1826	}
1827	ret = ntfs_attr_lookup(mft_ni->type, mft_ni->name, mft_ni->name_len,
1828			CASE_SENSITIVE, rl[1].vcn, NULL, 0, ctx);
1829	if (unlikely(ret)) {
1830		ntfs_error(vol->sb, "Failed to find last attribute extent of "
1831				"mft data attribute.");
1832		if (ret == -ENOENT)
1833			ret = -EIO;
1834		goto undo_alloc;
1835	}
1836	a = ctx->attr;
1837	ll = sle64_to_cpu(a->data.non_resident.lowest_vcn);
1838	/* Search back for the previous last allocated cluster of mft bitmap. */
1839	for (rl2 = rl; rl2 > mft_ni->runlist.rl; rl2--) {
1840		if (ll >= rl2->vcn)
1841			break;
1842	}
1843	BUG_ON(ll < rl2->vcn);
1844	BUG_ON(ll >= rl2->vcn + rl2->length);
1845	/* Get the size for the new mapping pairs array for this extent. */
1846	mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, ll, -1);
1847	if (unlikely(mp_size <= 0)) {
1848		ntfs_error(vol->sb, "Get size for mapping pairs failed for "
1849				"mft data attribute extent.");
1850		ret = mp_size;
1851		if (!ret)
1852			ret = -EIO;
1853		goto undo_alloc;
1854	}
1855	/* Expand the attribute record if necessary. */
1856	old_alen = le32_to_cpu(a->length);
1857	ret = ntfs_attr_record_resize(ctx->mrec, a, mp_size +
1858			le16_to_cpu(a->data.non_resident.mapping_pairs_offset));
1859	if (unlikely(ret)) {
1860		if (ret != -ENOSPC) {
1861			ntfs_error(vol->sb, "Failed to resize attribute "
1862					"record for mft data attribute.");
1863			goto undo_alloc;
1864		}
1865		// TODO: Deal with this by moving this extent to a new mft
1866		// record or by starting a new extent in a new mft record or by
1867		// moving other attributes out of this mft record.
1868		// Note: Use the special reserved mft records and ensure that
1869		// this extent is not required to find the mft record in
1870		// question.  If no free special records left we would need to
1871		// move an existing record away, insert ours in its place, and
1872		// then place the moved record into the newly allocated space
1873		// and we would then need to update all references to this mft
1874		// record appropriately.  This is rather complicated...
1875		ntfs_error(vol->sb, "Not enough space in this mft record to "
1876				"accomodate extended mft data attribute "
1877				"extent.  Cannot handle this yet.");
1878		ret = -EOPNOTSUPP;
1879		goto undo_alloc;
1880	}
1881	mp_rebuilt = true;
1882	/* Generate the mapping pairs array directly into the attr record. */
1883	ret = ntfs_mapping_pairs_build(vol, (u8*)a +
1884			le16_to_cpu(a->data.non_resident.mapping_pairs_offset),
1885			mp_size, rl2, ll, -1, NULL);
1886	if (unlikely(ret)) {
1887		ntfs_error(vol->sb, "Failed to build mapping pairs array of "
1888				"mft data attribute.");
1889		goto undo_alloc;
1890	}
1891	/* Update the highest_vcn. */
1892	a->data.non_resident.highest_vcn = cpu_to_sle64(rl[1].vcn - 1);
1893	/*
1894	 * We now have extended the mft data allocated_size by nr clusters.
1895	 * Reflect this in the ntfs_inode structure and the attribute record.
1896	 * @rl is the last (non-terminator) runlist element of mft data
1897	 * attribute.
1898	 */
1899	if (a->data.non_resident.lowest_vcn) {
1900		/*
1901		 * We are not in the first attribute extent, switch to it, but
1902		 * first ensure the changes will make it to disk later.
1903		 */
1904		flush_dcache_mft_record_page(ctx->ntfs_ino);
1905		mark_mft_record_dirty(ctx->ntfs_ino);
1906		ntfs_attr_reinit_search_ctx(ctx);
1907		ret = ntfs_attr_lookup(mft_ni->type, mft_ni->name,
1908				mft_ni->name_len, CASE_SENSITIVE, 0, NULL, 0,
1909				ctx);
1910		if (unlikely(ret)) {
1911			ntfs_error(vol->sb, "Failed to find first attribute "
1912					"extent of mft data attribute.");
1913			goto restore_undo_alloc;
1914		}
1915		a = ctx->attr;
1916	}
1917	write_lock_irqsave(&mft_ni->size_lock, flags);
1918	mft_ni->allocated_size += nr << vol->cluster_size_bits;
1919	a->data.non_resident.allocated_size =
1920			cpu_to_sle64(mft_ni->allocated_size);
1921	write_unlock_irqrestore(&mft_ni->size_lock, flags);
1922	/* Ensure the changes make it to disk. */
1923	flush_dcache_mft_record_page(ctx->ntfs_ino);
1924	mark_mft_record_dirty(ctx->ntfs_ino);
1925	ntfs_attr_put_search_ctx(ctx);
1926	unmap_mft_record(mft_ni);
1927	up_write(&mft_ni->runlist.lock);
1928	ntfs_debug("Done.");
1929	return 0;
1930restore_undo_alloc:
1931	ntfs_attr_reinit_search_ctx(ctx);
1932	if (ntfs_attr_lookup(mft_ni->type, mft_ni->name, mft_ni->name_len,
1933			CASE_SENSITIVE, rl[1].vcn, NULL, 0, ctx)) {
1934		ntfs_error(vol->sb, "Failed to find last attribute extent of "
1935				"mft data attribute.%s", es);
1936		write_lock_irqsave(&mft_ni->size_lock, flags);
1937		mft_ni->allocated_size += nr << vol->cluster_size_bits;
1938		write_unlock_irqrestore(&mft_ni->size_lock, flags);
1939		ntfs_attr_put_search_ctx(ctx);
1940		unmap_mft_record(mft_ni);
1941		up_write(&mft_ni->runlist.lock);
1942		/*
1943		 * The only thing that is now wrong is ->allocated_size of the
1944		 * base attribute extent which chkdsk should be able to fix.
1945		 */
1946		NVolSetErrors(vol);
1947		return ret;
1948	}
1949	ctx->attr->data.non_resident.highest_vcn =
1950			cpu_to_sle64(old_last_vcn - 1);
1951undo_alloc:
1952	if (ntfs_cluster_free(mft_ni, old_last_vcn, -1, ctx) < 0) {
1953		ntfs_error(vol->sb, "Failed to free clusters from mft data "
1954				"attribute.%s", es);
1955		NVolSetErrors(vol);
1956	}
1957	a = ctx->attr;
1958	if (ntfs_rl_truncate_nolock(vol, &mft_ni->runlist, old_last_vcn)) {
1959		ntfs_error(vol->sb, "Failed to truncate mft data attribute "
1960				"runlist.%s", es);
1961		NVolSetErrors(vol);
1962	}
1963	if (mp_rebuilt && !IS_ERR(ctx->mrec)) {
1964		if (ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu(
1965				a->data.non_resident.mapping_pairs_offset),
1966				old_alen - le16_to_cpu(
1967				a->data.non_resident.mapping_pairs_offset),
1968				rl2, ll, -1, NULL)) {
1969			ntfs_error(vol->sb, "Failed to restore mapping pairs "
1970					"array.%s", es);
1971			NVolSetErrors(vol);
1972		}
1973		if (ntfs_attr_record_resize(ctx->mrec, a, old_alen)) {
1974			ntfs_error(vol->sb, "Failed to restore attribute "
1975					"record.%s", es);
1976			NVolSetErrors(vol);
1977		}
1978		flush_dcache_mft_record_page(ctx->ntfs_ino);
1979		mark_mft_record_dirty(ctx->ntfs_ino);
1980	} else if (IS_ERR(ctx->mrec)) {
1981		ntfs_error(vol->sb, "Failed to restore attribute search "
1982				"context.%s", es);
1983		NVolSetErrors(vol);
1984	}
1985	if (ctx)
1986		ntfs_attr_put_search_ctx(ctx);
1987	if (!IS_ERR(mrec))
1988		unmap_mft_record(mft_ni);
1989	up_write(&mft_ni->runlist.lock);
1990	return ret;
1991}
1992
1993/**
1994 * ntfs_mft_record_layout - layout an mft record into a memory buffer
1995 * @vol:	volume to which the mft record will belong
1996 * @mft_no:	mft reference specifying the mft record number
1997 * @m:		destination buffer of size >= @vol->mft_record_size bytes
1998 *
1999 * Layout an empty, unused mft record with the mft record number @mft_no into
2000 * the buffer @m.  The volume @vol is needed because the mft record structure
2001 * was modified in NTFS 3.1 so we need to know which volume version this mft
2002 * record will be used on.
2003 *
2004 * Return 0 on success and -errno on error.
2005 */
2006static int ntfs_mft_record_layout(const ntfs_volume *vol, const s64 mft_no,
2007		MFT_RECORD *m)
2008{
2009	ATTR_RECORD *a;
2010
2011	ntfs_debug("Entering for mft record 0x%llx.", (long long)mft_no);
2012	if (mft_no >= (1ll << 32)) {
2013		ntfs_error(vol->sb, "Mft record number 0x%llx exceeds "
2014				"maximum of 2^32.", (long long)mft_no);
2015		return -ERANGE;
2016	}
2017	/* Start by clearing the whole mft record to gives us a clean slate. */
2018	memset(m, 0, vol->mft_record_size);
2019	/* Aligned to 2-byte boundary. */
2020	if (vol->major_ver < 3 || (vol->major_ver == 3 && !vol->minor_ver))
2021		m->usa_ofs = cpu_to_le16((sizeof(MFT_RECORD_OLD) + 1) & ~1);
2022	else {
2023		m->usa_ofs = cpu_to_le16((sizeof(MFT_RECORD) + 1) & ~1);
2024		/*
2025		 * Set the NTFS 3.1+ specific fields while we know that the
2026		 * volume version is 3.1+.
2027		 */
2028		m->reserved = 0;
2029		m->mft_record_number = cpu_to_le32((u32)mft_no);
2030	}
2031	m->magic = magic_FILE;
2032	if (vol->mft_record_size >= NTFS_BLOCK_SIZE)
2033		m->usa_count = cpu_to_le16(vol->mft_record_size /
2034				NTFS_BLOCK_SIZE + 1);
2035	else {
2036		m->usa_count = cpu_to_le16(1);
2037		ntfs_warning(vol->sb, "Sector size is bigger than mft record "
2038				"size.  Setting usa_count to 1.  If chkdsk "
2039				"reports this as corruption, please email "
2040				"linux-ntfs-dev@lists.sourceforge.net stating "
2041				"that you saw this message and that the "
2042				"modified filesystem created was corrupt.  "
2043				"Thank you.");
2044	}
2045	/* Set the update sequence number to 1. */
2046	*(le16*)((u8*)m + le16_to_cpu(m->usa_ofs)) = cpu_to_le16(1);
2047	m->lsn = 0;
2048	m->sequence_number = cpu_to_le16(1);
2049	m->link_count = 0;
2050	/*
2051	 * Place the attributes straight after the update sequence array,
2052	 * aligned to 8-byte boundary.
2053	 */
2054	m->attrs_offset = cpu_to_le16((le16_to_cpu(m->usa_ofs) +
2055			(le16_to_cpu(m->usa_count) << 1) + 7) & ~7);
2056	m->flags = 0;
2057	/*
2058	 * Using attrs_offset plus eight bytes (for the termination attribute).
2059	 * attrs_offset is already aligned to 8-byte boundary, so no need to
2060	 * align again.
2061	 */
2062	m->bytes_in_use = cpu_to_le32(le16_to_cpu(m->attrs_offset) + 8);
2063	m->bytes_allocated = cpu_to_le32(vol->mft_record_size);
2064	m->base_mft_record = 0;
2065	m->next_attr_instance = 0;
2066	/* Add the termination attribute. */
2067	a = (ATTR_RECORD*)((u8*)m + le16_to_cpu(m->attrs_offset));
2068	a->type = AT_END;
2069	a->length = 0;
2070	ntfs_debug("Done.");
2071	return 0;
2072}
2073
2074/**
2075 * ntfs_mft_record_format - format an mft record on an ntfs volume
2076 * @vol:	volume on which to format the mft record
2077 * @mft_no:	mft record number to format
2078 *
2079 * Format the mft record @mft_no in $MFT/$DATA, i.e. lay out an empty, unused
2080 * mft record into the appropriate place of the mft data attribute.  This is
2081 * used when extending the mft data attribute.
2082 *
2083 * Return 0 on success and -errno on error.
2084 */
2085static int ntfs_mft_record_format(const ntfs_volume *vol, const s64 mft_no)
2086{
2087	loff_t i_size;
2088	struct inode *mft_vi = vol->mft_ino;
2089	struct page *page;
2090	MFT_RECORD *m;
2091	pgoff_t index, end_index;
2092	unsigned int ofs;
2093	int err;
2094
2095	ntfs_debug("Entering for mft record 0x%llx.", (long long)mft_no);
2096	/*
2097	 * The index into the page cache and the offset within the page cache
2098	 * page of the wanted mft record.
2099	 */
2100	index = mft_no << vol->mft_record_size_bits >> PAGE_CACHE_SHIFT;
2101	ofs = (mft_no << vol->mft_record_size_bits) & ~PAGE_CACHE_MASK;
2102	/* The maximum valid index into the page cache for $MFT's data. */
2103	i_size = i_size_read(mft_vi);
2104	end_index = i_size >> PAGE_CACHE_SHIFT;
2105	if (unlikely(index >= end_index)) {
2106		if (unlikely(index > end_index || ofs + vol->mft_record_size >=
2107				(i_size & ~PAGE_CACHE_MASK))) {
2108			ntfs_error(vol->sb, "Tried to format non-existing mft "
2109					"record 0x%llx.", (long long)mft_no);
2110			return -ENOENT;
2111		}
2112	}
2113	/* Read, map, and pin the page containing the mft record. */
2114	page = ntfs_map_page(mft_vi->i_mapping, index);
2115	if (unlikely(IS_ERR(page))) {
2116		ntfs_error(vol->sb, "Failed to map page containing mft record "
2117				"to format 0x%llx.", (long long)mft_no);
2118		return PTR_ERR(page);
2119	}
2120	lock_page(page);
2121	BUG_ON(!PageUptodate(page));
2122	ClearPageUptodate(page);
2123	m = (MFT_RECORD*)((u8*)page_address(page) + ofs);
2124	err = ntfs_mft_record_layout(vol, mft_no, m);
2125	if (unlikely(err)) {
2126		ntfs_error(vol->sb, "Failed to layout mft record 0x%llx.",
2127				(long long)mft_no);
2128		SetPageUptodate(page);
2129		unlock_page(page);
2130		ntfs_unmap_page(page);
2131		return err;
2132	}
2133	flush_dcache_page(page);
2134	SetPageUptodate(page);
2135	unlock_page(page);
2136	/*
2137	 * Make sure the mft record is written out to disk.  We could use
2138	 * ilookup5() to check if an inode is in icache and so on but this is
2139	 * unnecessary as ntfs_writepage() will write the dirty record anyway.
2140	 */
2141	mark_ntfs_record_dirty(page, ofs);
2142	ntfs_unmap_page(page);
2143	ntfs_debug("Done.");
2144	return 0;
2145}
2146
2147/**
2148 * ntfs_mft_record_alloc - allocate an mft record on an ntfs volume
2149 * @vol:	[IN]  volume on which to allocate the mft record
2150 * @mode:	[IN]  mode if want a file or directory, i.e. base inode or 0
2151 * @base_ni:	[IN]  open base inode if allocating an extent mft record or NULL
2152 * @mrec:	[OUT] on successful return this is the mapped mft record
2153 *
2154 * Allocate an mft record in $MFT/$DATA of an open ntfs volume @vol.
2155 *
2156 * If @base_ni is NULL make the mft record a base mft record, i.e. a file or
2157 * direvctory inode, and allocate it at the default allocator position.  In
2158 * this case @mode is the file mode as given to us by the caller.  We in
2159 * particular use @mode to distinguish whether a file or a directory is being
2160 * created (S_IFDIR(mode) and S_IFREG(mode), respectively).
2161 *
2162 * If @base_ni is not NULL make the allocated mft record an extent record,
2163 * allocate it starting at the mft record after the base mft record and attach
2164 * the allocated and opened ntfs inode to the base inode @base_ni.  In this
2165 * case @mode must be 0 as it is meaningless for extent inodes.
2166 *
2167 * You need to check the return value with IS_ERR().  If false, the function
2168 * was successful and the return value is the now opened ntfs inode of the
2169 * allocated mft record.  *@mrec is then set to the allocated, mapped, pinned,
2170 * and locked mft record.  If IS_ERR() is true, the function failed and the
2171 * error code is obtained from PTR_ERR(return value).  *@mrec is undefined in
2172 * this case.
2173 *
2174 * Allocation strategy:
2175 *
2176 * To find a free mft record, we scan the mft bitmap for a zero bit.  To
2177 * optimize this we start scanning at the place specified by @base_ni or if
2178 * @base_ni is NULL we start where we last stopped and we perform wrap around
2179 * when we reach the end.  Note, we do not try to allocate mft records below
2180 * number 24 because numbers 0 to 15 are the defined system files anyway and 16
2181 * to 24 are special in that they are used for storing extension mft records
2182 * for the $DATA attribute of $MFT.  This is required to avoid the possibility
2183 * of creating a runlist with a circular dependency which once written to disk
2184 * can never be read in again.  Windows will only use records 16 to 24 for
2185 * normal files if the volume is completely out of space.  We never use them
2186 * which means that when the volume is really out of space we cannot create any
2187 * more files while Windows can still create up to 8 small files.  We can start
2188 * doing this at some later time, it does not matter much for now.
2189 *
2190 * When scanning the mft bitmap, we only search up to the last allocated mft
2191 * record.  If there are no free records left in the range 24 to number of
2192 * allocated mft records, then we extend the $MFT/$DATA attribute in order to
2193 * create free mft records.  We extend the allocated size of $MFT/$DATA by 16
2194 * records at a time or one cluster, if cluster size is above 16kiB.  If there
2195 * is not sufficient space to do this, we try to extend by a single mft record
2196 * or one cluster, if cluster size is above the mft record size.
2197 *
2198 * No matter how many mft records we allocate, we initialize only the first
2199 * allocated mft record, incrementing mft data size and initialized size
2200 * accordingly, open an ntfs_inode for it and return it to the caller, unless
2201 * there are less than 24 mft records, in which case we allocate and initialize
2202 * mft records until we reach record 24 which we consider as the first free mft
2203 * record for use by normal files.
2204 *
2205 * If during any stage we overflow the initialized data in the mft bitmap, we
2206 * extend the initialized size (and data size) by 8 bytes, allocating another
2207 * cluster if required.  The bitmap data size has to be at least equal to the
2208 * number of mft records in the mft, but it can be bigger, in which case the
2209 * superflous bits are padded with zeroes.
2210 *
2211 * Thus, when we return successfully (IS_ERR() is false), we will have:
2212 *	- initialized / extended the mft bitmap if necessary,
2213 *	- initialized / extended the mft data if necessary,
2214 *	- set the bit corresponding to the mft record being allocated in the
2215 *	  mft bitmap,
2216 *	- opened an ntfs_inode for the allocated mft record, and we will have
2217 *	- returned the ntfs_inode as well as the allocated mapped, pinned, and
2218 *	  locked mft record.
2219 *
2220 * On error, the volume will be left in a consistent state and no record will
2221 * be allocated.  If rolling back a partial operation fails, we may leave some
2222 * inconsistent metadata in which case we set NVolErrors() so the volume is
2223 * left dirty when unmounted.
2224 *
2225 * Note, this function cannot make use of most of the normal functions, like
2226 * for example for attribute resizing, etc, because when the run list overflows
2227 * the base mft record and an attribute list is used, it is very important that
2228 * the extension mft records used to store the $DATA attribute of $MFT can be
2229 * reached without having to read the information contained inside them, as
2230 * this would make it impossible to find them in the first place after the
2231 * volume is unmounted.  $MFT/$BITMAP probably does not need to follow this
2232 * rule because the bitmap is not essential for finding the mft records, but on
2233 * the other hand, handling the bitmap in this special way would make life
2234 * easier because otherwise there might be circular invocations of functions
2235 * when reading the bitmap.
2236 */
2237ntfs_inode *ntfs_mft_record_alloc(ntfs_volume *vol, const int mode,
2238		ntfs_inode *base_ni, MFT_RECORD **mrec)
2239{
2240	s64 ll, bit, old_data_initialized, old_data_size;
2241	unsigned long flags;
2242	struct inode *vi;
2243	struct page *page;
2244	ntfs_inode *mft_ni, *mftbmp_ni, *ni;
2245	ntfs_attr_search_ctx *ctx;
2246	MFT_RECORD *m;
2247	ATTR_RECORD *a;
2248	pgoff_t index;
2249	unsigned int ofs;
2250	int err;
2251	le16 seq_no, usn;
2252	bool record_formatted = false;
2253
2254	if (base_ni) {
2255		ntfs_debug("Entering (allocating an extent mft record for "
2256				"base mft record 0x%llx).",
2257				(long long)base_ni->mft_no);
2258		/* @mode and @base_ni are mutually exclusive. */
2259		BUG_ON(mode);
2260	} else
2261		ntfs_debug("Entering (allocating a base mft record).");
2262	if (mode) {
2263		/* @mode and @base_ni are mutually exclusive. */
2264		BUG_ON(base_ni);
2265		/* We only support creation of normal files and directories. */
2266		if (!S_ISREG(mode) && !S_ISDIR(mode))
2267			return ERR_PTR(-EOPNOTSUPP);
2268	}
2269	BUG_ON(!mrec);
2270	mft_ni = NTFS_I(vol->mft_ino);
2271	mftbmp_ni = NTFS_I(vol->mftbmp_ino);
2272	down_write(&vol->mftbmp_lock);
2273	bit = ntfs_mft_bitmap_find_and_alloc_free_rec_nolock(vol, base_ni);
2274	if (bit >= 0) {
2275		ntfs_debug("Found and allocated free record (#1), bit 0x%llx.",
2276				(long long)bit);
2277		goto have_alloc_rec;
2278	}
2279	if (bit != -ENOSPC) {
2280		up_write(&vol->mftbmp_lock);
2281		return ERR_PTR(bit);
2282	}
2283	/*
2284	 * No free mft records left.  If the mft bitmap already covers more
2285	 * than the currently used mft records, the next records are all free,
2286	 * so we can simply allocate the first unused mft record.
2287	 * Note: We also have to make sure that the mft bitmap at least covers
2288	 * the first 24 mft records as they are special and whilst they may not
2289	 * be in use, we do not allocate from them.
2290	 */
2291	read_lock_irqsave(&mft_ni->size_lock, flags);
2292	ll = mft_ni->initialized_size >> vol->mft_record_size_bits;
2293	read_unlock_irqrestore(&mft_ni->size_lock, flags);
2294	read_lock_irqsave(&mftbmp_ni->size_lock, flags);
2295	old_data_initialized = mftbmp_ni->initialized_size;
2296	read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
2297	if (old_data_initialized << 3 > ll && old_data_initialized > 3) {
2298		bit = ll;
2299		if (bit < 24)
2300			bit = 24;
2301		if (unlikely(bit >= (1ll << 32)))
2302			goto max_err_out;
2303		ntfs_debug("Found free record (#2), bit 0x%llx.",
2304				(long long)bit);
2305		goto found_free_rec;
2306	}
2307	/*
2308	 * The mft bitmap needs to be expanded until it covers the first unused
2309	 * mft record that we can allocate.
2310	 * Note: The smallest mft record we allocate is mft record 24.
2311	 */
2312	bit = old_data_initialized << 3;
2313	if (unlikely(bit >= (1ll << 32)))
2314		goto max_err_out;
2315	read_lock_irqsave(&mftbmp_ni->size_lock, flags);
2316	old_data_size = mftbmp_ni->allocated_size;
2317	ntfs_debug("Status of mftbmp before extension: allocated_size 0x%llx, "
2318			"data_size 0x%llx, initialized_size 0x%llx.",
2319			(long long)old_data_size,
2320			(long long)i_size_read(vol->mftbmp_ino),
2321			(long long)old_data_initialized);
2322	read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
2323	if (old_data_initialized + 8 > old_data_size) {
2324		/* Need to extend bitmap by one more cluster. */
2325		ntfs_debug("mftbmp: initialized_size + 8 > allocated_size.");
2326		err = ntfs_mft_bitmap_extend_allocation_nolock(vol);
2327		if (unlikely(err)) {
2328			up_write(&vol->mftbmp_lock);
2329			goto err_out;
2330		}
2331#ifdef DEBUG
2332		read_lock_irqsave(&mftbmp_ni->size_lock, flags);
2333		ntfs_debug("Status of mftbmp after allocation extension: "
2334				"allocated_size 0x%llx, data_size 0x%llx, "
2335				"initialized_size 0x%llx.",
2336				(long long)mftbmp_ni->allocated_size,
2337				(long long)i_size_read(vol->mftbmp_ino),
2338				(long long)mftbmp_ni->initialized_size);
2339		read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
2340#endif /* DEBUG */
2341	}
2342	/*
2343	 * We now have sufficient allocated space, extend the initialized_size
2344	 * as well as the data_size if necessary and fill the new space with
2345	 * zeroes.
2346	 */
2347	err = ntfs_mft_bitmap_extend_initialized_nolock(vol);
2348	if (unlikely(err)) {
2349		up_write(&vol->mftbmp_lock);
2350		goto err_out;
2351	}
2352#ifdef DEBUG
2353	read_lock_irqsave(&mftbmp_ni->size_lock, flags);
2354	ntfs_debug("Status of mftbmp after initialized extention: "
2355			"allocated_size 0x%llx, data_size 0x%llx, "
2356			"initialized_size 0x%llx.",
2357			(long long)mftbmp_ni->allocated_size,
2358			(long long)i_size_read(vol->mftbmp_ino),
2359			(long long)mftbmp_ni->initialized_size);
2360	read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
2361#endif /* DEBUG */
2362	ntfs_debug("Found free record (#3), bit 0x%llx.", (long long)bit);
2363found_free_rec:
2364	/* @bit is the found free mft record, allocate it in the mft bitmap. */
2365	ntfs_debug("At found_free_rec.");
2366	err = ntfs_bitmap_set_bit(vol->mftbmp_ino, bit);
2367	if (unlikely(err)) {
2368		ntfs_error(vol->sb, "Failed to allocate bit in mft bitmap.");
2369		up_write(&vol->mftbmp_lock);
2370		goto err_out;
2371	}
2372	ntfs_debug("Set bit 0x%llx in mft bitmap.", (long long)bit);
2373have_alloc_rec:
2374	/*
2375	 * The mft bitmap is now uptodate.  Deal with mft data attribute now.
2376	 * Note, we keep hold of the mft bitmap lock for writing until all
2377	 * modifications to the mft data attribute are complete, too, as they
2378	 * will impact decisions for mft bitmap and mft record allocation done
2379	 * by a parallel allocation and if the lock is not maintained a
2380	 * parallel allocation could allocate the same mft record as this one.
2381	 */
2382	ll = (bit + 1) << vol->mft_record_size_bits;
2383	read_lock_irqsave(&mft_ni->size_lock, flags);
2384	old_data_initialized = mft_ni->initialized_size;
2385	read_unlock_irqrestore(&mft_ni->size_lock, flags);
2386	if (ll <= old_data_initialized) {
2387		ntfs_debug("Allocated mft record already initialized.");
2388		goto mft_rec_already_initialized;
2389	}
2390	ntfs_debug("Initializing allocated mft record.");
2391	/*
2392	 * The mft record is outside the initialized data.  Extend the mft data
2393	 * attribute until it covers the allocated record.  The loop is only
2394	 * actually traversed more than once when a freshly formatted volume is
2395	 * first written to so it optimizes away nicely in the common case.
2396	 */
2397	read_lock_irqsave(&mft_ni->size_lock, flags);
2398	ntfs_debug("Status of mft data before extension: "
2399			"allocated_size 0x%llx, data_size 0x%llx, "
2400			"initialized_size 0x%llx.",
2401			(long long)mft_ni->allocated_size,
2402			(long long)i_size_read(vol->mft_ino),
2403			(long long)mft_ni->initialized_size);
2404	while (ll > mft_ni->allocated_size) {
2405		read_unlock_irqrestore(&mft_ni->size_lock, flags);
2406		err = ntfs_mft_data_extend_allocation_nolock(vol);
2407		if (unlikely(err)) {
2408			ntfs_error(vol->sb, "Failed to extend mft data "
2409					"allocation.");
2410			goto undo_mftbmp_alloc_nolock;
2411		}
2412		read_lock_irqsave(&mft_ni->size_lock, flags);
2413		ntfs_debug("Status of mft data after allocation extension: "
2414				"allocated_size 0x%llx, data_size 0x%llx, "
2415				"initialized_size 0x%llx.",
2416				(long long)mft_ni->allocated_size,
2417				(long long)i_size_read(vol->mft_ino),
2418				(long long)mft_ni->initialized_size);
2419	}
2420	read_unlock_irqrestore(&mft_ni->size_lock, flags);
2421	/*
2422	 * Extend mft data initialized size (and data size of course) to reach
2423	 * the allocated mft record, formatting the mft records allong the way.
2424	 * Note: We only modify the ntfs_inode structure as that is all that is
2425	 * needed by ntfs_mft_record_format().  We will update the attribute
2426	 * record itself in one fell swoop later on.
2427	 */
2428	write_lock_irqsave(&mft_ni->size_lock, flags);
2429	old_data_initialized = mft_ni->initialized_size;
2430	old_data_size = vol->mft_ino->i_size;
2431	while (ll > mft_ni->initialized_size) {
2432		s64 new_initialized_size, mft_no;
2433
2434		new_initialized_size = mft_ni->initialized_size +
2435				vol->mft_record_size;
2436		mft_no = mft_ni->initialized_size >> vol->mft_record_size_bits;
2437		if (new_initialized_size > i_size_read(vol->mft_ino))
2438			i_size_write(vol->mft_ino, new_initialized_size);
2439		write_unlock_irqrestore(&mft_ni->size_lock, flags);
2440		ntfs_debug("Initializing mft record 0x%llx.",
2441				(long long)mft_no);
2442		err = ntfs_mft_record_format(vol, mft_no);
2443		if (unlikely(err)) {
2444			ntfs_error(vol->sb, "Failed to format mft record.");
2445			goto undo_data_init;
2446		}
2447		write_lock_irqsave(&mft_ni->size_lock, flags);
2448		mft_ni->initialized_size = new_initialized_size;
2449	}
2450	write_unlock_irqrestore(&mft_ni->size_lock, flags);
2451	record_formatted = true;
2452	/* Update the mft data attribute record to reflect the new sizes. */
2453	m = map_mft_record(mft_ni);
2454	if (IS_ERR(m)) {
2455		ntfs_error(vol->sb, "Failed to map mft record.");
2456		err = PTR_ERR(m);
2457		goto undo_data_init;
2458	}
2459	ctx = ntfs_attr_get_search_ctx(mft_ni, m);
2460	if (unlikely(!ctx)) {
2461		ntfs_error(vol->sb, "Failed to get search context.");
2462		err = -ENOMEM;
2463		unmap_mft_record(mft_ni);
2464		goto undo_data_init;
2465	}
2466	err = ntfs_attr_lookup(mft_ni->type, mft_ni->name, mft_ni->name_len,
2467			CASE_SENSITIVE, 0, NULL, 0, ctx);
2468	if (unlikely(err)) {
2469		ntfs_error(vol->sb, "Failed to find first attribute extent of "
2470				"mft data attribute.");
2471		ntfs_attr_put_search_ctx(ctx);
2472		unmap_mft_record(mft_ni);
2473		goto undo_data_init;
2474	}
2475	a = ctx->attr;
2476	read_lock_irqsave(&mft_ni->size_lock, flags);
2477	a->data.non_resident.initialized_size =
2478			cpu_to_sle64(mft_ni->initialized_size);
2479	a->data.non_resident.data_size =
2480			cpu_to_sle64(i_size_read(vol->mft_ino));
2481	read_unlock_irqrestore(&mft_ni->size_lock, flags);
2482	/* Ensure the changes make it to disk. */
2483	flush_dcache_mft_record_page(ctx->ntfs_ino);
2484	mark_mft_record_dirty(ctx->ntfs_ino);
2485	ntfs_attr_put_search_ctx(ctx);
2486	unmap_mft_record(mft_ni);
2487	read_lock_irqsave(&mft_ni->size_lock, flags);
2488	ntfs_debug("Status of mft data after mft record initialization: "
2489			"allocated_size 0x%llx, data_size 0x%llx, "
2490			"initialized_size 0x%llx.",
2491			(long long)mft_ni->allocated_size,
2492			(long long)i_size_read(vol->mft_ino),
2493			(long long)mft_ni->initialized_size);
2494	BUG_ON(i_size_read(vol->mft_ino) > mft_ni->allocated_size);
2495	BUG_ON(mft_ni->initialized_size > i_size_read(vol->mft_ino));
2496	read_unlock_irqrestore(&mft_ni->size_lock, flags);
2497mft_rec_already_initialized:
2498	/*
2499	 * We can finally drop the mft bitmap lock as the mft data attribute
2500	 * has been fully updated.  The only disparity left is that the
2501	 * allocated mft record still needs to be marked as in use to match the
2502	 * set bit in the mft bitmap but this is actually not a problem since
2503	 * this mft record is not referenced from anywhere yet and the fact
2504	 * that it is allocated in the mft bitmap means that no-one will try to
2505	 * allocate it either.
2506	 */
2507	up_write(&vol->mftbmp_lock);
2508	/*
2509	 * We now have allocated and initialized the mft record.  Calculate the
2510	 * index of and the offset within the page cache page the record is in.
2511	 */
2512	index = bit << vol->mft_record_size_bits >> PAGE_CACHE_SHIFT;
2513	ofs = (bit << vol->mft_record_size_bits) & ~PAGE_CACHE_MASK;
2514	/* Read, map, and pin the page containing the mft record. */
2515	page = ntfs_map_page(vol->mft_ino->i_mapping, index);
2516	if (unlikely(IS_ERR(page))) {
2517		ntfs_error(vol->sb, "Failed to map page containing allocated "
2518				"mft record 0x%llx.", (long long)bit);
2519		err = PTR_ERR(page);
2520		goto undo_mftbmp_alloc;
2521	}
2522	lock_page(page);
2523	BUG_ON(!PageUptodate(page));
2524	ClearPageUptodate(page);
2525	m = (MFT_RECORD*)((u8*)page_address(page) + ofs);
2526	/* If we just formatted the mft record no need to do it again. */
2527	if (!record_formatted) {
2528		/* Sanity check that the mft record is really not in use. */
2529		if (ntfs_is_file_record(m->magic) &&
2530				(m->flags & MFT_RECORD_IN_USE)) {
2531			ntfs_error(vol->sb, "Mft record 0x%llx was marked "
2532					"free in mft bitmap but is marked "
2533					"used itself.  Corrupt filesystem.  "
2534					"Unmount and run chkdsk.",
2535					(long long)bit);
2536			err = -EIO;
2537			SetPageUptodate(page);
2538			unlock_page(page);
2539			ntfs_unmap_page(page);
2540			NVolSetErrors(vol);
2541			goto undo_mftbmp_alloc;
2542		}
2543		/*
2544		 * We need to (re-)format the mft record, preserving the
2545		 * sequence number if it is not zero as well as the update
2546		 * sequence number if it is not zero or -1 (0xffff).  This
2547		 * means we do not need to care whether or not something went
2548		 * wrong with the previous mft record.
2549		 */
2550		seq_no = m->sequence_number;
2551		usn = *(le16*)((u8*)m + le16_to_cpu(m->usa_ofs));
2552		err = ntfs_mft_record_layout(vol, bit, m);
2553		if (unlikely(err)) {
2554			ntfs_error(vol->sb, "Failed to layout allocated mft "
2555					"record 0x%llx.", (long long)bit);
2556			SetPageUptodate(page);
2557			unlock_page(page);
2558			ntfs_unmap_page(page);
2559			goto undo_mftbmp_alloc;
2560		}
2561		if (seq_no)
2562			m->sequence_number = seq_no;
2563		if (usn && le16_to_cpu(usn) != 0xffff)
2564			*(le16*)((u8*)m + le16_to_cpu(m->usa_ofs)) = usn;
2565	}
2566	/* Set the mft record itself in use. */
2567	m->flags |= MFT_RECORD_IN_USE;
2568	if (S_ISDIR(mode))
2569		m->flags |= MFT_RECORD_IS_DIRECTORY;
2570	flush_dcache_page(page);
2571	SetPageUptodate(page);
2572	if (base_ni) {
2573		/*
2574		 * Setup the base mft record in the extent mft record.  This
2575		 * completes initialization of the allocated extent mft record
2576		 * and we can simply use it with map_extent_mft_record().
2577		 */
2578		m->base_mft_record = MK_LE_MREF(base_ni->mft_no,
2579				base_ni->seq_no);
2580		/*
2581		 * Allocate an extent inode structure for the new mft record,
2582		 * attach it to the base inode @base_ni and map, pin, and lock
2583		 * its, i.e. the allocated, mft record.
2584		 */
2585		m = map_extent_mft_record(base_ni, bit, &ni);
2586		if (IS_ERR(m)) {
2587			ntfs_error(vol->sb, "Failed to map allocated extent "
2588					"mft record 0x%llx.", (long long)bit);
2589			err = PTR_ERR(m);
2590			/* Set the mft record itself not in use. */
2591			m->flags &= cpu_to_le16(
2592					~le16_to_cpu(MFT_RECORD_IN_USE));
2593			flush_dcache_page(page);
2594			/* Make sure the mft record is written out to disk. */
2595			mark_ntfs_record_dirty(page, ofs);
2596			unlock_page(page);
2597			ntfs_unmap_page(page);
2598			goto undo_mftbmp_alloc;
2599		}
2600		/*
2601		 * Make sure the allocated mft record is written out to disk.
2602		 * No need to set the inode dirty because the caller is going
2603		 * to do that anyway after finishing with the new extent mft
2604		 * record (e.g. at a minimum a new attribute will be added to
2605		 * the mft record).
2606		 */
2607		mark_ntfs_record_dirty(page, ofs);
2608		unlock_page(page);
2609		/*
2610		 * Need to unmap the page since map_extent_mft_record() mapped
2611		 * it as well so we have it mapped twice at the moment.
2612		 */
2613		ntfs_unmap_page(page);
2614	} else {
2615		/*
2616		 * Allocate a new VFS inode and set it up.  NOTE: @vi->i_nlink
2617		 * is set to 1 but the mft record->link_count is 0.  The caller
2618		 * needs to bear this in mind.
2619		 */
2620		vi = new_inode(vol->sb);
2621		if (unlikely(!vi)) {
2622			err = -ENOMEM;
2623			/* Set the mft record itself not in use. */
2624			m->flags &= cpu_to_le16(
2625					~le16_to_cpu(MFT_RECORD_IN_USE));
2626			flush_dcache_page(page);
2627			/* Make sure the mft record is written out to disk. */
2628			mark_ntfs_record_dirty(page, ofs);
2629			unlock_page(page);
2630			ntfs_unmap_page(page);
2631			goto undo_mftbmp_alloc;
2632		}
2633		vi->i_ino = bit;
2634		/*
2635		 * This is for checking whether an inode has changed w.r.t. a
2636		 * file so that the file can be updated if necessary (compare
2637		 * with f_version).
2638		 */
2639		vi->i_version = 1;
2640
2641		/* The owner and group come from the ntfs volume. */
2642		vi->i_uid = vol->uid;
2643		vi->i_gid = vol->gid;
2644
2645		/* Initialize the ntfs specific part of @vi. */
2646		ntfs_init_big_inode(vi);
2647		ni = NTFS_I(vi);
2648		/*
2649		 * Set the appropriate mode, attribute type, and name.  For
2650		 * directories, also setup the index values to the defaults.
2651		 */
2652		if (S_ISDIR(mode)) {
2653			vi->i_mode = S_IFDIR | S_IRWXUGO;
2654			vi->i_mode &= ~vol->dmask;
2655
2656			NInoSetMstProtected(ni);
2657			ni->type = AT_INDEX_ALLOCATION;
2658			ni->name = I30;
2659			ni->name_len = 4;
2660
2661			ni->itype.index.block_size = 4096;
2662			ni->itype.index.block_size_bits = ntfs_ffs(4096) - 1;
2663			ni->itype.index.collation_rule = COLLATION_FILE_NAME;
2664			if (vol->cluster_size <= ni->itype.index.block_size) {
2665				ni->itype.index.vcn_size = vol->cluster_size;
2666				ni->itype.index.vcn_size_bits =
2667						vol->cluster_size_bits;
2668			} else {
2669				ni->itype.index.vcn_size = vol->sector_size;
2670				ni->itype.index.vcn_size_bits =
2671						vol->sector_size_bits;
2672			}
2673		} else {
2674			vi->i_mode = S_IFREG | S_IRWXUGO;
2675			vi->i_mode &= ~vol->fmask;
2676
2677			ni->type = AT_DATA;
2678			ni->name = NULL;
2679			ni->name_len = 0;
2680		}
2681		if (IS_RDONLY(vi))
2682			vi->i_mode &= ~S_IWUGO;
2683
2684		/* Set the inode times to the current time. */
2685		vi->i_atime = vi->i_mtime = vi->i_ctime =
2686			current_fs_time(vi->i_sb);
2687		/*
2688		 * Set the file size to 0, the ntfs inode sizes are set to 0 by
2689		 * the call to ntfs_init_big_inode() above.
2690		 */
2691		vi->i_size = 0;
2692		vi->i_blocks = 0;
2693
2694		/* Set the sequence number. */
2695		vi->i_generation = ni->seq_no = le16_to_cpu(m->sequence_number);
2696		/*
2697		 * Manually map, pin, and lock the mft record as we already
2698		 * have its page mapped and it is very easy to do.
2699		 */
2700		atomic_inc(&ni->count);
2701		mutex_lock(&ni->mrec_lock);
2702		ni->page = page;
2703		ni->page_ofs = ofs;
2704		/*
2705		 * Make sure the allocated mft record is written out to disk.
2706		 * NOTE: We do not set the ntfs inode dirty because this would
2707		 * fail in ntfs_write_inode() because the inode does not have a
2708		 * standard information attribute yet.  Also, there is no need
2709		 * to set the inode dirty because the caller is going to do
2710		 * that anyway after finishing with the new mft record (e.g. at
2711		 * a minimum some new attributes will be added to the mft
2712		 * record).
2713		 */
2714		mark_ntfs_record_dirty(page, ofs);
2715		unlock_page(page);
2716
2717		/* Add the inode to the inode hash for the superblock. */
2718		insert_inode_hash(vi);
2719
2720		/* Update the default mft allocation position. */
2721		vol->mft_data_pos = bit + 1;
2722	}
2723	/*
2724	 * Return the opened, allocated inode of the allocated mft record as
2725	 * well as the mapped, pinned, and locked mft record.
2726	 */
2727	ntfs_debug("Returning opened, allocated %sinode 0x%llx.",
2728			base_ni ? "extent " : "", (long long)bit);
2729	*mrec = m;
2730	return ni;
2731undo_data_init:
2732	write_lock_irqsave(&mft_ni->size_lock, flags);
2733	mft_ni->initialized_size = old_data_initialized;
2734	i_size_write(vol->mft_ino, old_data_size);
2735	write_unlock_irqrestore(&mft_ni->size_lock, flags);
2736	goto undo_mftbmp_alloc_nolock;
2737undo_mftbmp_alloc:
2738	down_write(&vol->mftbmp_lock);
2739undo_mftbmp_alloc_nolock:
2740	if (ntfs_bitmap_clear_bit(vol->mftbmp_ino, bit)) {
2741		ntfs_error(vol->sb, "Failed to clear bit in mft bitmap.%s", es);
2742		NVolSetErrors(vol);
2743	}
2744	up_write(&vol->mftbmp_lock);
2745err_out:
2746	return ERR_PTR(err);
2747max_err_out:
2748	ntfs_warning(vol->sb, "Cannot allocate mft record because the maximum "
2749			"number of inodes (2^32) has already been reached.");
2750	up_write(&vol->mftbmp_lock);
2751	return ERR_PTR(-ENOSPC);
2752}
2753
2754/**
2755 * ntfs_extent_mft_record_free - free an extent mft record on an ntfs volume
2756 * @ni:		ntfs inode of the mapped extent mft record to free
2757 * @m:		mapped extent mft record of the ntfs inode @ni
2758 *
2759 * Free the mapped extent mft record @m of the extent ntfs inode @ni.
2760 *
2761 * Note that this function unmaps the mft record and closes and destroys @ni
2762 * internally and hence you cannot use either @ni nor @m any more after this
2763 * function returns success.
2764 *
2765 * On success return 0 and on error return -errno.  @ni and @m are still valid
2766 * in this case and have not been freed.
2767 *
2768 * For some errors an error message is displayed and the success code 0 is
2769 * returned and the volume is then left dirty on umount.  This makes sense in
2770 * case we could not rollback the changes that were already done since the
2771 * caller no longer wants to reference this mft record so it does not matter to
2772 * the caller if something is wrong with it as long as it is properly detached
2773 * from the base inode.
2774 */
2775int ntfs_extent_mft_record_free(ntfs_inode *ni, MFT_RECORD *m)
2776{
2777	unsigned long mft_no = ni->mft_no;
2778	ntfs_volume *vol = ni->vol;
2779	ntfs_inode *base_ni;
2780	ntfs_inode **extent_nis;
2781	int i, err;
2782	le16 old_seq_no;
2783	u16 seq_no;
2784
2785	BUG_ON(NInoAttr(ni));
2786	BUG_ON(ni->nr_extents != -1);
2787
2788	mutex_lock(&ni->extent_lock);
2789	base_ni = ni->ext.base_ntfs_ino;
2790	mutex_unlock(&ni->extent_lock);
2791
2792	BUG_ON(base_ni->nr_extents <= 0);
2793
2794	ntfs_debug("Entering for extent inode 0x%lx, base inode 0x%lx.\n",
2795			mft_no, base_ni->mft_no);
2796
2797	mutex_lock(&base_ni->extent_lock);
2798
2799	/* Make sure we are holding the only reference to the extent inode. */
2800	if (atomic_read(&ni->count) > 2) {
2801		ntfs_error(vol->sb, "Tried to free busy extent inode 0x%lx, "
2802				"not freeing.", base_ni->mft_no);
2803		mutex_unlock(&base_ni->extent_lock);
2804		return -EBUSY;
2805	}
2806
2807	/* Dissociate the ntfs inode from the base inode. */
2808	extent_nis = base_ni->ext.extent_ntfs_inos;
2809	err = -ENOENT;
2810	for (i = 0; i < base_ni->nr_extents; i++) {
2811		if (ni != extent_nis[i])
2812			continue;
2813		extent_nis += i;
2814		base_ni->nr_extents--;
2815		memmove(extent_nis, extent_nis + 1, (base_ni->nr_extents - i) *
2816				sizeof(ntfs_inode*));
2817		err = 0;
2818		break;
2819	}
2820
2821	mutex_unlock(&base_ni->extent_lock);
2822
2823	if (unlikely(err)) {
2824		ntfs_error(vol->sb, "Extent inode 0x%lx is not attached to "
2825				"its base inode 0x%lx.", mft_no,
2826				base_ni->mft_no);
2827		BUG();
2828	}
2829
2830	/*
2831	 * The extent inode is no longer attached to the base inode so no one
2832	 * can get a reference to it any more.
2833	 */
2834
2835	/* Mark the mft record as not in use. */
2836	m->flags &= const_cpu_to_le16(~const_le16_to_cpu(MFT_RECORD_IN_USE));
2837
2838	/* Increment the sequence number, skipping zero, if it is not zero. */
2839	old_seq_no = m->sequence_number;
2840	seq_no = le16_to_cpu(old_seq_no);
2841	if (seq_no == 0xffff)
2842		seq_no = 1;
2843	else if (seq_no)
2844		seq_no++;
2845	m->sequence_number = cpu_to_le16(seq_no);
2846
2847	/*
2848	 * Set the ntfs inode dirty and write it out.  We do not need to worry
2849	 * about the base inode here since whatever caused the extent mft
2850	 * record to be freed is guaranteed to do it already.
2851	 */
2852	NInoSetDirty(ni);
2853	err = write_mft_record(ni, m, 0);
2854	if (unlikely(err)) {
2855		ntfs_error(vol->sb, "Failed to write mft record 0x%lx, not "
2856				"freeing.", mft_no);
2857		goto rollback;
2858	}
2859rollback_error:
2860	/* Unmap and throw away the now freed extent inode. */
2861	unmap_extent_mft_record(ni);
2862	ntfs_clear_extent_inode(ni);
2863
2864	/* Clear the bit in the $MFT/$BITMAP corresponding to this record. */
2865	down_write(&vol->mftbmp_lock);
2866	err = ntfs_bitmap_clear_bit(vol->mftbmp_ino, mft_no);
2867	up_write(&vol->mftbmp_lock);
2868	if (unlikely(err)) {
2869		/*
2870		 * The extent inode is gone but we failed to deallocate it in
2871		 * the mft bitmap.  Just emit a warning and leave the volume
2872		 * dirty on umount.
2873		 */
2874		ntfs_error(vol->sb, "Failed to clear bit in mft bitmap.%s", es);
2875		NVolSetErrors(vol);
2876	}
2877	return 0;
2878rollback:
2879	/* Rollback what we did... */
2880	mutex_lock(&base_ni->extent_lock);
2881	extent_nis = base_ni->ext.extent_ntfs_inos;
2882	if (!(base_ni->nr_extents & 3)) {
2883		int new_size = (base_ni->nr_extents + 4) * sizeof(ntfs_inode*);
2884
2885		extent_nis = kmalloc(new_size, GFP_NOFS);
2886		if (unlikely(!extent_nis)) {
2887			ntfs_error(vol->sb, "Failed to allocate internal "
2888					"buffer during rollback.%s", es);
2889			mutex_unlock(&base_ni->extent_lock);
2890			NVolSetErrors(vol);
2891			goto rollback_error;
2892		}
2893		if (base_ni->nr_extents) {
2894			BUG_ON(!base_ni->ext.extent_ntfs_inos);
2895			memcpy(extent_nis, base_ni->ext.extent_ntfs_inos,
2896					new_size - 4 * sizeof(ntfs_inode*));
2897			kfree(base_ni->ext.extent_ntfs_inos);
2898		}
2899		base_ni->ext.extent_ntfs_inos = extent_nis;
2900	}
2901	m->flags |= MFT_RECORD_IN_USE;
2902	m->sequence_number = old_seq_no;
2903	extent_nis[base_ni->nr_extents++] = ni;
2904	mutex_unlock(&base_ni->extent_lock);
2905	mark_mft_record_dirty(ni);
2906	return err;
2907}
2908#endif /* NTFS_RW */
2909