1/*
2 * ntfs_index.c - NTFS kernel index handling.
3 *
4 * Copyright (c) 2006-2008 Anton Altaparmakov.  All Rights Reserved.
5 * Portions Copyright (c) 2006-2008 Apple Inc.  All Rights Reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright notice,
11 *    this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright notice,
13 *    this list of conditions and the following disclaimer in the documentation
14 *    and/or other materials provided with the distribution.
15 * 3. Neither the name of Apple Inc. ("Apple") nor the names of its
16 *    contributors may be used to endorse or promote products derived from this
17 *    software without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
20 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 * DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
23 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
24 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
26 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
28 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 *
30 * ALTERNATIVELY, provided that this notice and licensing terms are retained in
31 * full, this file may be redistributed and/or modified under the terms of the
32 * GNU General Public License (GPL) Version 2, in which case the provisions of
33 * that version of the GPL will apply to you instead of the license terms
34 * above.  You can obtain a copy of the GPL Version 2 at
35 * http://developer.apple.com/opensource/licenses/gpl-2.txt.
36 */
37
38#include <sys/errno.h>
39
40#include <string.h>
41
42#include <libkern/libkern.h>
43#include <libkern/OSMalloc.h>
44
45#include <kern/debug.h>
46#include <kern/locks.h>
47
48#include "ntfs.h"
49#include "ntfs_attr.h"
50#include "ntfs_attr_list.h"
51#include "ntfs_bitmap.h"
52#include "ntfs_collate.h"
53#include "ntfs_debug.h"
54#include "ntfs_dir.h"
55#include "ntfs_endian.h"
56#include "ntfs_index.h"
57#include "ntfs_inode.h"
58#include "ntfs_layout.h"
59#include "ntfs_lcnalloc.h"
60#include "ntfs_mft.h"
61#include "ntfs_page.h"
62#include "ntfs_runlist.h"
63#include "ntfs_types.h"
64#include "ntfs_unistr.h"
65#include "ntfs_volume.h"
66
67/**
68 * ntfs_index_ctx_unlock - unlock an index context
69 * @ictx:	index context to unlock
70 *
71 * Unlock the index context @ictx.  We also unmap the mft record (in index root
72 * case) or the page (in index allocation block case) thus all pointers into
73 * the index node need to be revalidated when the mft record or page is mapped
74 * again in ntfs_index_ctx_relock().
75 *
76 * Locking: - Caller must hold @ictx->idx_ni->lock on the index inode.
77 *	    - The index context @ictx must be locked.
78 */
79void ntfs_index_ctx_unlock(ntfs_index_context *ictx)
80{
81	if (!ictx)
82		panic("%s(): !ictx\n", __FUNCTION__);
83	if (!ictx->is_locked)
84		panic("%s(): !ictx->is_locked\n", __FUNCTION__);
85	if (ictx->is_root) {
86		if (ictx->actx) {
87			ntfs_attr_search_ctx_put(ictx->actx);
88			if (!ictx->base_ni)
89				panic("%s(): !ictx->base_ni\n", __FUNCTION__);
90			ntfs_mft_record_unmap(ictx->base_ni);
91			ictx->actx = NULL;
92		}
93	} else {
94		if (ictx->upl) {
95			ntfs_page_unmap(ictx->idx_ni, ictx->upl, ictx->pl,
96					ictx->is_dirty);
97			ictx->upl = NULL;
98			ictx->is_dirty = 0;
99		}
100	}
101	ictx->is_locked = 0;
102}
103
104/**
105 * ntfs_index_ctx_path_unlock - unlock an entire B+tree index context path
106 * @index_ctx:	index context whose whole path to unlock
107 *
108 * Unlock all index contexts attached to the B+tree path to which @index_ctx
109 * belongs.
110 *
111 * This function is only called in error handling to ensure nothing is held
112 * busy so the error handling code cannot deadlock.
113 *
114 * Locking: Caller must hold @index_ctx->idx_ni->lock on the index inode.
115 */
116static void ntfs_index_ctx_path_unlock(ntfs_index_context *index_ctx)
117{
118	ntfs_index_context *ictx;
119
120	ictx = index_ctx;
121	if (!ictx)
122		panic("%s(): !ictx\n", __FUNCTION__);
123	/*
124	 * Note we traverse the tree path backwards (upwards) because @up is
125	 * the first element in the index_context structure thus doing things
126	 * this way generates faster/better machine code.
127	 */
128	do {
129		if (ictx->is_locked)
130			ntfs_index_ctx_unlock(ictx);
131	} while ((ictx = ictx->up) != index_ctx);
132}
133
134/**
135 * ntfs_index_ctx_relock - relock an index context that was unlocked earlier
136 * @ictx:	index context to relock
137 *
138 * Relock the index context @ictx after it was unlocked with
139 * ntfs_index_ctx_unlock().  We also remap the mft record (in index root case)
140 * or the page (in index allocation block case) after which we revalidate all
141 * pointers into the index node because the page may have been mapped into a
142 * different virtual address and the mft record may have been changed with the
143 * result that the index root attribute is moved within the mft record or even
144 * to a completely different mft record.
145 *
 * Note the check whether to revalidate or not is very simple because the index
 * node content cannot have changed thus all pointers change by a fixed offset
 * delta which once determined can be applied to all pointers.
149 *
150 * In the index root case, there is also a non-pointer index context field that
151 * can have changed (and it does so irrespective of the index root position).
152 * This is @ictx->bytes_free as that is dependent on the other attributes in
153 * the mft record which can have changed legitimately under our feet which of
154 * course is the reason why the index root can have moved about, too.
155 *
156 * Return 0 on success and errno on error.
157 *
158 * Locking: - Caller must hold @ictx->idx_ni->lock on the index inode.
159 *	    - The index context @ictx must not be locked.
160 */
errno_t ntfs_index_ctx_relock(ntfs_index_context *ictx)
{
	long delta;
	errno_t err;

	if (!ictx)
		panic("%s(): !ictx\n", __FUNCTION__);
	if (ictx->is_locked)
		panic("%s(): ictx->is_locked\n", __FUNCTION__);
	if (ictx->is_root) {
		MFT_RECORD *m;
		ntfs_attr_search_ctx *actx;

		if (ictx->actx)
			panic("%s(): ictx->actx\n", __FUNCTION__);
		if (!ictx->base_ni)
			panic("%s(): !ictx->base_ni\n", __FUNCTION__);
		/*
		 * Index root case: remap the base mft record and look the
		 * index root attribute up again since both may have moved
		 * while the context was unlocked.
		 */
		err = ntfs_mft_record_map(ictx->base_ni, &m);
		if (err) {
			ntfs_error(ictx->idx_ni->vol->mp, "Failed to lock "
					"index root because "
					"ntfs_mft_record_map() failed (error "
					"%d).", err);
			return err;
		}
		ictx->actx = actx = ntfs_attr_search_ctx_get(ictx->base_ni, m);
		if (!actx) {
			ntfs_error(ictx->idx_ni->vol->mp, "Failed to allocate "
					"attribute search context.");
			err = ENOMEM;
			goto actx_err;
		}
		err = ntfs_attr_lookup(AT_INDEX_ROOT, ictx->idx_ni->name,
				ictx->idx_ni->name_len, 0, NULL, 0, actx);
		if (err) {
			/*
			 * The index root existed when the context was first
			 * set up; it disappearing now would be a fatal
			 * inconsistency.
			 */
			if (err == ENOENT)
				panic("%s(): err == ENOENT\n", __FUNCTION__);
			ntfs_error(ictx->idx_ni->vol->mp, "Failed to look up "
					"index root attribute (error %d).",
					err);
			goto actx_err;
		}
		/*
		 * Recompute the free space in the mft record; other
		 * attributes may have legitimately changed under our feet
		 * while the record was unmapped.
		 */
		ictx->bytes_free = le32_to_cpu(actx->m->bytes_allocated) -
				le32_to_cpu(actx->m->bytes_in_use);
		/* Get to the index root value. */
		ictx->ir = (INDEX_ROOT*)((u8*)actx->a +
				le16_to_cpu(actx->a->value_offset));
		/* How far the index node moved in memory, if at all. */
		delta = (u8*)&ictx->ir->index - (u8*)ictx->index;
		ictx->index = &ictx->ir->index;
	} else {
		u8 *addr;

		if (ictx->upl)
			panic("%s(): ictx->upl\n", __FUNCTION__);
		if (ictx->is_dirty)
			panic("%s(): ictx->is_dirty\n", __FUNCTION__);
		/*
		 * Index allocation block case: remap the page containing the
		 * index block at the previously recorded page offset.
		 */
		err = ntfs_page_map(ictx->idx_ni, ictx->upl_ofs, &ictx->upl,
				&ictx->pl, &addr, TRUE);
		if (err) {
			ntfs_error(ictx->idx_ni->vol->mp, "Failed to map "
					"index page (error %d).", err);
			ictx->upl = NULL;
			return err;
		}
		/* Get to the index allocation block. */
		delta = addr - ictx->addr;
		if (delta) {
			/*
			 * The page was mapped at a different virtual address;
			 * rebase the cached page and index block pointers.
			 */
			ictx->addr = addr;
			ictx->ia = (INDEX_ALLOCATION*)((u8*)ictx->ia + delta);
			ictx->index = &ictx->ia->index;
		}
	}
	if (delta) {
		INDEX_ENTRY **entries;
		unsigned u;

		/*
		 * The index node has moved thus we have to update all stored
		 * pointers so they point into the new memory location.
		 */
		ictx->entry = (INDEX_ENTRY*)((u8*)ictx->entry + delta);
		entries = ictx->entries;
		for (u = 0; u < ictx->nr_entries; u++)
			entries[u] = (INDEX_ENTRY*)((u8*)entries[u] + delta);
	}
	ictx->is_locked = 1;
	return 0;
actx_err:
	/* Error path: release the search context (if any) and the mft
	   record mapping taken above, leaving the context unlocked. */
	if (ictx->actx) {
		ntfs_attr_search_ctx_put(ictx->actx);
		ictx->actx = NULL;
	}
	ntfs_mft_record_unmap(ictx->base_ni);
	return err;
}
256
257/**
258 * ntfs_index_ctx_get - allocate and initialize a new index context
259 * @idx_ni:	ntfs index inode with which to initialize the context
260 *
261 * Allocate a new index context, initialize it with @idx_ni and return it.
262 *
263 * Return NULL if the allocation failed.
264 *
265 * Locking: Caller must hold @idx_ni->lock on the index inode.
266 */
267ntfs_index_context *ntfs_index_ctx_get(ntfs_inode *idx_ni)
268{
269	ntfs_index_context *ictx;
270
271	ictx = ntfs_index_ctx_alloc();
272	if (ictx)
273		ntfs_index_ctx_init(ictx, idx_ni);
274	return ictx;
275}
276
277/**
278 * ntfs_index_ctx_put_reuse_single - release an index context but do not free it
279 * @ictx:	index context to release
280 *
281 * Release the index context @ictx, releasing all associated resources but keep
282 * the index context itself allocated so it can be reused with a call to
283 * ntfs_index_ctx_init().
284 *
285 * This function ignores the tree path which this entry may be a part of and is
286 * only a helper function for ntfs_index_ctx_put_reuse().
287 *
288 * Locking: Caller must hold @ictx->idx_ni->lock on the index inode.
289 */
290void ntfs_index_ctx_put_reuse_single(ntfs_index_context *ictx)
291{
292	if (ictx->entry && ictx->is_locked)
293		ntfs_index_ctx_unlock(ictx);
294	if (ictx->entries)
295		OSFree(ictx->entries, ictx->max_entries * sizeof(INDEX_ENTRY*),
296				ntfs_malloc_tag);
297}
298
299/**
300 * ntfs_index_ctx_put_reuse - release an index context but do not free it
301 * @index_ctx:	index context to release
302 *
303 * Release the index context @index_ctx, releasing all associated resources but
304 * keep the index context itself allocated so it can be reused with a call to
305 * ntfs_index_ctx_init().
306 *
307 * Locking: Caller must hold @index_ctx->idx_ni->lock on the index inode.
308 */
309void ntfs_index_ctx_put_reuse(ntfs_index_context *index_ctx)
310{
311	ntfs_index_context *ictx, *next;
312
313	/*
314	 * Destroy all tree path components except @index_ctx itself.  We need
315	 * the temporary index context pointer @next because we deallocate the
316	 * current index context in each iteration of the loop thus we would no
317	 * longer have any means of finding the next index context in the tree
318	 * path if we had not already stored a pointer to it in @next.  Note we
319	 * actually traverse the tree path backwards (upwards) because @up is
320	 * the first element in the index_context structure thus doing things
321	 * this way generates faster/better machine code.
322	 */
323	for (ictx = index_ctx->up, next = ictx->up; ictx != index_ctx;
324			ictx = next, next = next->up) {
325		/*
326		 * Disconnect the current index context from the tree and
327		 * release it and all its resources.
328		 */
329		ntfs_index_ctx_put_single(ictx);
330	}
331	/* Reuse the only remaining, bottom entry. */
332	ntfs_index_ctx_put_reuse_single(index_ctx);
333}
334
335/**
336 * ntfs_index_get_entries - get the entries for the index node into the context
337 * @ictx:	index context for which to get the entries
338 *
339 * Loop through the entries in the index node described by @ictx and gather all
340 * the index entries into the @ictx->entries array which is also allocated by
341 * this function.
342 *
343 * Return 0 on success and errno on error.
344 *
345 * Locking: - Caller must hold @ictx->idx_ni->lock on the index inode.
346 *	    - The index context @ictx must be locked.
347 */
static errno_t ntfs_index_get_entries(ntfs_index_context *ictx)
{
	u8 *index_end;
	INDEX_HEADER *index;
	INDEX_ENTRY *ie, **entries;
	unsigned nr_entries, max_entries;
	errno_t err;
	BOOL is_view;

	ntfs_debug("Entering.");
	if (!ictx->is_locked)
		panic("%s(): !ictx->is_locked\n", __FUNCTION__);
	/* Any index other than the $I30 directory index is a view index and
	   gets additional per-entry validation below. */
	is_view = FALSE;
	if (ictx->idx_ni->name != I30)
		is_view = TRUE;
	nr_entries = 0;
	max_entries = ictx->max_entries;
	/* Allocate memory for the index entry pointers in the index node. */
	entries = OSMalloc(max_entries * sizeof(INDEX_ENTRY*), ntfs_malloc_tag);
	if (!entries) {
		ntfs_error(ictx->idx_ni->vol->mp, "Failed to allocate index "
				"entry pointer array.");
		return ENOMEM;
	}
	index = ictx->index;
	/* End of the valid (in use) portion of the index node. */
	index_end = (u8*)index + le32_to_cpu(index->index_length);
	/* The first index entry. */
	ie = (INDEX_ENTRY*)((u8*)index + le32_to_cpu(index->entries_offset));
	/*
	 * Loop until we exceed valid memory (corruption case) or until we
	 * reach the last entry and for each entry place a pointer to it into
	 * our array of entry pointers.
	 */
	for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) {
		/* Bounds checks.  Note a zero @ie->length is caught here by
		   the key_length/length consistency check, preventing an
		   endless loop on corrupt data. */
		if ((u8*)ie < (u8*)index || (u8*)ie +
				sizeof(INDEX_ENTRY_HEADER) > index_end ||
				(u8*)ie + le16_to_cpu(ie->length) > index_end ||
				(u32)sizeof(INDEX_ENTRY_HEADER) +
				le16_to_cpu(ie->key_length) >
				le16_to_cpu(ie->length))
			goto err;
		/* Add this entry to the array of entry pointers.  The array
		   was sized for the worst case so overflow is a bug. */
		if (nr_entries >= max_entries)
			panic("%s(): nr_entries >= max_entries\n",
					__FUNCTION__);
		entries[nr_entries++] = ie;
		/* The end entry (which carries no key) terminates the node. */
		if (ie->flags & INDEX_ENTRY_END)
			break;
		/* Further bounds checks for view indexes. */
		if (is_view && ((u32)sizeof(INDEX_ENTRY_HEADER) +
				le16_to_cpu(ie->key_length) >
				le16_to_cpu(ie->data_offset) ||
				(u32)le16_to_cpu(ie->data_offset) +
				le16_to_cpu(ie->data_length) >
				le16_to_cpu(ie->length)))
			goto err;
	}
	/* Except for the index root, leaf nodes are not allowed to be empty. */
	if (nr_entries < 2 && !ictx->is_root && !(index->flags & INDEX_NODE)) {
		ntfs_error(ictx->idx_ni->vol->mp, "Illegal empty leaf node "
				"found in index.");
		err = EIO;
		goto err;
	}
	/* Hand the gathered pointers over to the index context. */
	ictx->entries = entries;
	ictx->nr_entries = nr_entries;
	ntfs_debug("Done.");
	return 0;
err:
	OSFree(entries, max_entries * sizeof(INDEX_ENTRY*), ntfs_malloc_tag);
	ntfs_error(ictx->idx_ni->vol->mp, "Corrupt index in inode 0x%llx.  "
			"Run chkdsk.",
			(unsigned long long)ictx->idx_ni->mft_no);
	NVolSetErrors(ictx->idx_ni->vol);
	return EIO;
}
425
426/**
427 * ntfs_index_lookup_init - prepare an index context for a lookup
428 * @ictx:	[IN] index context to prepare
429 * @key_len:	[IN] size of the key ntfs_index_lookup() is called with in bytes
430 *
431 * Prepare the index context @ictx for a call to ntfs_index_lookup() or
432 * ntfs_index_lookup_by_position().
433 *
434 * @key_len is the length of the key ntfs_index_lookup() will be called with.
435 * If the index @ictx is a directory index this is ignored.
436 *
437 * Return 0 on success and errno on error.
438 */
439static errno_t ntfs_index_lookup_init(ntfs_index_context *ictx,
440		const int key_len)
441{
442	ntfs_inode *idx_ni;
443	MFT_RECORD *m;
444	ntfs_attr_search_ctx *actx;
445	INDEX_ROOT *ir;
446	unsigned min_entry_size, max_entries;
447	errno_t err;
448
449	ntfs_debug("Entering.");
450	if (!ictx->base_ni)
451		panic("%s(): !ictx->base_ni\n", __FUNCTION__);
452	idx_ni = ictx->idx_ni;
453	if (idx_ni->type != AT_INDEX_ALLOCATION)
454		panic("%s(): idx_ni->type != AT_INDEX_ALLOCATION\n",
455				__FUNCTION__);
456	/*
457	 * Ensure the index context is still uninitialized, i.e. it is not
458	 * legal to call ntfs_index_lookup*() with a search context that has
459	 * been used already.
460	 */
461	if (ictx->up != ictx || ictx->max_entries)
462		panic("%s(): Called for already used index context.\n",
463				__FUNCTION__);
464	if (!ntfs_is_collation_rule_supported(idx_ni->collation_rule)) {
465		ntfs_error(idx_ni->vol->mp, "Index uses unsupported collation "
466				"rule 0x%x.  Aborting.",
467				le32_to_cpu(idx_ni->collation_rule));
468		return ENOTSUP;
469	}
470	/* Get hold of the mft record for the index inode. */
471	err = ntfs_mft_record_map(ictx->base_ni, &m);
472	if (err) {
473		ntfs_error(idx_ni->vol->mp, "Failed to map mft record (error "
474				"%d.", err);
475		return err;
476	}
477	actx = ntfs_attr_search_ctx_get(ictx->base_ni, m);
478	if (!actx) {
479		err = ENOMEM;
480		goto err;
481	}
482	/* Find the index root attribute in the mft record. */
483	err = ntfs_attr_lookup(AT_INDEX_ROOT, idx_ni->name, idx_ni->name_len,
484			0, NULL, 0, actx);
485	if (err) {
486		if (err == ENOENT) {
487			ntfs_error(idx_ni->vol->mp, "Index root attribute "
488					"missing in inode 0x%llx.  Inode is "
489					"corrupt.  Run chkdsk.",
490					(unsigned long long)idx_ni->mft_no);
491			/*
492			 * This will cause the returned error to be EIO and the
493			 * volume to be marked as containing errors.
494			 */
495			err = 0;
496		}
497		goto err;
498	}
499	/*
500	 * The entry size is made up of the index entry header plus the index
501	 * key plus the index data plus optionally the sub-node vcn pointer.
502	 *
503	 * For most index types the index key is constant size and is simply
504	 * given by the caller supplied @key_len.
505	 *
506	 * The only collation types with variable sized keys are
507	 * COLLATION_FILENAME and COLLATION_NTOFS_SID.
508	 *
509	 * Determining the index data size is more complicated than that so we
510	 * simply ignore it.  This means we waste some memory but it is not too
511	 * bad as only directory indexes appear more than once on a volume thus
512	 * only they can have more than one lookup in progress at any one time.
513	 */
514	min_entry_size = sizeof(INDEX_ENTRY_HEADER) + sizeof(FILENAME_ATTR);
515	if (idx_ni->collation_rule != COLLATION_FILENAME) {
516		if (idx_ni->collation_rule == COLLATION_NTOFS_SID)
517			min_entry_size = sizeof(INDEX_ENTRY_HEADER) +
518					sizeof(SID);
519		else
520			min_entry_size = sizeof(INDEX_ENTRY_HEADER) + key_len;
521	}
522	/*
523	 * Work out the absolute maximum number of entries there can be in an
524	 * index allocation block.  We add one for the end entry which does not
525	 * contain a key.
526	 */
527	max_entries = 1 + ((idx_ni->block_size - sizeof(INDEX_BLOCK) -
528			sizeof(INDEX_ENTRY_HEADER)) / min_entry_size);
529	/*
530	 * Should the mft record size exceed the size of an index block (this
531	 * should never really happen) then calculate the maximum number of
532	 * entries there can be in the index root and if they are more than the
533	 * ones in the index block use that as the maximum value.
534	 */
535	if (idx_ni->vol->mft_record_size > idx_ni->block_size) {
536		unsigned max_root_entries;
537
538		max_root_entries = 1 + ((idx_ni->vol->mft_record_size -
539				le16_to_cpu(m->attrs_offset) -
540				offsetof(ATTR_RECORD, reservedR) -
541				sizeof(((ATTR_RECORD*)NULL)->reservedR) -
542				sizeof(INDEX_ROOT) -
543				sizeof(INDEX_ENTRY_HEADER)) / min_entry_size);
544		if (max_root_entries > max_entries)
545			max_entries = max_root_entries;
546	}
547	ictx->max_entries = max_entries;
548	/*
549	 * Get to the index root value (it has been verified when the inode was
550	 * read in ntfs_index_inode_read()).
551	 */
552	ir = (INDEX_ROOT*)((u8*)actx->a + le16_to_cpu(actx->a->value_offset));
553	ictx->index = &ir->index;
554	/*
555	 * Gather the index entry pointers and finish setting up the index
556	 * context.
557	 */
558	ictx->is_root = 1;
559	ictx->is_locked = 1;
560	err = ntfs_index_get_entries(ictx);
561	if (err) {
562		ictx->is_locked = 0;
563		goto err;
564	}
565	ictx->ir = ir;
566	ictx->actx = actx;
567	ictx->bytes_free = le32_to_cpu(actx->m->bytes_allocated) -
568			le32_to_cpu(actx->m->bytes_in_use);
569	ntfs_debug("Done.");
570	return 0;
571err:
572	if (actx)
573		ntfs_attr_search_ctx_put(actx);
574	ntfs_mft_record_unmap(ictx->base_ni);
575	ntfs_error(idx_ni->vol->mp, "Failed (error %d).", err);
576	return err;
577}
578
579/**
580 * ntfs_index_descend_into_child_node - index node whose child node to return
581 * @index_ctx:	pointer to index context whose child node to return
582 *
583 * Descend into the child node pointed to by (*@index_ctx)->entry and return
584 * its fully set up index context in *@index_ctx.
585 *
586 * Return 0 on success and errno on error.
587 *
588 * Locking: - Caller must hold @ictx->idx_ni->lock on the index inode.
589 *	    - The index context @ictx must be locked.
590 */
static errno_t ntfs_index_descend_into_child_node(
		ntfs_index_context **index_ctx)
{
	VCN vcn;
	s64 ofs;
	ntfs_index_context *ictx, *new_ictx;
	ntfs_inode *idx_ni;
	upl_t upl;
	upl_page_info_array_t pl;
	u8 *addr;
	INDEX_ALLOCATION *ia;
	errno_t err = 0;
	int is_dirty = 0;
	static const char es[] = "%s.  Inode 0x%llx is corrupt.  Run chkdsk.";

	ntfs_debug("Entering.");
	if (!index_ctx)
		panic("%s(): !index_ctx\n", __FUNCTION__);
	ictx = *index_ctx;
	if (!ictx)
		panic("%s(): !ictx\n", __FUNCTION__);
	idx_ni = ictx->idx_ni;
	if (!ictx->is_locked)
		panic("%s(): !ictx->is_locked\n", __FUNCTION__);
	if (!(ictx->entry->flags & INDEX_ENTRY_NODE))
		panic("%s(): !(ictx->entry->flags & INDEX_ENTRY_NODE)\n",
				__FUNCTION__);
	/*
	 * Since INDEX_NODE == LARGE_INDEX this check is ok for the index root
	 * as well.
	 */
	if (!(ictx->index->flags & INDEX_NODE)) {
		ntfs_error(idx_ni->vol->mp, es, "Index entry with child node "
				"found in a leaf node",
				(unsigned long long)idx_ni->mft_no);
		goto err;
	}
	/* Get the starting vcn of the child index block to descend into.
	   The sub-node VCN is stored in the last eight bytes of the entry. */
	vcn = sle64_to_cpup((sle64*)((u8*)ictx->entry +
			le16_to_cpu(ictx->entry->length) - sizeof(VCN)));
	if (vcn < 0) {
		ntfs_error(idx_ni->vol->mp, es, "Negative child node VCN",
				(unsigned long long)idx_ni->mft_no);
		goto err;
	}
	/* Determine the offset of the page containing the child index block. */
	ofs = (vcn << idx_ni->vcn_size_shift) & ~PAGE_MASK_64;
	/*
	 * If the entry whose sub-node we are descending into is in the index
	 * root, release the index root unlocking its node or we can deadlock
	 * with ntfs_page_map().
	 */
	if (ictx->is_root) {
		/*
		 * As a sanity check verify that the index allocation attribute
		 * exists.
		 */
		if (!NInoIndexAllocPresent(idx_ni)) {
			ntfs_error(idx_ni->vol->mp, "No index allocation "
					"attribute but index root entry "
					"requires one.  Inode 0x%llx is "
					"corrupt.  Run chkdsk.",
					(unsigned long long)idx_ni->mft_no);
			goto err;
		}
		ntfs_index_ctx_unlock(ictx);
	} else /* if (!ictx->is_root) */ {
		/*
		 * If @vcn is in the same VM page as the existing page we reuse
		 * the mapped page, otherwise we release the page so we can get
		 * the new one.
		 */
		upl = ictx->upl;
		pl = ictx->pl;
		addr = ictx->addr;
		is_dirty = ictx->is_dirty;
		/* Ownership of the page moves to the local variables (and
		   ultimately to the new child context below). */
		ictx->upl = NULL;
		ictx->is_dirty = 0;
		ictx->is_locked = 0;
		if (ofs == ictx->upl_ofs)
			goto have_page;
		ntfs_page_unmap(idx_ni, upl, pl, is_dirty);
		is_dirty = 0;
	}
	/* We did not reuse the old page, get the new page. */
	err = ntfs_page_map(idx_ni, ofs, &upl, &pl, &addr, TRUE);
	if (err) {
		ntfs_error(idx_ni->vol->mp, "Failed to map index page (error "
				"%d).", err);
		goto err;
	}
have_page:
	/* Get to the index allocation block. */
	ia = (INDEX_ALLOCATION*)(addr + ((unsigned)(vcn <<
			idx_ni->vcn_size_shift) & PAGE_MASK));
	/* Bounds checks. */
	if ((u8*)ia < addr || (u8*)ia > addr + PAGE_SIZE) {
		ntfs_error(idx_ni->vol->mp, es, "Out of bounds check failed",
				(unsigned long long)idx_ni->mft_no);
		goto unm_err;
	}
	/* Catch multi sector transfer fixup errors. */
	if (!ntfs_is_indx_record(ia->magic)) {
		ntfs_error(idx_ni->vol->mp, "Index record with VCN 0x%llx is "
				"corrupt.  Inode 0x%llx is corrupt.  Run "
				"chkdsk.", (unsigned long long)vcn,
				(unsigned long long)idx_ni->mft_no);
		goto unm_err;
	}
	/* The self-referential VCN in the block must match the one we used
	   to locate it. */
	if (sle64_to_cpu(ia->index_block_vcn) != vcn) {
		ntfs_error(idx_ni->vol->mp, "Actual VCN (0x%llx) of index "
				"buffer is different from expected VCN "
				"(0x%llx).  Inode 0x%llx is corrupt.  Run "
				"chkdsk.", (unsigned long long)
				sle64_to_cpu(ia->index_block_vcn),
				(unsigned long long)vcn,
				(unsigned long long)idx_ni->mft_no);
		goto unm_err;
	}
	/* The recorded block size must match the size the index declares. */
	if (offsetof(INDEX_BLOCK, index) + le32_to_cpu(
			ia->index.allocated_size) != idx_ni->block_size) {
		ntfs_error(idx_ni->vol->mp, "Index buffer (VCN 0x%llx) of "
				"inode 0x%llx has a size (%u) differing from "
				"the index specified size (%u).  Inode is "
				"corrupt.  Run chkdsk.",
				(unsigned long long)vcn,
				(unsigned long long)idx_ni->mft_no,
				(unsigned)offsetof(INDEX_BLOCK, index) +
				le32_to_cpu(ia->index.allocated_size),
				(unsigned)idx_ni->block_size);
		goto unm_err;
	}
	if ((u8*)ia + idx_ni->block_size > addr + PAGE_SIZE)
		panic("%s(): (u8*)ia + idx_ni->block_size > kaddr + "
				"PAGE_SIZE\n", __FUNCTION__);
	/* The in-use portion of the node must fit inside the block. */
	if ((u8*)&ia->index + le32_to_cpu(ia->index.index_length) >
			(u8*)ia + idx_ni->block_size) {
		ntfs_error(idx_ni->vol->mp, "Size of index buffer (VCN "
				"0x%llx) of inode 0x%llx exceeds maximum "
				"size.", (unsigned long long)vcn,
				(unsigned long long)idx_ni->mft_no);
		goto unm_err;
	}
	/* Allocate a new index context. */
	new_ictx = ntfs_index_ctx_alloc();
	if (!new_ictx) {
		err = ENOMEM;
		ntfs_error(idx_ni->vol->mp, "Failed to allocate index "
				"context.");
		goto unm_err;
	}
	/*
	 * Attach the new index context between the current index context
	 * (which is the bottom of the tree) and the index context below it
	 * (which is the top of the tree), i.e. place the new index context at
	 * the bottom of the tree.
	 *
	 * Gcc is broken so it fails to see both the members of the anonymous
	 * union(s) and the bitfield inside the C99 initializer.  Work around
	 * this by setting @is_locked, @is_dirty, @ia, and @vcn afterwards.
	 */
	*new_ictx = (ntfs_index_context) {
		.up = ictx,
		.down = ictx->down,
		.idx_ni = idx_ni,
		.base_ni = ictx->base_ni,
		.index = &ia->index,
		.bytes_free = le32_to_cpu(ia->index.allocated_size) -
				le32_to_cpu(ia->index.index_length),
		.max_entries = ictx->max_entries,
	};
	new_ictx->is_locked = 1;
	new_ictx->is_dirty = is_dirty;
	new_ictx->ia = ia;
	new_ictx->vcn = vcn;
	new_ictx->upl_ofs = ofs;
	new_ictx->upl = upl;
	new_ictx->pl = pl;
	new_ictx->addr = addr;
	ictx->down->up = new_ictx;
	ictx->down = new_ictx;
	/*
	 * Gather the index entry pointers and finish setting up the new index
	 * context.
	 */
	err = ntfs_index_get_entries(new_ictx);
	if (!err) {
		*index_ctx = new_ictx;
		ntfs_debug("Done.");
		return 0;
	}
	/* Strip the page from the failed child context before unmapping it
	   below so it is not released twice. */
	new_ictx->is_locked = 0;
	new_ictx->is_dirty = 0;
	new_ictx->upl = NULL;
unm_err:
	ntfs_page_unmap(idx_ni, upl, pl, is_dirty);
err:
	/* A zero @err means corruption was detected: mark the volume and
	   report EIO. */
	if (!err) {
		NVolSetErrors(idx_ni->vol);
		err = EIO;
	}
	return err;
}
794
795/**
796 * ntfs_index_lookup_in_node - search for an entry in an index node
797 * @ictx:		index context in which to search for the entry
798 * @match_key:		index entry key data to search for
799 * @match_key_len:	length of @match_key in bytes
800 * @key:		index entry key to search for
801 * @key_len:		length of @key in bytes
802 *
803 * Perform a binary search through the index entries in the index node
804 * described by @ictx looking for the correct entry or if not found for the
805 * index entry whose sub-node pointer to descend into.
806 *
807 * @key and @key_len is the complete key to search for.  This is used when
808 * doing the full blown collation.
809 *
810 * When doing exact matching for filenames we need to compare the actual
811 * filenames rather than the filename attributes whereas for view indexes we
812 * need to compare the whole key.  To make this function simpler we let the
813 * caller specify the appropriate data to use for exact matching in @match_key
814 * and @match_key_len.  For view indexes @match_key and @match_key_len are the
815 * same as @key and @key_len respectively.
816 *
817 * Return 0 on success and errno on error.
818 *
819 * Locking: - Caller must hold @ictx->idx_ni->lock on the index inode.
820 *	    - The index context @ictx must be locked.
821 */
822static errno_t ntfs_index_lookup_in_node(ntfs_index_context *ictx,
823		const void *match_key, const int match_key_len,
824		const void *key, const int key_len)
825{
826	ntfs_inode *idx_ni;
827	INDEX_ENTRY *ie, **entries;
828	unsigned min_left, max_right, cur_entry;
829	int rc;
830	BOOL is_view;
831
832	ntfs_debug("Entering.");
833	idx_ni = ictx->idx_ni;
834	is_view = FALSE;
835	if (idx_ni->name != I30)
836		is_view = TRUE;
837	entries = ictx->entries;
838	/*
839	 * If there is only one entry, i.e. the index node is empty, we need to
840	 * descend into the sub-node pointer of the end entry if present thus
841	 * return the end entry.
842	 */
843	if (ictx->nr_entries == 1) {
844		cur_entry = 0;
845		goto not_found;
846	}
847	/*
848	 * Now do a binary search through the index entries looking for the
849	 * correct entry or if not found for the index entry whose sub-node
850	 * pointer to descend into.
851	 *
852	 * Note we exclude the end entry from the search as it does not include
853	 * a key we can compare.  That makes the search algorithm simpler and
854	 * more efficient.
855	 */
856	min_left = 0;
857	max_right = ictx->nr_entries - 2;
858	cur_entry = max_right / 2;
859	do {
860		void *ie_match_key;
861		int ie_match_key_len;
862
863		ie = entries[cur_entry];
864		if (!is_view) {
865			ie_match_key_len = ie->key.filename.filename_length <<
866					NTFSCHAR_SIZE_SHIFT;
867			ie_match_key = &ie->key.filename.filename;
868		} else {
869			ie_match_key_len = le16_to_cpu(ie->key_length);
870			ie_match_key = &ie->key;
871		}
872		/*
873		 * If the keys match perfectly, we setup @ictx to point to the
874		 * matching entry and return.
875		 */
876		if ((match_key_len == ie_match_key_len) &&
877				!bcmp(match_key, ie_match_key,
878				ie_match_key_len)) {
879found:
880			ictx->entry = ie;
881			ictx->entry_nr = cur_entry;
882			ictx->is_match = 1;
883			ntfs_debug("Done (found).");
884			return 0;
885		}
886		/*
887		 * Not a perfect match, need to do full blown collation so we
888		 * know which way in the B+tree we have to go.
889		 */
890		rc = ntfs_collate(idx_ni->vol, idx_ni->collation_rule, key,
891				key_len, &ie->key, le16_to_cpu(ie->key_length));
892		/*
893		 * If @key collates before the key of the current entry, need
894		 * to search on the left.
895		 */
896		if (rc == -1) {
897			/*
898			 * If this is the left-most entry we need to descend
899			 * into it if it has a sub-node.
900			 */
901			if (cur_entry == min_left)
902				break;
903			/* Exclude the wrong half or remaining entries. */
904			max_right = cur_entry - 1;
905			/* Find the middle of the remaining entries. */
906			cur_entry = (min_left + max_right) / 2;
907			continue;
908		}
909		/*
910		 * A match should never happen as the bcmp() call should have
911		 * caught it, but we still treat it correctly.
912		 */
913		if (!rc)
914			goto found;
915		/*
916		 * @key collates after the key of the current entry, need to
917		 * search on the right.
918		 *
919		 * If this is the right-most entry we need to descend into the
920		 * end entry on the right if it has a sub-node.
921		 */
922		if (cur_entry == max_right) {
923			cur_entry++;
924			break;
925		}
926		/* Exclude the wrong half or remaining entries. */
927		min_left = cur_entry + 1;
928		/* Find the middle of the remaining entries. */
929		cur_entry = (min_left + max_right) / 2;
930	} while (1);
931not_found:
932	ictx->follow_entry = entries[cur_entry];
933	ictx->entry_nr = cur_entry;
934	ntfs_debug("Done (not found).");
935	return ENOENT;
936}
937
938/**
939 * ntfs_index_lookup - find a key in an index and return its index entry
940 * @key:	[IN] key for which to search in the index
941 * @key_len:	[IN] length of @key in bytes
942 * @index_ctx:	[IN/OUT] context describing the index and the returned entry
943 *
944 * Before calling ntfs_index_lookup(), *@index_ctx must have been obtained
945 * from a call to ntfs_index_ctx_get().
946 *
947 * Look for the @key in the index specified by the index lookup context
948 * @index_ctx.  ntfs_index_lookup() walks the contents of the index looking for
949 * the @key.  As it does so, it records the path taken through the B+tree
950 * together with other useful information it obtains along the way.  This is
951 * needed if the lookup is going to be followed by a complex index tree
952 * operation such as an index entry addition requiring one or more index block
953 * splits for example.
954 *
955 * If the @key is found in the index, 0 is returned and *@index_ctx is set up
956 * to describe the index entry containing the matching @key.  The matching
957 * entry is pointed to by *@index_ctx->entry.
958 *
959 * If the @key is not found in the index, ENOENT is returned and *@index_ctx is
960 * setup to describe the index entry whose key collates immediately after the
961 * search @key, i.e. this is the position in the index at which an index entry
962 * with a key of @key would need to be inserted.
963 *
964 * If an error occurs return the error code.  In this case *@index_ctx is
965 * undefined and must be freed via a call to ntfs_index_ctx_put() or
966 * reinitialized via a call to ntfs_index_ctx_put_reuse().
967 *
968 * When finished with the entry and its data, call ntfs_index_ctx_put() to free
969 * the context and other associated resources.
970 *
971 * If the index entry was modified, call ntfs_index_entry_mark_dirty() before
972 * the call to ntfs_index_ctx_put() to ensure that the changes are written to
973 * disk.
974 *
975 * Locking: - Caller must hold @index_ctx->idx_ni->lock on the index inode.
976 *	    - Caller must hold an iocount reference on the index inode.
977 *
978 * TODO: An optimization would be to take two new parameters, say @add and
979 * @del, which allow our caller to tell us if the search is for purposes of
980 * adding an entry (@add is true) or for removing an entry (@del is true) or if
981 * it is a simple lookup (read-only or overwrite without change in length, both
982 * @add and @del are false).  For the lookup case we do not need to record the
983 * path so we can just use one single index context and only one array of index
984 * entry pointers and we keep reusing both instead of getting new ones each
985 * time we descend into a sub-node.  Alternatively take a single parameter
986 * @reason or @intent or something and define some constants like NTFS_LOOKUP,
987 * NTFS_ADD, and NTFS_DEL or something to go with it to serve the same purpose
988 * as above.
989 */
errno_t ntfs_index_lookup(const void *key, const int key_len,
		ntfs_index_context **index_ctx)
{
	ntfs_index_context *ictx;
	const void *match_key;	/* Key bytes used for exact (bcmp) matching. */
	int match_key_len;	/* Length of @match_key in bytes. */
	errno_t err;

	ntfs_debug("Entering.");
	if (!key)
		panic("%s(): !key\n", __FUNCTION__);
	if (key_len <= 0)
		panic("%s(): key_len <= 0\n", __FUNCTION__);
	ictx = *index_ctx;
	/*
	 * When doing exact matching for filenames we need to compare the
	 * actual filenames rather than the filename attributes whereas for
	 * view indexes we need to compare the whole key.
	 *
	 * NOTE(review): For directory ($I30) indexes @key is therefore
	 * assumed to point at a FILENAME_ATTR -- callers must honour this.
	 */
	if (ictx->idx_ni->name == I30) {
		match_key_len = ((FILENAME_ATTR*)key)->filename_length <<
				NTFSCHAR_SIZE_SHIFT;
		match_key = &((FILENAME_ATTR*)key)->filename;
	} else {
		match_key_len = key_len;
		match_key = key;
	}
	/* Prepare the search context for its first lookup. */
	err = ntfs_index_lookup_init(ictx, key_len);
	if (err)
		goto err;
	do {
		/*
		 * Look for the @key in the current index node.  If found, the
		 * index context points to the found entry so we are done.  If
		 * not found, the index context points to the entry whose
		 * sub-node pointer we need to descend into if it is present
		 * and if not we have failed to find the entry and are also
		 * done.
		 */
		err = ntfs_index_lookup_in_node(ictx, match_key, match_key_len,
				key, key_len);
		if (err && err != ENOENT)
			panic("%s(): err && err != ENOENT\n", __FUNCTION__);
		/*
		 * Stop on an exact match (@err == 0) or when the entry to
		 * follow is a leaf entry, i.e. has no sub-node to descend
		 * into (@err == ENOENT).
		 *
		 * NOTE(review): on ENOENT the in-node lookup set
		 * @ictx->follow_entry; reading @ictx->entry here presumably
		 * relies on @entry and @follow_entry aliasing each other
		 * (e.g. a union) in ntfs_index_context -- confirm against
		 * the header.
		 */
		if (!err || !(ictx->entry->flags & INDEX_ENTRY_NODE))
			break;
		/* Not found but child node present, descend into it. */
		err = ntfs_index_descend_into_child_node(&ictx);
		if (err)
			goto err;
		/*
		 * Replace the caller's index context with the new one so the
		 * caller always gets the bottom-most index context.
		 */
		*index_ctx = ictx;
	} while (1);
	/* Here @err is 0 (entry found) or ENOENT (insertion point found). */
	ntfs_debug("Done (%s in index %s).",
			!err ? "found matching entry" : "entry not found",
			ictx->is_root ?  "root" : "allocation block");
	return err;
err:
	ntfs_error(ictx->idx_ni->vol->mp, "Failed (error %d).", err);
	return err;
}
1054
1055/**
1056 * ntfs_index_lookup_by_position - find an entry by its position in the B+tree
1057 * @pos:	[IN] position of index entry to find in the B+tree
1058 * @key_len:	[IN] size of the key ntfs_index_lookup() is called with in bytes
1059 * @index_ctx:	[IN/OUT] context describing the index and the returned entry
1060 *
1061 * Before calling ntfs_index_lookup_by_position(), *@index_ctx must have been
1062 * obtained from a call to ntfs_index_ctx_get().
1063 *
1064 * Start searching at the beginning of the B+tree, counting each of the entries
1065 * and when the entry with position number @pos has been reached return that in
1066 * *@index_ctx.
1067 *
1068 * @key_len is the length of the key ntfs_index_lookup() would be called with.
 * If the index *@index_ctx is a directory index this is ignored.
1070 *
1071 * As the search progresses, the path taken through the B+tree is recorded
1072 * together with other useful information obtained along the way.  This is
1073 * needed so the lookup can proceed to the next entry if/when desired, for
1074 * example during a ntfs_readdir() call.
1075 *
1076 * If the position @pos is found in the index, 0 is returned and *@index_ctx is
 * set up to describe the index entry at position @pos.  The
1078 * matching entry is pointed to by *@index_ctx->entry.
1079 *
1080 * If the position @pos is not found in the index, ENOENT is returned.
1081 *
1082 * If an error occurs the error code is returned (cannot be ENOENT).
1083 *
1084 * If any values other than 0 are returned, *@index_ctx is undefined and must
1085 * be freed via a call to ntfs_index_ctx_put() or reinitialized via a call to
1086 * ntfs_index_ctx_put_reuse().
1087 *
1088 * When finished with the entry and its data, call ntfs_index_ctx_put() to free
1089 * the context and other associated resources.
1090 *
1091 * If the index entry was modified, call ntfs_index_entry_mark_dirty() before
1092 * the call to ntfs_index_ctx_put() to ensure that the changes are written to
1093 * disk.
1094 *
1095 * Locking: - Caller must hold @index_ctx->idx_ni->lock on the index inode.
1096 *	    - Caller must hold an iocount reference on the index inode.
1097 */
errno_t ntfs_index_lookup_by_position(const s64 pos, const int key_len,
		ntfs_index_context **index_ctx)
{
	s64 current_pos;	/* Position of the current entry in the walk. */
	ntfs_index_context *ictx;
	errno_t err;

	ntfs_debug("Entering for position 0x%llx.", (unsigned long long)pos);
	if (pos < 0)
		panic("%s(): pos < 0\n", __FUNCTION__);
	ictx = *index_ctx;
	/* Prepare the search context for its first lookup. */
	err = ntfs_index_lookup_init(ictx, key_len);
	if (err)
		goto err;
	/* Start at the first index entry in the index root. */
	ictx->entry = ictx->entries[0];
	ictx->entry_nr = 0;
	current_pos = 0;
	/*
	 * Iterate over the entries in the B+tree counting them up until we
	 * reach entry position @pos in which case we are done.  This is an
	 * in-order walk, so the cost is proportional to @pos.
	 */
	do {
		/*
		 * Keep descending into the sub-node pointers until a leaf node
		 * is found.
		 */
		while (ictx->entry->flags & INDEX_ENTRY_NODE) {
			/* Child node present, descend into it. */
			err = ntfs_index_descend_into_child_node(&ictx);
			if (err)
				goto err;
			/* Start at the first index entry in the index node. */
			ictx->entry = ictx->entries[0];
			ictx->entry_nr = 0;
		}
		/*
		 * We have reached the next leaf node.  If @pos is in this node
		 * skip to the correct entry and we are done.
		 *
		 * Note we need the -1 because @ictx->nr_entries counts the end
		 * entry which does not contain any data thus we do not count
		 * it as a real entry.
		 */
		if (current_pos + ictx->nr_entries - 1 > pos) {
			/*
			 * No need to skip any entries if the current entry is
			 * the one matching @pos.
			 */
			if (current_pos < pos) {
				s64 nr;

				nr = ictx->entry_nr + (pos - current_pos);
				/* @pos must fall on a real entry, not the
				   end entry, or the node accounting above is
				   broken. */
				if (nr >= ictx->nr_entries - 1)
					panic("%s(): nr >= ictx->nr_entries - "
							"1\n", __FUNCTION__);
				ictx->entry_nr = nr;
				ictx->entry = ictx->entries[nr];
				current_pos = pos;
			} else if (current_pos > pos)
				panic("%s(): current_pos > pos\n",
						__FUNCTION__);
			break;
		}
		/*
		 * @pos is not in this node.  Skip the whole node, i.e. skip
		 * @ictx->nr_entries - 1 real index entries.
		 */
		current_pos += ictx->nr_entries - 1;
		/*
		 * Keep moving up the B+tree until we find an index node that
		 * we have not finished yet.  Contexts for nodes we are done
		 * with are disconnected from the recorded path and freed as
		 * we go.
		 */
		do {
			ntfs_index_context *itmp;

			/* If we are in the index root, we are done. */
			if (ictx->is_root)
				goto not_found;
			/* Save the current index context so we can free it. */
			itmp = ictx;
			/* Move up to the parent node. */
			ictx = ictx->up;
			/*
			 * Disconnect the old index context from its path and
			 * free it.
			 */
			ntfs_index_ctx_disconnect(itmp);
			ntfs_index_ctx_put_reuse_single(itmp);
			ntfs_index_ctx_free(itmp);
		} while (ictx->entry_nr == ictx->nr_entries - 1);
		/*
		 * We have reached a node which we have not finished with yet.
		 * Lock it so we can work on it.
		 *
		 * NOTE(review): nodes above the bottom-most one appear to be
		 * kept unlocked while we work lower down, hence the relock --
		 * confirm against ntfs_index_descend_into_child_node().
		 */
		err = ntfs_index_ctx_relock(ictx);
		if (err)
			goto err;
		/*
		 * Check if the current entry is the entry matching @pos and if
		 * so we are done.
		 */
		if (current_pos == pos)
			break;
		/*
		 * Move to the next entry in this node and continue descending
		 * into the sub-node pointers until a leaf node is found.  The
		 * first entry in the leaf node will be the next entry thus
		 * increment @current_pos now.
		 */
		ictx->entry = ictx->entries[++ictx->entry_nr];
		current_pos++;
	} while (1);
	/*
	 * Indicate that the current entry is the one the caller was looking
	 * for and return the index context containing it to the caller.
	 */
	ictx->is_match = 1;
	*index_ctx = ictx;
	ntfs_debug("Done (entry with position 0x%llx found in index %s).",
			(unsigned long long)pos,
			ictx->is_root ? "root" : "allocation block");
	return 0;
not_found:
	ntfs_debug("Done (entry with position 0x%llx not found in index, "
			"returning ENOENT).", (unsigned long long)pos);
	/* Ensure we return a valid index context. */
	*index_ctx = ictx;
	return ENOENT;
err:
	ntfs_error(ictx->idx_ni->vol->mp, "Failed (error %d).", err);
	/* Ensure we return a valid index context. */
	*index_ctx = ictx;
	return err;
}
1234
1235/**
1236 * ntfs_index_lookup_next - find the next entry in the B+tree
1237 * @index_ctx:	[IN/OUT] context describing the index and the returned entry
1238 *
1239 * Before calling ntfs_index_lookup_next(), *@index_ctx must have been obtained
1240 * from a call to ntfs_index_lookup() or ntfs_index_lookup_by_position().
1241 *
1242 * If the next entry is found in the index, 0 is returned and *@index_ctx is
1243 * set up to describe the next index entry.  The matching entry is pointed to
1244 * by *@index_ctx->entry.
1245 *
 * If there is no next entry in the index, ENOENT is returned.
1247 *
1248 * If an error occurs the error code is returned (cannot be ENOENT).
1249 *
1250 * If any values other than 0 are returned, *@index_ctx is undefined and must
1251 * be freed via a call to ntfs_index_ctx_put() or reinitialized via a call to
1252 * ntfs_index_ctx_put_reuse().
1253 *
1254 * When finished with the entry and its data, call ntfs_index_ctx_put() to free
1255 * the context and other associated resources.
1256 *
1257 * If the index entry was modified, call ntfs_index_entry_mark_dirty() before
1258 * the call to ntfs_index_ctx_put() to ensure that the changes are written to
1259 * disk.
1260 *
1261 * Locking: - Caller must hold @index_ctx->idx_ni->lock on the index inode.
1262 *	    - Caller must hold an iocount reference on the index inode.
1263 */
errno_t ntfs_index_lookup_next(ntfs_index_context **index_ctx)
{
	ntfs_index_context *ictx;
	errno_t err;

	ntfs_debug("Entering.");
	ictx = *index_ctx;
	if (!ictx->base_ni)
		panic("%s(): !ictx->base_ni\n", __FUNCTION__);
	/*
	 * Ensure the index context is initialized, i.e. it is not legal to
	 * call ntfs_index_lookup_next() with a search context that has not
	 * been used for a call to ntfs_index_lookup_by_position() or
	 * ntfs_index_lookup() yet.
	 */
	if (!ictx->max_entries)
		panic("%s(): Called for uninitialized index context.\n",
				__FUNCTION__);
	/*
	 * @ictx must currently point to real entry thus @ictx->nr_entries must
	 * be greater or equal to 2 as it includes the end entry which cannot
	 * contain any data.
	 */
	if (ictx->nr_entries < 2)
		panic("%s(): ictx->nr_entries < 2\n", __FUNCTION__);
	if (!(ictx->entry->flags & INDEX_ENTRY_NODE)) {
		/*
		 * The current entry is in a leaf node.
		 *
		 * If it is not the last entry in the node return the next
		 * entry in the node.  The last real entry sits at index
		 * @ictx->nr_entries - 2 (the end entry at nr_entries - 1
		 * carries no data), hence the - 2 below.
		 */
		if (ictx->entry_nr < ictx->nr_entries - 2) {
			ictx->entry = ictx->entries[++ictx->entry_nr];
			ntfs_debug("Done (same leaf node).");
			return 0;
		}
		/*
		 * The current entry is in a leaf node and it is the last entry
		 * in the node.  The next entry is the first real entry above
		 * the current node thus keep moving up the B+tree until we
		 * find a real entry.  Contexts for exhausted nodes are
		 * disconnected from the recorded path and freed as we go.
		 */
		do {
			ntfs_index_context *itmp;

			/* If we are in the index root, we are done. */
			if (ictx->is_root) {
				ntfs_debug("Done (was at last entry already, "
						"returning ENOENT).");
				/* Ensure we return a valid index context. */
				*index_ctx = ictx;
				return ENOENT;
			}
			/* Save the current index context so we can free it. */
			itmp = ictx;
			/* Move up to the parent node. */
			ictx = ictx->up;
			/*
			 * Disconnect the old index context from its path and
			 * free it.
			 */
			ntfs_index_ctx_disconnect(itmp);
			ntfs_index_ctx_put_reuse_single(itmp);
			ntfs_index_ctx_free(itmp);
		} while (ictx->entry_nr == ictx->nr_entries - 1);
		/*
		 * We have reached a node with a real index entry.  Lock it so
		 * we can work on it.
		 *
		 * NOTE(review): upper nodes on the path appear to be kept
		 * unlocked, hence the relock -- confirm against
		 * ntfs_index_descend_into_child_node().
		 */
		err = ntfs_index_ctx_relock(ictx);
		if (err)
			goto err;
		/*
		 * Indicate that the current entry is the next entry and return
		 * the index context containing it to the caller.
		 */
		ictx->is_match = 1;
		*index_ctx = ictx;
		ntfs_debug("Done (upper index node).");
		return 0;
	}
	/*
	 * The current entry is in an index node.
	 *
	 * To find the next entry we need to switch to the next entry in this
	 * node and then descend into the sub-node pointers until a leaf node
	 * is found.  The first entry of that leaf node is the next entry.
	 *
	 * First, unmark this node as being the matching one and switch to the
	 * next entry in this node.
	 */
	ictx->is_match = 0;
	ictx->entry = ictx->entries[++ictx->entry_nr];
	/*
	 * Keep descending into the sub-node pointers until a leaf node is
	 * found.
	 */
	while (ictx->entry->flags & INDEX_ENTRY_NODE) {
		/* Child node present, descend into it. */
		err = ntfs_index_descend_into_child_node(&ictx);
		if (err)
			goto err;
		/* Start at the first index entry in the index node. */
		ictx->entry = ictx->entries[0];
		ictx->entry_nr = 0;
	}
	/*
	 * We have reached the next leaf node.  The next entry is the first
	 * entry in this node which we have already set up to be the current
	 * entry.
	 *
	 * Indicate that the current entry is the one the caller was looking
	 * for and return the index context containing it to the caller.
	 */
	ictx->is_match = 1;
	*index_ctx = ictx;
	ntfs_debug("Done (next leaf node).");
	return 0;
err:
	ntfs_error(ictx->idx_ni->vol->mp, "Failed (error %d).", err);
	/* Ensure we return a valid index context. */
	*index_ctx = ictx;
	return err;
}
1389
1390/**
1391 * ntfs_index_entry_mark_dirty - mark an index entry dirty
1392 * @ictx:	ntfs index context describing the index entry
1393 *
1394 * Mark the index entry described by the index entry context @ictx dirty.
1395 *
1396 * If the index entry is in the index root attribute, simply mark the mft
1397 * record containing the index root attribute dirty.  This ensures the mft
1398 * record, and hence the index root attribute, will be written out to disk
1399 * later.
1400 *
1401 * If the index entry is in an index block belonging to the index allocation
1402 * attribute, mark the page the index block is in dirty.
1403 *
1404 * Locking: - Caller must hold @ictx->idx_ni->lock on the index inode for
1405 *	      writing.
1406 *	    - The index context @ictx must be locked.
1407 */
1408void ntfs_index_entry_mark_dirty(ntfs_index_context *ictx)
1409{
1410	if (!ictx->is_locked)
1411		panic("%s(): Index context is not locked.\n", __FUNCTION__);
1412	if (ictx->is_root) {
1413		if (!ictx->actx)
1414			panic("%s(): Attribute context is missing from index "
1415					"context.\n", __FUNCTION__);
1416		NInoSetMrecNeedsDirtying(ictx->actx->ni);
1417	} else {
1418		if (!ictx->upl)
1419			panic("%s(): Page list is missing from index "
1420					"context.\n", __FUNCTION__);
1421		ictx->is_dirty = 1;
1422	}
1423}
1424
1425/**
1426 * ntfs_insert_index_allocation_attribute - add the index allocation attribute
1427 * @ni:		base ntfs inode to which the attribute is being added
1428 * @ctx:	search context describing where to insert the attribute
1429 * @idx_ni:	index inode for which to add the index allocation attribute
1430 *
1431 * Insert a new index allocation attribute in the base ntfs inode @ni at the
1432 * position indicated by the attribute search context @ctx and add an attribute
1433 * list attribute entry for it if the inode uses the attribute list attribute.
1434 *
1435 * The new attribute type and name are given by the index inode @idx_ni.
1436 *
1437 * If @idx_ni contains a runlist, it is used to create the mapping pairs array
1438 * for the non-resident index allocation attribute.
1439 *
1440 * Return 0 on success and errno on error.
1441 *
1442 * WARNING: Regardless of whether success or failure is returned, you need to
1443 *	    check @ctx->is_error and if true the @ctx is no longer valid, i.e.
1444 *	    you need to either call ntfs_attr_search_ctx_reinit() or
1445 *	    ntfs_attr_search_ctx_put() on it.  In that case @ctx->error will
1446 *	    give the error code for why the mapping of the inode failed.
1447 *
1448 * Locking: Caller must hold @idx_ni->lock on the index inode for writing.
1449 */
1450static errno_t ntfs_insert_index_allocation_attribute(ntfs_inode *ni,
1451		ntfs_attr_search_ctx *ctx, ntfs_inode *idx_ni)
1452{
1453	ntfs_volume *vol;
1454	MFT_RECORD *base_m, *m;
1455	ATTR_RECORD *a;
1456	ATTR_LIST_ENTRY *al_entry;
1457	unsigned mp_ofs, mp_size, al_entry_used, al_entry_len;
1458	unsigned new_al_size, new_al_alloc;
1459	errno_t err;
1460	BOOL al_entry_added;
1461
1462	ntfs_debug("Entering for mft_no 0x%llx, name_len 0x%x.",
1463			(unsigned long long)ni->mft_no, idx_ni->name_len);
1464	vol = idx_ni->vol;
1465	/*
1466	 * Calculate the offset into the new attribute at which the mapping
1467	 * pairs array begins.  The mapping pairs array is placed after the
1468	 * name aligned to an 8-byte boundary which in turn is placed
1469	 * immediately after the non-resident attribute record itself.
1470	 */
1471	mp_ofs = offsetof(ATTR_RECORD, compressed_size) +
1472			(((idx_ni->name_len << NTFSCHAR_SIZE_SHIFT) + 7) & ~7);
1473	/* Work out the size for the mapping pairs array. */
1474	err = ntfs_get_size_for_mapping_pairs(vol,
1475			idx_ni->rl.elements ? idx_ni->rl.rl : NULL, 0, -1,
1476			&mp_size);
1477	if (err)
1478		panic("%s(): err\n", __FUNCTION__);
1479	/*
1480	 * Work out the size for the attribute record.  We simply take the
1481	 * offset to the mapping pairs array we worked out above and add the
1482	 * size of the mapping pairs array in bytes aligned to an 8-byte
1483	 * boundary.  Note we do not need to do the alignment as
1484	 * ntfs_attr_record_make_space() does it anyway.
1485	 *
1486	 * The current implementation of ntfs_attr_lookup() will always return
1487	 * pointing into the base mft record when an attribute is not found.
1488	 */
1489	base_m = ctx->m;
1490retry:
1491	if (ni != ctx->ni)
1492		panic("%s(): ni != ctx->ni\n", __FUNCTION__);
1493	m = ctx->m;
1494	a = ctx->a;
1495	err = ntfs_attr_record_make_space(m, a, mp_ofs + mp_size);
1496	if (err) {
1497		ntfs_inode *eni;
1498
1499		if (err != ENOSPC)
1500			panic("%s(): err != ENOSPC\n", __FUNCTION__);
1501		/*
1502		 * There was not enough space in the mft record to insert the
1503		 * new attribute record which means we will need to insert it
1504		 * into an extent mft record.
1505		 *
1506		 * To avoid bugs and impossible situations, check that the
1507		 * attribute is not already the only attribute in the mft
1508		 * record otherwise moving it would not give us anything.
1509		 */
1510		if (ntfs_attr_record_is_only_one(m, a))
1511			panic("%s(): ntfs_attr_record_is_only_one()\n",
1512					__FUNCTION__);
1513		/*
1514		 * Before we can allocate an extent mft record, we need to
1515		 * ensure that the inode has an attribute list attribute.
1516		 */
1517		if (!NInoAttrList(ni)) {
1518			err = ntfs_attr_list_add(ni, m, NULL);
1519			if (err) {
1520				ntfs_error(vol->mp, "Failed to add attribute "
1521						"list attribute to mft_no "
1522						"0x%llx (error %d).",
1523						(unsigned long long)ni->mft_no,
1524						err);
1525				return err;
1526			}
1527			/*
1528			 * Adding the attribute list attribute may have
1529			 * generated enough space in the base mft record to
1530			 * fit the attribute so try again.
1531			 */
1532			ntfs_attr_search_ctx_reinit(ctx);
1533			err = ntfs_attr_lookup(AT_INDEX_ALLOCATION,
1534					idx_ni->name, idx_ni->name_len, 0,
1535					NULL, 0, ctx);
1536			if (err == ENOENT) {
1537				/*
1538				 * The current implementation of
1539				 * ntfs_attr_lookup() will always return
1540				 * pointing into the base mft record when an
1541				 * attribute is not found.
1542				 */
1543				if (m != ctx->m)
1544					panic("%s(): m != ctx->m\n",
1545							__FUNCTION__);
1546				goto retry;
1547			}
1548			/*
1549			 * We cannot have found the attribute as we have
1550			 * exclusive access and know that it does not exist
1551			 * already.
1552			 */
1553			if (!err)
1554				panic("%s() !err\n", __FUNCTION__);
1555			/*
1556			 * Something has gone wrong.  Note we have to bail out
1557			 * as a failing attribute lookup indicates corruption
1558			 * and/or disk failure and/or not enough memory all of
1559			 * which would prevent us from rolling back the
1560			 * attribute list attribute addition.
1561			 */
1562			ntfs_error(vol->mp, "Failed to add index allocation "
1563					"attribute to mft_no 0x%llx because "
1564					"looking up the attribute failed "
1565					"(error %d).",
1566					(unsigned long long)ni->mft_no, err);
1567			return err;
1568		}
1569		/*
1570		 * We now need to allocate a new extent mft record, attach it
1571		 * to the base ntfs inode and set up the search context to
1572		 * point to it, then insert the new attribute into it.
1573		 */
1574		err = ntfs_mft_record_alloc(vol, NULL, NULL, ni, &eni, &m, &a);
1575		if (err) {
1576			ntfs_error(vol->mp, "Failed to add index allocation "
1577					"attribute to mft_no 0x%llx because "
1578					"allocating a new extent mft record "
1579					"failed (error %d).",
1580					(unsigned long long)ni->mft_no, err);
1581			/*
1582			 * TODO: If we added the attribute list attribute above
1583			 * we could now remove it again but this may require
1584			 * moving attributes back into the base mft record so
1585			 * is not a trivial amount of work and in the end it
1586			 * does not really matter if we leave an inode with an
1587			 * attribute list attribute that does not really need
1588			 * it.  Especially so since this is error handling and
1589			 * it is not an error to have an attribute list
1590			 * attribute when not strictly required.
1591			 */
1592			return err;
1593		}
1594		ctx->ni = eni;
1595		ctx->m = m;
1596		ctx->a = a;
1597		/*
1598		 * Make space for the new attribute.  This cannot fail as we
1599		 * now have an empty mft record which by definition can hold
1600		 * a maximum size resident attribute record.
1601		 */
1602		err = ntfs_attr_record_make_space(m, a, mp_ofs + mp_size);
1603		if (err)
1604			panic("%s(): err (ntfs_attr_record_make_space())\n",
1605					__FUNCTION__);
1606	}
1607	/*
1608	 * Now setup the new attribute record.  The entire attribute has been
1609	 * zeroed and the length of the attribute record has been set up by
1610	 * ntfs_attr_record_make_space().
1611	 */
1612	a->type = AT_INDEX_ALLOCATION;
1613	a->non_resident = 1;
1614	a->name_length = idx_ni->name_len;
1615	a->name_offset = const_cpu_to_le16(offsetof(ATTR_RECORD,
1616			compressed_size));
1617	a->instance = m->next_attr_instance;
1618	/*
1619	 * Increment the next attribute instance number in the mft record as we
1620	 * consumed the old one.
1621	 */
1622	m->next_attr_instance = cpu_to_le16(
1623			(le16_to_cpu(m->next_attr_instance) + 1) & 0xffff);
1624	a->highest_vcn = cpu_to_sle64((idx_ni->allocated_size >>
1625			vol->cluster_size_shift) - 1);
1626	a->mapping_pairs_offset = cpu_to_le16(mp_ofs);
1627	lck_spin_lock(&idx_ni->size_lock);
1628	a->allocated_size = cpu_to_sle64(idx_ni->allocated_size);
1629	a->data_size = cpu_to_sle64(idx_ni->data_size);
1630	a->initialized_size = cpu_to_sle64(idx_ni->initialized_size);
1631	lck_spin_unlock(&idx_ni->size_lock);
1632	/* Copy the attribute name into place. */
1633	if (idx_ni->name_len)
1634		memcpy((u8*)a + offsetof(ATTR_RECORD, compressed_size),
1635				idx_ni->name, idx_ni->name_len <<
1636				NTFSCHAR_SIZE_SHIFT);
1637	/* Generate the mapping pairs array into place. */
1638	err = ntfs_mapping_pairs_build(vol, (s8*)a + mp_ofs, mp_size,
1639			idx_ni->rl.elements ? idx_ni->rl.rl : NULL, 0,
1640			-1, NULL);
1641	if (err)
1642		panic("%s(): err (ntfs_mapping_pairs_build())\n", __FUNCTION__);
1643	/*
1644	 * If the inode does not use the attribute list attribute we are done.
1645	 *
1646	 * If the inode uses the attribute list attribute (including the case
1647	 * where we just created it), we need to add an attribute list
1648	 * attribute entry for the attribute.
1649	 */
1650	if (!NInoAttrList(ni))
1651		goto done;
1652	/* Add an attribute list attribute entry for the inserted attribute. */
1653	al_entry = ctx->al_entry;
1654	al_entry_used = offsetof(ATTR_LIST_ENTRY, name) +
1655			(idx_ni->name_len << NTFSCHAR_SIZE_SHIFT);
1656	al_entry_len = (al_entry_used + 7) & ~7;
1657	new_al_size = ni->attr_list_size + al_entry_len;
1658	/* Out of bounds checks. */
1659	if ((u8*)al_entry < ni->attr_list ||
1660			(u8*)al_entry > ni->attr_list + new_al_size ||
1661			(u8*)al_entry + al_entry_len >
1662			ni->attr_list + new_al_size) {
1663		/* Inode is corrupt. */
1664		ntfs_error(vol->mp, "Mft_no 0x%llx is corrupt.  Run chkdsk.",
1665				(unsigned long long)ni->mft_no);
1666		err = EIO;
1667		goto undo;
1668	}
1669	err = ntfs_attr_size_bounds_check(vol, AT_ATTRIBUTE_LIST, new_al_size);
1670	if (err) {
1671		if (err == ERANGE) {
1672			ntfs_error(vol->mp, "Cannot insert attribute into "
1673					"mft_no 0x%llx because the attribute "
1674					"list attribute would become too "
1675					"large.  You need to defragment your "
1676					"volume and then try again.",
1677					(unsigned long long)ni->mft_no);
1678			err = ENOSPC;
1679		} else {
1680			ntfs_error(vol->mp, "Attribute list attribute is "
1681					"unknown on the volume.  The volume "
1682					"is corrupt.  Run chkdsk.");
1683			NVolSetErrors(vol);
1684			err = EIO;
1685		}
1686		goto undo;
1687	}
1688	new_al_alloc = (new_al_size + NTFS_ALLOC_BLOCK - 1) &
1689			~(NTFS_ALLOC_BLOCK - 1);
1690	/*
1691	 * Reallocate the memory buffer if needed and create space for the new
1692	 * entry.
1693	 */
1694	if (new_al_alloc > ni->attr_list_alloc) {
1695		u8 *tmp, *al, *al_end;
1696		unsigned al_entry_ofs;
1697
1698		tmp = OSMalloc(new_al_alloc, ntfs_malloc_tag);
1699		if (!tmp) {
1700			ntfs_error(vol->mp, "Not enough memory to extend "
1701					"attribute list attribute of mft_no "
1702					"0x%llx.",
1703					(unsigned long long)ni->mft_no);
1704			err = ENOMEM;
1705			goto undo;
1706		}
1707		al = ni->attr_list;
1708		al_entry_ofs = (u8*)al_entry - al;
1709		al_end = al + ni->attr_list_size;
1710		memcpy(tmp, al, al_entry_ofs);
1711		if ((u8*)al_entry < al_end)
1712			memcpy(tmp + al_entry_ofs + al_entry_len,
1713					al + al_entry_ofs,
1714					ni->attr_list_size - al_entry_ofs);
1715		al_entry = ctx->al_entry = (ATTR_LIST_ENTRY*)(tmp +
1716				al_entry_ofs);
1717		OSFree(ni->attr_list, ni->attr_list_alloc, ntfs_malloc_tag);
1718		ni->attr_list_alloc = new_al_alloc;
1719		ni->attr_list = tmp;
1720	} else if ((u8*)al_entry < ni->attr_list + ni->attr_list_size)
1721		memmove((u8*)al_entry + al_entry_len, al_entry,
1722				ni->attr_list_size - ((u8*)al_entry -
1723				ni->attr_list));
1724	ni->attr_list_size = new_al_size;
1725	/* Set up the attribute list entry. */
1726	al_entry->type = AT_INDEX_ALLOCATION;
1727	al_entry->length = cpu_to_le16(al_entry_len);
1728	al_entry->name_length = idx_ni->name_len;
1729	al_entry->name_offset = offsetof(ATTR_LIST_ENTRY, name);
1730	al_entry->lowest_vcn = 0;
1731	al_entry->mft_reference = MK_LE_MREF(ctx->ni->mft_no, ctx->ni->seq_no);
1732	al_entry->instance = a->instance;
1733	/* Copy the attribute name into place. */
1734	if (idx_ni->name_len)
1735		memcpy((u8*)&al_entry->name, idx_ni->name,
1736				idx_ni->name_len << NTFSCHAR_SIZE_SHIFT);
	/* For tidiness, zero any unused space. */
1738	if (al_entry_len != al_entry_used) {
1739		if (al_entry_len < al_entry_used)
1740			panic("%s(): al_entry_len < al_entry_used\n",
1741					__FUNCTION__);
1742		memset((u8*)al_entry + al_entry_used, 0,
1743				al_entry_len - al_entry_used);
1744	}
1745	/*
1746	 * Extend the attribute list attribute and copy in the modified
1747	 * value from the cache.
1748	 */
1749	err = ntfs_attr_list_sync_extend(ni, base_m,
1750			(u8*)al_entry - ni->attr_list, ctx);
1751	if (err) {
1752		ntfs_error(vol->mp, "Failed to extend attribute list "
1753				"attribute of mft_no 0x%llx (error %d).",
1754				(unsigned long long)ni->mft_no, err);
1755		al_entry_added = TRUE;
1756		goto undo_al;
1757	}
1758done:
1759	ntfs_debug("Done.");
1760	return 0;
1761undo:
1762	al_entry_added = FALSE;
1763undo_al:
1764	/*
1765	 * Need to remove the attribute again or free the extent mft record if
1766	 * there are no attributes remaining in it.
1767	 */
1768	if (m == base_m || !ntfs_attr_record_is_only_one(m, a)) {
1769		ntfs_attr_record_delete_internal(m, a);
1770		/*
1771		 * If the attribute was not in the base mft record mark the
1772		 * extent mft record dirty so it gets written out later.  If
1773		 * the attribute was in the base mft record it will be marked
1774		 * dirty later.
1775		 *
1776		 * We also unmap the extent mft record and we set @ctx->ni to
1777		 * equal the base inode @ni so that the search context is
1778		 * initialized from scratch or simply freed if the caller
1779		 * reinitializes or releases the search context respectively.
1780		 */
1781		if (m != base_m) {
1782			NInoSetMrecNeedsDirtying(ctx->ni);
1783			ntfs_extent_mft_record_unmap(ctx->ni);
1784			ctx->ni = ni;
1785		}
1786	} else {
1787		errno_t err2;
1788		BOOL al_needed;
1789
1790		err2 = ntfs_extent_mft_record_free(ni, ctx->ni, m);
1791		if (err2) {
1792			/*
1793			 * Ignore the error as we just end up with an unused
1794			 * mft record that is marked in use.
1795			 */
1796			ntfs_error(vol->mp, "Failed to free extent mft_no "
1797					"0x%llx (error %d).  Unmount and run "
1798					"chkdsk to recover the lost inode.",
1799					(unsigned long long)ctx->ni->mft_no,
1800					err2);
1801			NVolSetErrors(vol);
1802			/*
1803			 * Relese the extent mft record after dirtying it thus
1804			 * simulating the effect of freeing it.
1805			 */
1806			NInoSetMrecNeedsDirtying(ctx->ni);
1807			ntfs_extent_mft_record_unmap(ctx->ni);
1808		}
1809		/*
1810		 * The attribute search context still points to the no longer
1811		 * mapped extent inode thus we need to change it to point to
1812		 * the base inode instead so the context can be reinitialized
1813		 * or released safely.
1814		 */
1815		ctx->ni = ni;
1816		/*
1817		 * Check the attribute list attribute.  If there are no other
1818		 * attribute list attribute entries referencing extent mft
1819		 * records delete the attribute list attribute altogether.
1820		 *
1821		 * If this fails it does not matter as we simply retain the
1822		 * attribute list attribute so we ignore the error and go on to
1823		 * delete the attribute list attribute entry instead.
1824		 *
1825		 * If there are other attribute list attribute entries
1826		 * referencing extent mft records we still need the attribute
1827		 * list attribute thus we go on to delete the attribute list
1828		 * entry corresponding to the attribute record we just deleted
1829		 * by freeing its extent mft record.
1830		 */
1831		err2 = ntfs_attr_list_is_needed(ni,
1832				al_entry_added ? al_entry : NULL, &al_needed);
1833		if (err2)
1834			ntfs_warning(vol->mp, "Failed to determine if "
1835					"attribute list attribute of mft_no "
1836					"0x%llx if still needed (error %d).  "
1837					"Assuming it is still needed and "
1838					"continuing.",
1839					(unsigned long long)ni->mft_no, err2);
1840		else if (!al_needed) {
1841			/*
1842			 * No more extent mft records are in use.  Delete the
1843			 * attribute list attribute.
1844			 */
1845			ntfs_attr_search_ctx_reinit(ctx);
1846			err2 = ntfs_attr_list_delete(ni, ctx);
1847			if (!err2) {
1848				/*
1849				 * We deleted the attribute list attribute and
1850				 * this will have updated the base inode
1851				 * appropriately thus we have restored
1852				 * everything as it was before.
1853				 */
1854				return err;
1855			}
1856			ntfs_warning(vol->mp, "Failed to delete attribute "
1857					"list attribute of mft_no 0x%llx "
1858					"(error %d).  Continuing using "
1859					"alternative error recovery method.",
1860					(unsigned long long)ni->mft_no, err2);
1861		}
1862	}
1863	/*
1864	 * Both @ctx and @ni are now invalid and cannot be used any more which
1865	 * is fine as we have finished dealing with the attribute record.
1866	 *
1867	 * We now need to delete the corresponding attribute list attribute
1868	 * entry if we created it.
1869	 *
1870	 * Then we need to rewrite the attribute list attribute again because
1871	 * ntfs_attr_list_sync_extend() may have left it in an indeterminate
1872	 * state.
1873	 */
1874	if (al_entry_added) {
1875		errno_t err2;
1876
1877		ntfs_attr_list_entry_delete(ni, al_entry);
1878		ntfs_attr_search_ctx_reinit(ctx);
1879		err2 = ntfs_attr_list_sync_shrink(ni, 0, ctx);
1880		if (err2) {
1881			ntfs_error(vol->mp, "Failed to restore attribute list "
1882					"attribute in base mft_no 0x%llx "
1883					"(error %d).  Leaving inconsistent "
1884					"metadata.  Unmount and run chkdsk.",
1885					(unsigned long long)ni->mft_no, err2);
1886			NVolSetErrors(vol);
1887		}
1888	}
1889	/* Make sure any changes are written out. */
1890	NInoSetMrecNeedsDirtying(ni);
1891	return err;
1892}
1893
1894/**
1895 * ntfs_index_block_lay_out - lay out an index block into a memory buffer
1896 * @idx_ni:	index inode to which the index block will belong
1897 * @vcn:	vcn of index block
1898 * @ia:		destination buffer of size >= @idx_ni->block_size bytes
1899 *
1900 * Lay out an empty index allocation block with the @vcn into the buffer @ia.
1901 * The index inode @idx_ni is needed because we need to know the size of an
1902 * index block and the @vcn is needed because we need to record it in the index
1903 * block.
1904 *
1905 * Locking: Caller must hold @idx_ni->lock on the index inode for writing.
1906 */
1907static void ntfs_index_block_lay_out(ntfs_inode *idx_ni, VCN vcn,
1908		INDEX_BLOCK *ia)
1909{
1910	INDEX_HEADER *ih;
1911	INDEX_ENTRY *ie;
1912	u32 ie_ofs;
1913
1914	bzero(ia, idx_ni->block_size);
1915	ia->magic = magic_INDX;
1916	ia->usa_ofs = const_cpu_to_le16(sizeof(INDEX_BLOCK));
1917	ia->usa_count = cpu_to_le16(1 + (idx_ni->block_size / NTFS_BLOCK_SIZE));
1918	/* Set the update sequence number to 1. */
1919	*(le16*)((u8*)ia + le16_to_cpu(ia->usa_ofs)) = cpu_to_le16(1);
1920	ia->index_block_vcn = cpu_to_sle64(vcn);
1921	ih = &ia->index;
1922	ie_ofs = (sizeof(INDEX_HEADER) +
1923			(le16_to_cpu(ia->usa_count) << 1) + 7) & ~7;
1924	ih->entries_offset = cpu_to_le32(ie_ofs);
1925	ih->index_length = cpu_to_le32(ie_ofs + sizeof(INDEX_ENTRY_HEADER));
1926	ih->allocated_size = cpu_to_le32(idx_ni->block_size -
1927			offsetof(INDEX_BLOCK, index));
1928	ih->flags = LEAF_NODE;
1929	ie = (INDEX_ENTRY*)((u8*)ih + ie_ofs);
1930	ie->length = const_cpu_to_le16(sizeof(INDEX_ENTRY_HEADER));
1931	ie->flags = INDEX_ENTRY_END;
1932}
1933
1934/**
1935 * ntfs_index_block_alloc - allocate and return an index allocation block
1936 * @ictx:	index context of the index for which to allocate a block
1937 * @dst_vcn:	pointer in which to return the VCN of the allocated index block
1938 * @dst_ia:	pointer in which to return the allocated index block
1939 * @dst_upl_ofs: pointer in which to return the mapped address of the page data
1940 * @dst_upl:	pointer in which to return the page list containing @ia
1941 * @dst_pl:	pointer in which to return the array of pages containing @ia
1942 * @dst_addr:	pointer in which to return the mapped address
1943 *
1944 * Allocate an index allocation block for the index described by the index
1945 * context @ictx and return it in *@dst_ia.  *@dst_vcn, *@dst_upl_ofs,
1946 * *@dst_upl, *@dst_pl, and *@dst_addr are set to the VCN of the allocated
1947 * index block and the mapped page list and array of pages containing the
1948 * returned index block respectively.
1949 *
1950 * Return 0 on success and errno on error.  On error *@dst_vcn, *@dst_ia,
1951 * *@dst_upl_ofs, *@dst_upl, *@dst_pl, and *@dst_addr are left untouched.
1952 *
1953 * Locking: - Caller must hold @ictx->idx_ni->lock on the index inode for
1954 *	      writing.
1955 *	    - All of the index contexts in the index must be unlocked (this
1956 *	      includes @ictx, i.e. @ictx must not be locked).
1957 */
static errno_t ntfs_index_block_alloc(ntfs_index_context *ictx, VCN *dst_vcn,
		INDEX_BLOCK **dst_ia, s64 *dst_upl_ofs, upl_t *dst_upl,
		upl_page_info_array_t *dst_pl, u8 **dst_addr)
{
	s64 bmp_pos, end_pos, init_size, upl_ofs;
	ntfs_inode *bmp_ni, *idx_ni = ictx->idx_ni;
	upl_t upl;
	upl_page_info_array_t pl;
	u8 *bmp, *bmp_end;
	INDEX_ALLOCATION *ia;
	errno_t err, err2;
	lck_rw_type_t lock;
	le16 usn;
	BOOL have_resized;
	u8 bit;

	ntfs_debug("Entering for inode 0x%llx.",
			(unsigned long long)idx_ni->mft_no);
	/*
	 * Get the index bitmap inode.
	 *
	 * Note we do not lock the bitmap inode as the caller holds an
	 * exclusive lock on the index inode thus the bitmap cannot change
	 * under us as the only places the index bitmap is modified is as a
	 * side effect of a locked index inode.
	 *
	 * The inode will be locked later if/when we are going to modify its
	 * size to ensure any relevant VM activity that may be happening at the
	 * same time is excluded.
	 *
	 * We need to take the ntfs inode lock on the bitmap inode for writing.
	 * This causes a complication because there is a valid case in which we
	 * can recurse (once) back into ntfs_index_block_alloc() by calling:
	 * ntfs_attr_resize(bmp_ni) ->
	 *   ntfs_index_move_root_to_allocation_block() ->
	 *     ntfs_index_block_alloc()
	 * In this case we have to not take the lock again or we will deadlock
	 * (or with lock debugging enabled the kernel will panic()).  Thus we
	 * set @ictx->bmp_is_locked so that when we get back here we know not
	 * to take the lock again.
	 */
	lock = 0;
	if (!ictx->bmp_is_locked) {
		lock = LCK_RW_TYPE_EXCLUSIVE;
		ictx->bmp_is_locked = 1;
	}
	/* @lock is non-zero iff we (not a recursive caller) own the lock. */
	err = ntfs_attr_inode_get(ictx->base_ni, AT_BITMAP, idx_ni->name,
			idx_ni->name_len, FALSE, lock, &bmp_ni);
	if (err) {
		ntfs_error(idx_ni->vol->mp, "Failed to get index bitmap inode "
				"(error %d).", err);
		if (lock)
			ictx->bmp_is_locked = 0;
		goto err;
	}
	/* Find the first zero bit in the index bitmap. */
	bmp_pos = 0;
	lck_spin_lock(&bmp_ni->size_lock);
	end_pos = bmp_ni->initialized_size;
	lck_spin_unlock(&bmp_ni->size_lock);
	/* Scan the initialized part of the bitmap one page at a time. */
	for (bmp_pos = 0; bmp_pos < end_pos;) {
		err = ntfs_page_map(bmp_ni, bmp_pos, &upl, &pl, &bmp, TRUE);
		if (err) {
			ntfs_error(idx_ni->vol->mp, "Failed to read index "
					"bitmap (error %d).", err);
			goto put_err;
		}
		bmp_end = bmp + PAGE_SIZE;
		/* Do not scan past the initialized size in the last page. */
		if (bmp_pos + PAGE_SIZE > end_pos)
			bmp_end = bmp + (end_pos - bmp_pos);
		/* Check the next bit(s). */
		for (; bmp < bmp_end; bmp++, bmp_pos++) {
			if (*bmp == 0xff)
				continue;
			/*
			 * TODO: There does not appear to be a ffz() function
			 * in the kernel. )-:  If/when the kernel has an ffz()
			 * function, switch the below code to use it.
			 *
			 * So emulate "ffz(x)" using "ffs(~x) - 1" which gives
			 * the same result but incurs extra CPU overhead.
			 */
			bit = ffs(~(unsigned long)*bmp) - 1;
			if (bit < 8)
				goto allocated_bit;
		}
		ntfs_page_unmap(bmp_ni, upl, pl, FALSE);
	}
	/*
	 * There are no zero bits in the initialized part of the bitmap.  Thus
	 * we extend it by 8 bytes and allocate the first bit of the extension.
	 */
	bmp_pos = end_pos;
	lck_spin_lock(&bmp_ni->size_lock);
	end_pos = bmp_ni->data_size;
	lck_spin_unlock(&bmp_ni->size_lock);
	/* If we are exceeding the bitmap size need to extend it. */
	have_resized = FALSE;
	if (bmp_pos + 8 >= end_pos) {
		ntfs_debug("Extending index bitmap, old size 0x%llx, "
				"requested size 0x%llx.",
				(unsigned long long)end_pos,
				(unsigned long long)end_pos + 8);
		/*
		 * Note this resize can recurse back into this function via
		 * ntfs_index_move_root_to_allocation_block() (see the big
		 * comment above) which is why @ictx is passed in.
		 */
		err = ntfs_attr_resize(bmp_ni, end_pos + 8, 0, ictx);
		if (err) {
			ntfs_error(idx_ni->vol->mp, "Failed to extend index "
					"bitmap (error %d).", err);
			goto put_err;
		}
		have_resized = TRUE;
	}
	/*
	 * Get the page containing the bit we are allocating.  Note this has to
	 * happen before we call ntfs_attr_set_initialized_size() as the latter
	 * only sets the initialized size without zeroing the area between the
	 * old initialized size and the new one thus we need to map the page
	 * now so that its tail is zeroed due to the old value of the
	 * initialized size.
	 */
	err = ntfs_page_map(bmp_ni, bmp_pos & ~PAGE_MASK_64, &upl, &pl, &bmp,
			TRUE);
	if (err) {
		ntfs_error(idx_ni->vol->mp, "Failed to read index bitmap "
				"(error %d).", err);
		/*
		 * There is no need to resize the bitmap back to its old size.
		 * It does not change the metadata consistency to have a bigger
		 * bitmap.
		 */
		goto put_err;
	}
	/* @bmp now points at the byte containing the bit to allocate. */
	bmp += (unsigned)bmp_pos & PAGE_MASK;
	/*
	 * Extend the initialized size if the bitmap is non-resident.  If it is
	 * resident this was already done by the ntfs_attr_resize() call.
	 */
	if (NInoNonResident(bmp_ni)) {
#ifdef DEBUG
		lck_spin_lock(&bmp_ni->size_lock);
		init_size = bmp_ni->initialized_size;
		lck_spin_unlock(&bmp_ni->size_lock);
		ntfs_debug("Setting initialized size of index bitmap, old "
				"size 0x%llx, requested size 0x%llx.",
				(unsigned long long)init_size,
				(unsigned long long)end_pos + 8);
#endif
		err = ntfs_attr_set_initialized_size(bmp_ni, end_pos + 8);
		if (err) {
			ntfs_error(idx_ni->vol->mp, "Failed to update "
					"initialized size of index bitmap "
					"(error %d).", err);
			ntfs_page_unmap(bmp_ni, upl, pl, FALSE);
			goto put_err;
		}
	}
	/* Finally, allocate the bit in the page. */
	bit = 0;
	/*
	 * If we used ntfs_attr_resize() to extend the bitmap attribute it is
	 * possible that this caused the index root node to be moved out to an
	 * index allocation block which very likely will have allocated the
	 * very bit we want to allocate so we test for this case and if it has
	 * happened we allocate the next bit along which must be free or
	 * something has gone wrong.
	 */
	if (have_resized && *bmp & 1) {
		if (*bmp & 2)
			panic("%s(): *bmp & 2\n", __FUNCTION__);
		bit++;
	}
allocated_bit:
	/* Mark the bit in use and release the bitmap page as dirty. */
	*bmp |= 1 << bit;
	ntfs_page_unmap(bmp_ni, upl, pl, TRUE);
	/* Set @bmp_pos to the allocated index bitmap bit. */
	bmp_pos = (bmp_pos << 3) + bit;
	/*
	 * If we are caching the last set bit in the bitmap in the index inode
	 * and we allocated beyond the last set bit, update the cached value.
	 */
	if (idx_ni->last_set_bit >= 0 && bmp_pos > idx_ni->last_set_bit)
		idx_ni->last_set_bit = bmp_pos;
	ntfs_debug("Allocated index bitmap bit 0x%llx.", bmp_pos);
	/*
	 * We are done with the bitmap and have the index allocation to go.
	 *
	 * If the allocated bit is outside the data size need to extend it.
	 */
	lck_spin_lock(&idx_ni->size_lock);
	end_pos = idx_ni->data_size;
	lck_spin_unlock(&idx_ni->size_lock);
	if (bmp_pos >= end_pos >> idx_ni->block_size_shift) {
		ntfs_debug("Extending index allocation, old size 0x%llx, "
				"requested size 0x%llx.", (unsigned long long)
				end_pos, (unsigned long long)(bmp_pos + 1) <<
				idx_ni->block_size_shift);
		err = ntfs_attr_resize(idx_ni, (bmp_pos + 1) <<
				idx_ni->block_size_shift, 0, ictx);
		if (err) {
			ntfs_error(idx_ni->vol->mp, "Failed to extend index "
					"allocation (error %d).", err);
			goto alloc_err;
		}
	}
	/*
	 * Map the page containing the allocated index block.  As above for the
	 * bitmap we need to get the page before we set the initialized size so
	 * that the tail of the page is zeroed for us.
	 */
	upl_ofs = (bmp_pos << idx_ni->block_size_shift) & ~PAGE_MASK_64;
	err = ntfs_page_map(idx_ni, upl_ofs, &upl, &pl, &bmp, TRUE);
	if (err) {
		ntfs_error(idx_ni->vol->mp, "Failed to read index allocation "
				"block (error %d).", err);
		/*
		 * There is no need to truncate the index allocation back to
		 * its old size.  It does not change the metadata consistency
		 * to have a bigger index allocation.
		 */
		goto alloc_err;
	}
	/* Extend the initialized size if needed. */
	lck_spin_lock(&idx_ni->size_lock);
	init_size = idx_ni->initialized_size;
	lck_spin_unlock(&idx_ni->size_lock);
	if (bmp_pos >= init_size >> idx_ni->block_size_shift) {
		ntfs_debug("Setting initialized size of index allocation, old "
				"size 0x%llx, requested size 0x%llx.",
				(unsigned long long)init_size,
				(unsigned long long)(bmp_pos + 1) <<
				idx_ni->block_size_shift);
		err = ntfs_attr_set_initialized_size(idx_ni,
				(bmp_pos + 1) << idx_ni->block_size_shift);
		if (err) {
			ntfs_error(idx_ni->vol->mp, "Failed to update "
					"initialized size of index allocation "
					"(error %d).", err);
			goto unm_err;
		}
	}
	/* Done with the bitmap inode; drop its lock (if we took it) and the
	   vnode reference. */
	if (lock) {
		ictx->bmp_is_locked = 0;
		lck_rw_unlock_exclusive(&bmp_ni->lock);
	}
	(void)vnode_put(bmp_ni->vn);
	/* Lay out an empty index block into the allocated space. */
	ia = (INDEX_ALLOCATION*)(bmp + (((unsigned)bmp_pos <<
			idx_ni->block_size_shift) & PAGE_MASK));
	/* Preserve the update sequence number across the layout. */
	usn = 0;
	if (le16_to_cpu(ia->usa_ofs) < NTFS_BLOCK_SIZE - sizeof(u16))
		usn = *(le16*)((u8*)ia + le16_to_cpu(ia->usa_ofs));
	/* Calculate the index block vcn from the index block number. */
	*dst_vcn = bmp_pos = bmp_pos << idx_ni->block_size_shift >>
			idx_ni->vcn_size_shift;
	ntfs_index_block_lay_out(idx_ni, bmp_pos, ia);
	/* Restore the old usn unless it was unset or is the protection
	   value 0xffff which must not be used. */
	if (usn && usn != const_cpu_to_le16(0xffff))
		*(le16*)((u8*)ia + le16_to_cpu(ia->usa_ofs)) = usn;
	*dst_ia = ia;
	*dst_upl_ofs = upl_ofs;
	*dst_upl = upl;
	*dst_pl = pl;
	*dst_addr = bmp;
	ntfs_debug("Done (allocated index block with vcn 0x%llx.",
			(unsigned long long)bmp_pos);
	return 0;
unm_err:
	/* Release the index allocation page without dirtying it. */
	ntfs_page_unmap(idx_ni, upl, pl, FALSE);
alloc_err:
	/* Free the index bitmap bit that we allocated. */
	err2 = ntfs_bitmap_clear_bit(bmp_ni, bmp_pos);
	if (err2) {
		ntfs_error(idx_ni->vol->mp, "Failed to undo index block "
				"allocation in index bitmap (error %d).  "
				"Leaving inconsistent metadata.  Run chkdsk.",
				err2);
		NVolSetErrors(idx_ni->vol);
	}
put_err:
	/* Drop the bitmap inode lock (if we took it) and vnode reference. */
	if (lock) {
		ictx->bmp_is_locked = 0;
		lck_rw_unlock_exclusive(&bmp_ni->lock);
	}
	(void)vnode_put(bmp_ni->vn);
err:
	ntfs_error(idx_ni->vol->mp, "Failed (error %d).", err);
	return err;
}
2245
2246/**
2247 * ntfs_index_ctx_disconnect_reinit - disconnect and reinit an index context
2248 * @ictx:	index context to disconnect
2249 *
2250 * Disconnect the index context @ictx from its tree path and reinitialize its
2251 * @up and @down pointers to point to itself.
2252 *
2253 * Locking: - Caller must hold @ictx->idx_ni->lock on the index inode.
2254 */
2255static inline void ntfs_index_ctx_disconnect_reinit(ntfs_index_context *ictx)
2256{
2257	ntfs_index_ctx_disconnect(ictx);
2258	ictx->down = ictx->up = ictx;
2259}
2260
2261/**
2262 * ntfs_index_move_root_to_allocation_block - move index root to allocation
2263 * @ictx:	index lookup context describing index root to move
2264 *
2265 * Move the index root to an index allocation block in the process switching
2266 * the index from a small to a large index if necessary.
2267 *
2268 * If no index allocation and/or bitmap attributes exist they are created.
2269 *
2270 * An index block is then allocated and if necessary the index bitmap and/or
2271 * the allocation attributes are extended in the process.
2272 *
2273 * Finally all index entries contained in the index root attribute are moved
2274 * into the allocated index block in the index allocation attribute.
2275 *
2276 * We need to potentially create the index allocation and/or bitmap attributes
2277 * so we can move the entries from the index root attribute to the index
2278 * allocation attribute and then shrink the index root attribute.  However,
2279 * there is not enough space in the mft record to do this.  Also we already
2280 * have the index root attribute looked up so it makes sense to deal with it
2281 * first.
2282 *
2283 * Thus, if there is no index allocation attribute, we allocate the space to be
2284 * used by the index allocation attribute and setup the directory inode for a
2285 * large index including the allocated space but leaving the initialized size
2286 * to zero.  We then map and lock the first page containing the now allocated
2287 * first index block and move the index entries from the index root into it.
2288 * We then shrink the index root and set it up to point to the allocated index
2289 * block.
2290 *
2291 * Having shrunk the index root attribute there is now hopefully enough space
2292 * in the mft record to create the index allocation attribute and the index
2293 * bitmap attribute in the mft record and the conversion is complete.
2294 *
2295 * If there is not enough space we create the index allocation and/or index
2296 * bitmap attributes in an extent mft record.  TODO: This is not implemented
2297 * yet.
2298 *
2299 * If there is an index allocation attribute already, we allocate a temporary
2300 * buffer to hold the index block and then copy from there into the allocated
2301 * index block later.
2302 *
2303 * Return 0 on success and errno on error.  On error the index context is no
2304 * longer usable and must be released or reinitialized.
2305 *
2306 * Locking: - Caller must hold @ictx->idx_ni->lock on the index inode for
2307 *	      writing.
2308 *	    - The index context @ictx must be locked.
2309 */
2310errno_t ntfs_index_move_root_to_allocation_block(ntfs_index_context *ictx)
2311{
2312	VCN vcn;
2313	ntfs_inode *base_ni, *idx_ni;
2314	ntfs_volume *vol;
2315	ntfs_index_context *ir_ictx;
2316	upl_t upl;
2317	upl_page_info_array_t pl;
2318	INDEX_ALLOCATION *ia;
2319	INDEX_HEADER *ih;
2320	INDEX_ROOT *ir;
2321	INDEX_ENTRY *ie;
2322	ntfs_attr_search_ctx *actx;
2323	MFT_RECORD *m;
2324	u32 u, ia_ie_ofs, ir_ie_ofs, clusters = 0;
2325	errno_t err, err2;
2326	struct {
2327		unsigned is_large_index:1;
2328	} old;
2329	BOOL need_ubc_setsize = FALSE;
2330
2331	ntfs_debug("Entering.");
2332	if (!ictx->is_locked)
2333		panic("%s(): !ictx->is_locked\n", __FUNCTION__);
2334	base_ni = ictx->base_ni;
2335	idx_ni = ictx->idx_ni;
2336	vol = base_ni->vol;
2337	/*
2338	 * The current index context is going to turn from describing the index
2339	 * root to describing the newly allocated index block.  Thus, allocate
2340	 * a new index context for the emptied index root.
2341	 */
2342	ir_ictx = ntfs_index_ctx_get(idx_ni);
2343	if (!ir_ictx)
2344		return ENOMEM;
2345	ir_ictx->entries = OSMalloc(ictx->max_entries * sizeof(INDEX_ENTRY*),
2346			ntfs_malloc_tag);
2347	if (!ir_ictx->entries) {
2348		err = ENOMEM;
2349		goto err;
2350	}
2351	/*
2352	 * If there is no index allocation attribute, we need to allocate an
2353	 * index block worth of clusters.  Thus if the cluster size is less
2354	 * than the index block size the number of clusters we need to allocate
2355	 * is the index block size divided by the cluster size and if the
2356	 * cluster size is greater than or equal to the index block size we
2357	 * simply allocate one cluster.
2358	 *
2359	 * Otherwise we allocate a temporary buffer for the index block.
2360	 */
2361	if (!NInoIndexAllocPresent(idx_ni)) {
2362		clusters = idx_ni->block_size >> vol->cluster_size_shift;
2363		if (!clusters)
2364			clusters = 1;
2365		/*
2366		 * We cannot lock the runlist as we are holding the mft record
2367		 * lock.  That is fortunately ok as we also have @idx_ni->lock
2368		 * on the index inode and there is no index allocation
2369		 * attribute at present so no-one can be using the runlist yet.
2370		 */
2371		err = ntfs_cluster_alloc(vol, 0, clusters, -1, DATA_ZONE, TRUE,
2372				&idx_ni->rl);
2373		if (err) {
2374			ntfs_error(vol->mp, "Failed to allocate %u clusters "
2375					"for the index allocation block "
2376					"(error %d).", (unsigned)clusters,
2377					err);
2378			goto err;
2379		}
2380		/* Allocate/get the first page and zero it out. */
2381		ntfs_page_grab(idx_ni, 0, &upl, &pl, (u8**)&ia, TRUE);
2382		if (err) {
2383			ntfs_error(vol->mp, "Failed to grab first page.");
2384			goto page_err;
2385		}
2386		bzero(ia, PAGE_SIZE);
2387	} else {
2388		/* Allocate a temporary buffer and zero it out. */
2389		ia = OSMalloc(idx_ni->block_size, ntfs_malloc_tag);
2390		if (!ia) {
2391			err = ENOMEM;
2392			goto err;
2393		}
2394		bzero(ia, idx_ni->block_size);
2395		upl = NULL;
2396		pl = NULL;
2397	}
2398	/*
2399	 * Set up the index block and copy the index entries from the index
2400	 * root.
2401	 */
2402	ia->magic = magic_INDX;
2403	ia->usa_ofs = const_cpu_to_le16(sizeof(INDEX_BLOCK));
2404	ia->usa_count = cpu_to_le16(1 + (idx_ni->block_size / NTFS_BLOCK_SIZE));
2405	/* Set the update sequence number to 1. */
2406	*(le16*)((u8*)ia + le16_to_cpu(ia->usa_ofs)) = cpu_to_le16(1);
2407	ih = &ia->index;
2408	ia_ie_ofs = (sizeof(INDEX_HEADER) +
2409			(le16_to_cpu(ia->usa_count) << 1) + 7) & ~7;
2410	ih->entries_offset = cpu_to_le32(ia_ie_ofs);
2411	/* Work out the size of the index entries in the index root. */
2412	ir = ictx->ir;
2413	ir_ie_ofs = le32_to_cpu(ir->index.entries_offset);
2414	/*
2415	 * FIXME: Should we scan through the index root looking for the last
2416	 * entry and use that to determine the size instead?  Then we could fix
2417	 * any space being wasted due to a too long index for its entries.
2418	 */
2419	u = le32_to_cpu(ir->index.index_length) - ir_ie_ofs;
2420	ih->index_length = cpu_to_le32(ia_ie_ofs + u);
2421	ih->allocated_size = cpu_to_le32(idx_ni->block_size -
2422			offsetof(INDEX_BLOCK, index));
2423	ih->flags = LEAF_NODE;
2424	ie = (INDEX_ENTRY*)((u8*)&ir->index + ir_ie_ofs);
2425	memcpy((u8*)ih + ia_ie_ofs, ie, u);
2426	/*
2427	 * If the index root is a large index, then the index block is an index
2428	 * node, i.e. not a leaf node.
2429	 */
2430	if (ir->index.flags & LARGE_INDEX) {
2431		old.is_large_index = 1;
2432		ih->flags |= INDEX_NODE;
2433	}
2434	/*
2435	 * Set up the index context to point into the index allocation block
2436	 * instead of into the index root.
2437	 */
2438	ictx->entry = (INDEX_ENTRY*)((u8*)ih + ia_ie_ofs +
2439			((u8*)ictx->entry - (u8*)ie));
2440	ictx->is_root = 0;
2441	actx = ictx->actx;
2442	ictx->ia = ia;
2443	ictx->vcn = 0;
2444	ictx->index = ih;
2445	ictx->upl_ofs = 0;
2446	ictx->upl = upl;
2447	ictx->pl = pl;
2448	ictx->addr = (u8*)ia;
2449	ictx->bytes_free = le32_to_cpu(ia->index.allocated_size) -
2450			le32_to_cpu(ia->index.index_length);
2451	/*
2452	 * We have copied the index entries and switched the index context so
2453	 * we can go ahead and shrink the index root by moving the end entry on
2454	 * top of the first entry.  We only need to do the move if the first
2455	 * index root entry is not the end entry, i.e. the index is not empty.
2456	 */
2457	ih = &ir->index;
2458	if (!(ie->flags & INDEX_ENTRY_END)) {
2459		u8 *index_end;
2460		INDEX_ENTRY *end_ie;
2461
2462		index_end = (u8*)&ir->index + le32_to_cpu(ih->index_length);
2463		for (end_ie = ie;; end_ie = (INDEX_ENTRY*)((u8*)end_ie +
2464				le16_to_cpu(end_ie->length))) {
2465			/* Bounds checks. */
2466			if ((u8*)end_ie < (u8*)ie || (u8*)end_ie +
2467					sizeof(INDEX_ENTRY_HEADER) >
2468					index_end || (u8*)end_ie +
2469					le16_to_cpu(end_ie->length) >
2470					index_end ||
2471					(u32)sizeof(INDEX_ENTRY_HEADER) +
2472					le16_to_cpu(end_ie->key_length) >
2473					le16_to_cpu(end_ie->length)) {
2474				ntfs_error(vol->mp, "Corrupt index.  Run "
2475						"chkdsk.");
2476				NVolSetErrors(vol);
2477				err = EIO;
2478				goto ictx_err;
2479			}
2480			/* Stop when we have found the last entry. */
2481			if (end_ie->flags & INDEX_ENTRY_END)
2482				break;
2483		}
2484		memmove(ie, end_ie, le16_to_cpu(end_ie->length));
2485	}
2486	/*
2487	 * If the end entry is a leaf we need to extend it by 8 bytes in order
2488	 * to turn it into a node entry.  To do this we need to extend the
2489	 * index root attribute itself in the case that the index root was
2490	 * empty to start with or we need to do nothing if the index root was
2491	 * not empty as we have not shrunk the index root attribute yet and we
2492	 * moved the end index entry forward by at least one index entry which
2493	 * is much bigger than 8 bytes.
2494	 *
2495	 * We do the index root attribute resize unconditionally because it
2496	 * takes care of the size increase needed in the empty index root case
2497	 * as well as of the size decrease needed in the non-empty index root
2498	 * case.
2499	 */
2500retry_resize:
2501	u = le16_to_cpu(ie->length);
2502	if (!(ie->flags & INDEX_ENTRY_NODE))
2503		u += sizeof(VCN);
2504	err = ntfs_resident_attr_value_resize(actx->m, actx->a,
2505			offsetof(INDEX_ROOT, index) +
2506			le32_to_cpu(ih->entries_offset) + u);
2507	if (err) {
2508		leMFT_REF mref;
2509		ATTR_RECORD *a;
2510		ntfschar *a_name;
2511		ATTR_LIST_ENTRY *al_entry;
2512		u8 *al_end;
2513		ntfs_attr_search_ctx ctx;
2514
2515		/*
2516		 * This can only happen when the first entry is the last entry,
2517		 * i.e. this is an empty directory and thus the above memmove()
2518		 * has not been done, in which case we only need eight bytes
2519		 * (sizeof(VCN)) more space which means that if we manage to
2520		 * move out a single attribute out of the mft record we would
2521		 * gain that much space.  In the worst case scenario we can
2522		 * always move out the index root attribute itself in which
2523		 * case we are guaranteed to have enough space as an empty
2524		 * index root is much smaller than an mft record.  The only
2525		 * complication is when there is no attribute list attribute as
2526		 * we have to add it first.  On the bright side that in itself
2527		 * can cause some space to be freed up an we may have enough to
2528		 * extend the index root by eight bytes.
2529		 *
2530		 * Add an attribute list attribute if it is not present.
2531		 */
2532		if (!NInoAttrList(base_ni)) {
2533			/*
2534			 * Take a copy of the current attribute record pointer
2535			 * so we can update all our pointers below.
2536			 */
2537			a = actx->a;
2538			err = ntfs_attr_list_add(base_ni, actx->m, actx);
2539			if (err || actx->is_error) {
2540				if (!err)
2541					err = actx->error;
2542				ntfs_error(vol->mp, "Failed to add attribute "
2543						"list attribute to mft_no "
2544						"0x%llx (error %d).",
2545						(unsigned long long)
2546						base_ni->mft_no, err);
2547				goto ictx_err;
2548			}
2549update_and_retry_resize:
2550			/*
2551			 * Need to update our cached pointers as the index root
2552			 * attribute is likely to have moved.
2553			 */
2554			ir = (INDEX_ROOT*)((u8*)actx->a +
2555					le16_to_cpu(actx->a->value_offset));
2556			ih = &ir->index;
2557			ir_ie_ofs = le32_to_cpu(ir->index.entries_offset);
2558			ie = (INDEX_ENTRY*)((u8*)&ir->index + ir_ie_ofs);
2559			/*
2560			 * Retry the resize in case we freed eight bytes in the
2561			 * mft record which is all we need.
2562			 */
2563			goto retry_resize;
2564		}
2565		/*
2566		 * If this is the only attribute record in the mft record we
2567		 * cannot gain anything by moving it or anything else.  This
2568		 * really cannot happen as we have emptied the index root thus
2569		 * have made the attribute as small as possible thus it must
2570		 * fit just fine in an otherwise empty mft record thus we must
2571		 * have enough space to do the resize thus we cannot have
2572		 * gotten here.
2573		 */
2574		if (ntfs_attr_record_is_only_one(actx->m, actx->a))
2575			panic("%s(): ntfs_attr_record_is_only_one()\n",
2576					__FUNCTION__);
2577		/*
2578		 * We now know we have an attribute list attribute and that we
2579		 * still do not have enough space to extend the index root
2580		 * attribute by eight bytes.
2581		 *
2582		 * First, if the index root attribute is already in an extent
2583		 * mft record move it into a new, empty extent mft record.
2584		 * This is guaranteed to give us the needed eight bytes.
2585		 *
2586		 * Second, look through the mft record to see if there are any
2587		 * attribute records we can move out into an extent mft record
2588		 * and if yes move one.  That will also definitely give us the
2589		 * needed eight bytes of space.
2590		 *
2591		 * Finally, if no attributes are available for moving then move
2592		 * the index root attribute itself.
2593		 */
2594		if (actx->ni != base_ni) {
2595move_idx_root:
2596			lck_rw_lock_shared(&base_ni->attr_list_rl.lock);
2597			err = ntfs_attr_record_move(actx);
2598			lck_rw_unlock_shared(&base_ni->attr_list_rl.lock);
2599			if (err) {
2600				ntfs_error(vol->mp, "Failed to move index "
2601						"root attribute from mft "
2602						"record 0x%llx to an extent "
2603						"mft record (error %d).",
2604						(unsigned long long)
2605						actx->ni->mft_no, err);
2606				goto ictx_err;
2607			}
2608			/*
2609			 * Need to update our cached pointers as the index root
2610			 * attribute has now moved after which we need to retry
2611			 * the resize which will now succeed.
2612			 */
2613			goto update_and_retry_resize;
2614		}
2615		/*
2616		 * We know the index root is in the base mft record.
2617		 *
2618		 * Look through the base mft record to see if there are any
2619		 * attribute records we can move out into an extent mft record
2620		 * and if yes move one.  That will also definitely give us the
2621		 * needed eight bytes of space.
2622		 */
2623		ntfs_attr_search_ctx_init(&ctx, base_ni, actx->m);
2624		ctx.is_iteration = 1;
2625		do {
2626			err = ntfs_attr_find_in_mft_record(AT_UNUSED, NULL, 0,
2627					NULL, 0, &ctx);
2628			if (err) {
2629				if (err == ENOENT) {
2630					/*
2631					 * No attributes are available for
2632					 * moving thus move the index root
2633					 * attribute out of the base mft record
2634					 * into a new extent.
2635					 *
2636					 * TODO: Need to get this case
2637					 * triggered and then need to run
2638					 * chkdsk to check for validity of
2639					 * moving the index root attribute out
2640					 * of the base mft record.
2641					 */
2642					goto move_idx_root;
2643				}
2644				ntfs_error(vol->mp, "Failed to iterate over "
2645						"attribute records in base "
2646						"mft record 0x%llx (error %d).",
2647						(unsigned long long)
2648						base_ni->mft_no, err);
2649				goto ictx_err;
2650			}
2651			/*
2652			 * Skip the standard information attribute, the
2653			 * attribute list attribute, and the index root
2654			 * attribute as we are looking for lower priority
2655			 * attributes to move out and the attribute list
2656			 * attribute is of course not movable.
2657			 */
2658			a = ctx.a;
2659		} while (a->type == AT_STANDARD_INFORMATION ||
2660				a->type == AT_ATTRIBUTE_LIST ||
2661				a->type == AT_INDEX_ROOT);
2662		/*
2663		 * Move the found attribute out to an extent mft record and
2664		 * update its attribute list entry.
2665		 *
2666		 * But first find the attribute list entry matching the
2667		 * attribute record so it can be updated.
2668		 */
2669		a_name = (ntfschar*)((u8*)a + le16_to_cpu(a->name_offset));
2670		al_entry = (ATTR_LIST_ENTRY*)base_ni->attr_list;
2671		al_end = (u8*)al_entry + base_ni->attr_list_size;
2672		mref = MK_LE_MREF(base_ni->mft_no, base_ni->seq_no);
2673		do {
2674			if ((u8*)al_entry >= al_end || !al_entry->length) {
2675				ntfs_error(vol->mp, "Attribute list attribute "
2676						"in mft_no 0x%llx is "
2677						"corrupt.  Run chkdsk.",
2678						(unsigned long long)
2679						base_ni->mft_no);
2680				err = EIO;
2681				goto ictx_err;
2682			}
2683			if (al_entry->mft_reference == mref &&
2684					al_entry->instance == a->instance) {
2685				/*
2686				 * We found the entry, stop looking but first
2687				 * perform a quick sanity check that we really
2688				 * do have the correct attribute record.
2689				 */
2690				if (al_entry->type == a->type &&
2691						ntfs_are_names_equal(
2692						(ntfschar*)((u8*)al_entry +
2693						al_entry->name_offset),
2694						al_entry->name_length, a_name,
2695						a->name_length, TRUE,
2696						vol->upcase, vol->upcase_len))
2697					break;
2698				ntfs_error(vol->mp, "Found corrupt attribute "
2699						"list attribute when looking "
2700						"for attribute type 0x%x in "
2701						"attribute list attribute of "
2702						"base mft record 0x%llx.  Run "
2703						"chkdsk.",
2704						(unsigned)le32_to_cpu(a->type),
2705						(unsigned long long)
2706						base_ni->mft_no);
2707				NVolSetErrors(vol);
2708				err = EIO;
2709				goto ictx_err;
2710			}
2711			/* Go to the next attribute list entry. */
2712			al_entry = (ATTR_LIST_ENTRY*)((u8*)al_entry +
2713					le16_to_cpu(al_entry->length));
2714		} while (1);
2715		/* Finally, move the attribute to an extent record. */
2716		err = ntfs_attr_record_move_for_attr_list_attribute(&ctx,
2717				al_entry, actx, NULL);
2718		if (err) {
2719			ntfs_error(vol->mp, "Failed to move attribute type "
2720					"0x%x out of base mft record 0x%llx "
2721					"and into an extent mft record (error "
2722					"%d).  Run chkdsk.",
2723					(unsigned)le32_to_cpu(a->type),
2724					(unsigned long long)base_ni->mft_no,
2725					err);
2726			NVolSetErrors(vol);
2727			goto ictx_err;
2728		}
2729		/*
2730		 * Sync the modified attribute list attribute value to
2731		 * metadata/disk.
2732		 */
2733		ntfs_attr_search_ctx_reinit(&ctx);
2734		err = ntfs_attr_list_sync(base_ni, (u8*)al_entry -
2735				base_ni->attr_list, &ctx);
2736		if (!err) {
2737			/*
2738			 * Need to update our cached pointers as the index root
2739			 * attribute has likely moved after which we need to
2740			 * retry the resize which will now succeed.
2741			 */
2742			goto update_and_retry_resize;
2743		}
2744		/*
2745		 * FIXME: We could try and revert the move of the attribute and
2746		 * the attribute list attribute value change but that is a lot
2747		 * of work and the only real errors that could happen at this
2748		 * stage are either that we have run out of memory or that the
2749		 * disk has gone bad or has been disconnected or there must be
2750		 * bad memory corruption.  In all those cases we are extremely
2751		 * unlikely to be able to undo what we have done so far due to
2752		 * further errors so at least for now we do not bother trying.
2753		 */
2754		ntfs_error(vol->mp, "Failed to sync attribute list attribute "
2755				"in mft_no 0x%llx (error %d).  Leaving "
2756				"corrupt metadata.  Run chkdsk.",
2757				(unsigned long long)base_ni->mft_no, err);
2758		/*
2759		 * Need to update our cached pointers as the index root
2760		 * attribute has likely moved.
2761		 */
2762		ir = (INDEX_ROOT*)((u8*)actx->a +
2763				le16_to_cpu(actx->a->value_offset));
2764		ih = &ir->index;
2765		ir_ie_ofs = le32_to_cpu(ir->index.entries_offset);
2766		ie = (INDEX_ENTRY*)((u8*)&ir->index + ir_ie_ofs);
2767		goto ictx_err;
2768	}
2769	/*
2770	 * Update the end index entry and the index header to reflect the
2771	 * changes in size.
2772	 */
2773	ie->length = cpu_to_le16(u);
2774	ih->allocated_size = ih->index_length = cpu_to_le32(
2775			le32_to_cpu(ih->entries_offset) + u);
2776	/*
2777	 * Update the index root and end index entry to reflect the fact that
2778	 * the directory now is a large index and that the end index entry
2779	 * points to a sub-node and set the sub-node pointer to vcn 0.
2780	 */
2781	ih->flags |= LARGE_INDEX;
2782	ie->flags |= INDEX_ENTRY_NODE;
2783	*(leVCN*)((u8*)ie + le16_to_cpu(ie->length) - sizeof(VCN)) = 0;
2784	/*
2785	 * Now setup the new index context for the emptied index root and
2786	 * attach it at the start of the tree path.
2787	 */
2788	ir_ictx->entries[0] = ir_ictx->entry = ie;
2789	ir_ictx->is_root = 1;
2790	ir_ictx->is_locked = 1;
2791	ir_ictx->ir = ir;
2792	ir_ictx->index = ih;
2793	ir_ictx->actx = actx;
2794	ir_ictx->bytes_free = le32_to_cpu(actx->m->bytes_allocated) -
2795			le32_to_cpu(actx->m->bytes_in_use);
2796	ir_ictx->max_entries = ictx->max_entries;
2797	ir_ictx->nr_entries = 1;
2798	/* Ensure the modified index root attribute is written to disk. */
2799	ntfs_index_entry_mark_dirty(ir_ictx);
2800	/*
2801	 * Attach the new index context between the current index context
2802	 * (which is the top of the tree) and the index context above it (which
2803	 * is the bottom of the tree), i.e. make the new index context the top
2804	 * of the tree.
2805	 */
2806	ictx->up->down = ir_ictx;
2807	ir_ictx->down = ictx;
2808	ir_ictx->up = ictx->up;
2809	ictx->up = ir_ictx;
2810	/*
2811	 * If the index allocation attribute is not present yet, we need to
2812	 * create it and the index bitmap attribute.
2813	 */
2814	if (upl) {
2815		/*
2816		 * The page is now done, mark the index context as dirty so it
2817		 * gets written out later.
2818		 */
2819		ntfs_index_entry_mark_dirty(ictx);
2820		/*
2821		 * Set up the index inode to reflect that it has an index
2822		 * allocation attribute.
2823		 */
2824		NInoSetIndexAllocPresent(idx_ni);
2825		lck_spin_lock(&idx_ni->size_lock);
2826		idx_ni->allocated_size = (s64)clusters <<
2827				vol->cluster_size_shift;
2828		idx_ni->initialized_size = idx_ni->data_size =
2829				idx_ni->block_size;
2830		lck_spin_unlock(&idx_ni->size_lock);
2831		if (idx_ni->name == I30) {
2832			lck_spin_lock(&base_ni->size_lock);
2833			base_ni->allocated_size = idx_ni->allocated_size;
2834			base_ni->initialized_size = base_ni->data_size =
2835					idx_ni->block_size;
2836			lck_spin_unlock(&base_ni->size_lock);
2837		}
2838		if (!ubc_setsize(idx_ni->vn, idx_ni->data_size))
2839			panic("%s(): ubc_setsize() failed.\n", __FUNCTION__);
2840		/*
2841		 * Find the position at which we need to insert the index
2842		 * allocation attribute.
2843		 */
2844		err = ntfs_attr_lookup(AT_INDEX_ALLOCATION, idx_ni->name,
2845				idx_ni->name_len, 0, NULL, 0, actx);
2846		if (err != ENOENT) {
2847			if (!err) {
2848				ntfs_error(vol->mp, "Index allocation "
2849						"attribute already present "
2850						"when it should not be.  "
2851						"Corrupt index or driver "
2852						"bug.  Run chkdsk.");
2853				NVolSetErrors(vol);
2854				err = EIO;
2855			} else
2856				ntfs_error(vol->mp, "Failed to look up "
2857						"position at which to insert "
2858						"the index allocation "
2859						"attribute (error %d).", err);
2860			goto ia_err;
2861		}
2862		/* Insert the index allocation attribute. */
2863		err = ntfs_insert_index_allocation_attribute(base_ni, actx,
2864				idx_ni);
2865		if (err || actx->is_error) {
2866			ntfs_error(vol->mp, "Failed to %s mft_no 0x%llx "
2867					"(error %d).", actx->is_error ?
2868					"remap extent mft record of" :
2869					"insert index allocation attribute in ",
2870					(unsigned long long)base_ni->mft_no,
2871					err);
2872			goto ia_err;
2873		}
2874		/*
2875		 * Ensure the created index allocation attribute is written to
2876		 * disk.
2877		 */
2878		NInoSetMrecNeedsDirtying(actx->ni);
2879		/*
2880		 * Find the position at which we need to insert the index
2881		 * bitmap attribute.
2882		 */
2883		ntfs_attr_search_ctx_reinit(actx);
2884		err = ntfs_attr_lookup(AT_BITMAP, idx_ni->name,
2885				idx_ni->name_len, 0, NULL, 0, actx);
2886		if (err != ENOENT) {
2887			if (!err) {
2888				ntfs_error(vol->mp, "Index bitmap attribute "
2889						"attribute already present "
2890						"when it should not be.  "
2891						"Corrupt index or driver "
2892						"bug.  Run chkdsk.");
2893				NVolSetErrors(vol);
2894				err = EIO;
2895			} else
2896				ntfs_error(vol->mp, "Failed to look up "
2897						"position at which to insert "
2898						"the index bitmap attribute "
2899						"(error %d).", err);
2900			goto bmp_err;
2901		}
2902		/*
2903		 * Insert the index bitmap attribute as a resident attribute
2904		 * with a value length sufficient to cover the number of
2905		 * allocated index blocks rounded up to a multiple of 8 bytes
2906		 * which are initialized to zero.  We then set the first bit in
2907		 * the bitmap to reflect the fact that the first index
2908		 * allocation block is in use.
2909		 */
2910		err = ntfs_resident_attr_record_insert(base_ni, actx,
2911				AT_BITMAP, idx_ni->name, idx_ni->name_len,
2912				NULL, (((idx_ni->allocated_size >>
2913				idx_ni->block_size_shift) + 63) >> 3) &
2914				~(s64)7);
2915		if (err || actx->is_error) {
2916			if (!err)
2917				err = actx->error;
2918			ntfs_error(vol->mp, "Failed to %s mft_no 0x%llx "
2919					"(error %d).", actx->is_error ?
2920					"remap extent mft record of" :
2921					"insert index bitmap attribute in",
2922					(unsigned long long)base_ni->mft_no,
2923					err);
2924			goto bmp_err;
2925		}
2926		*((u8*)actx->a + le16_to_cpu(actx->a->value_offset)) = 1;
2927		/*
2928		 * Ensure the created index bitmap attribute is written to
2929		 * disk.
2930		 */
2931		NInoSetMrecNeedsDirtying(actx->ni);
2932		/*
2933		 * We successfully created the index allocation and bitmap
2934		 * attributes thus we can set up our cache of the last set bit
2935		 * in the bitmap in the index inode to 0 to reflect the fact
2936		 * that we just allocated the first index block.
2937		 */
2938		idx_ni->last_set_bit = 0;
2939	}
2940	/*
2941	 * We are either completely done and can release the attribute search
2942	 * context and the mft record or we now need to call functions which
2943	 * will deadlock with us if we do not release the attribute search
2944	 * context and the mft record so we have to release them now thus we
2945	 * now unlock the index root context.
2946	 */
2947	ntfs_index_ctx_unlock(ir_ictx);
2948	if (upl) {
2949		INDEX_ENTRY **entries;
2950		long delta;
2951		unsigned i;
2952
2953update_ie_pointers:
2954		/*
2955		 * Note we still hold the locked and referenced index
2956		 * allocation page which is now attached to the index context
2957		 * and will be released later when the index context is
2958		 * released.
2959		 *
2960		 * We need to update the index entry pointers in the array to
2961		 * point into the index block instead of the old index root.
2962		 */
2963		entries = ictx->entries;
2964		delta = (u8*)ictx->entry - (u8*)entries[ictx->entry_nr];
2965		for (i = 0; i < ictx->nr_entries; i++)
2966			entries[i] = (INDEX_ENTRY*)((u8*)entries[i] + delta);
2967		if (ictx->entry != entries[ictx->entry_nr])
2968			panic("%s(): ictx->entry != entries[ictx->entry_nr]\n",
2969					__FUNCTION__);
2970		ntfs_debug("Done.");
2971		return 0;
2972	}
2973	/*
2974	 * We have an index allocation attribute already thus we need to
2975	 * allocate an index block from it for us to transfer the temporary
2976	 * index block to.
2977	 *
2978	 * We need to set @is_locked to zero because we are using the temporary
2979	 * buffer for @ia and hence @upl and @pl are zero thus as far as the
2980	 * index context code is concerned the context is not actually locked
2981	 * at all.  Not doing this can lead to a panic() in
2982	 * ntfs_index_block_alloc() under some circumstances due to an
2983	 * assertion check that verifies that @is_locked is zero in
2984	 * ntfs_attr_resize().
2985	 */
2986	ictx->is_locked = 0;
2987	err = ntfs_index_block_alloc(ictx, &vcn, &ia, &ictx->upl_ofs, &upl,
2988			&ictx->pl, &ictx->addr);
2989	if (err) {
2990		ntfs_error(vol->mp, "Failed to allocate index allocation "
2991				"block (error %d).", err);
2992		goto alloc_err;
2993	}
2994	/* Copy the update sequence number into our temporary buffer. */
2995	*(le16*)((u8*)ictx->ia + le16_to_cpu(ictx->ia->usa_ofs)) =
2996			*(le16*)((u8*)ia + le16_to_cpu(ia->usa_ofs));
2997	/*
2998	 * Copy our temporary buffer into the allocated index block, free the
2999	 * temporary buffer and setup the index context to point to the
3000	 * allocated index block instead of the temporary one.
3001	 */
3002	memcpy(ia, ictx->ia, idx_ni->block_size);
3003	OSFree(ictx->ia, idx_ni->block_size, ntfs_malloc_tag);
3004	ictx->entry = ie = (INDEX_ENTRY*)((u8*)ia +
3005			((u8*)ictx->entry - (u8*)ictx->ia));
3006	ictx->ia = ia;
3007	ictx->index = &ia->index;
3008	ictx->upl = upl;
3009	ictx->is_locked = 1;
3010	/*
3011	 * If the vcn of the allocated index block is not zero, we need to
3012	 * update the vcn in the index block itself as well as the sub-node
3013	 * vcn pointer in the index root attribute.
3014	 */
3015	if (vcn) {
3016		ictx->vcn = vcn;
3017		ia->index_block_vcn = cpu_to_sle64(vcn);
3018		ictx->upl_ofs = (vcn << idx_ni->vcn_size_shift) &
3019				~PAGE_MASK_64;
3020		/* Get hold of the mft record for the index inode. */
3021		err = ntfs_mft_record_map(base_ni, &m);
3022		if (err) {
3023			/*
3024			 * The only thing that is now incorrect is the vcn
3025			 * sub-node pointer in the empty index root attribute
3026			 * but we cannot correct it as we are failing to map
3027			 * the mft record which we need to be able to rollback.
3028			 * We leave it to chkdsk to sort this out later.
3029			 */
3030			ntfs_error(vol->mp, "Cannot rollback partial index "
3031					"upgrade because "
3032					"ntfs_mft_record_map() failed (error "
3033					"%d).  Leaving inconsistent "
3034					"metadata.  Run chkdsk.", err);
3035			goto map_err;
3036		}
3037		actx = ntfs_attr_search_ctx_get(base_ni, m);
3038		if (!actx) {
3039			err = ENOMEM;
3040			ntfs_error(vol->mp, "Cannot rollback partial index "
3041					"upgrade because there was not enough "
3042					"memory to obtain an attribute search "
3043					"context.  Leaving inconsistent "
3044					"metadata.  Run chkdsk.");
3045			goto actx_err;
3046		}
3047		/* Find the index root attribute in the mft record. */
3048		err = ntfs_attr_lookup(AT_INDEX_ROOT, idx_ni->name,
3049				idx_ni->name_len, 0, NULL, 0, actx);
3050		if (err)
3051			goto lookup_err;
3052		/* Get to the index root value. */
3053		ir = (INDEX_ROOT*)((u8*)actx->a +
3054				le16_to_cpu(actx->a->value_offset));
3055		/* The first index entry which is also the last one. */
3056		ie = (INDEX_ENTRY*)((u8*)&ir->index +
3057				le32_to_cpu(ir->index.entries_offset));
3058		if (!(ie->flags & INDEX_ENTRY_END))
3059			panic("%s(): !(ie->flags & INDEX_ENTRY_END)\n",
3060					__FUNCTION__);
3061		if (!(ie->flags & INDEX_ENTRY_NODE))
3062			panic("%s(): !(ie->flags & INDEX_ENTRY_NODE)\n",
3063					__FUNCTION__);
3064		/* Finally, update the vcn pointer of the index entry. */
3065		*(leVCN*)((u8*)ie + le16_to_cpu(ie->length) - sizeof(VCN)) =
3066				cpu_to_sle64(vcn);
3067		/*
3068		 * Ensure the updated index entry is written to disk and
3069		 * release the attribute search context and the mft record.
3070		 */
3071		NInoSetMrecNeedsDirtying(actx->ni);
3072		ntfs_attr_search_ctx_put(actx);
3073		ntfs_mft_record_unmap(base_ni);
3074	}
3075	/*
3076	 * The page is now done, mark the index context as dirty so it gets
3077	 * written out later.
3078	 */
3079	ntfs_index_entry_mark_dirty(ictx);
3080	goto update_ie_pointers;
3081lookup_err:
3082	ntfs_error(vol->mp, "Cannot rollback partial index upgrade because we "
3083			"failed to look up the index root attribute (error "
3084			"%d).  Leaving inconsistent metadata.  Run chkdsk.",
3085			err);
3086	if (err == ENOENT)
3087		err = EIO;
3088	ntfs_attr_search_ctx_put(actx);
3089actx_err:
3090	ntfs_mft_record_unmap(base_ni);
3091map_err:
3092	/*
3093	 * The page is now done, mark the index context as dirty so it gets
3094	 * written out later.
3095	 */
3096	ntfs_index_entry_mark_dirty(ictx);
3097	goto err;
3098alloc_err:
3099	/*
3100	 * We need to get hold of the mft record and get a new search context
3101	 * so we can restore the index root attribute.
3102	 */
3103	err2 = ntfs_mft_record_map(base_ni, &m);
3104	if (err2) {
3105		ntfs_error(vol->mp, "Cannot rollback partial index upgrade "
3106				"because ntfs_mft_record_map() failed (error "
3107				"%d).  Leaving inconsistent metadata.  Run "
3108				"chkdsk.", err2);
3109undo_alloc_err2:
3110		/*
3111		 * Mark the index context invalid given it neither has an index
3112		 * block and page nor an index root and mft record attached.
3113		 */
3114		ictx->entry = NULL;
3115		goto undo_alloc_err;
3116	}
3117	actx = ntfs_attr_search_ctx_get(base_ni, m);
3118	if (!actx) {
3119		ntfs_error(vol->mp, "Cannot rollback partial index upgrade "
3120				"because there was not enough memory to "
3121				"obtain an attribute search context.  Leaving "
3122				"inconsistent metadata.  Run chkdsk.");
3123		ntfs_mft_record_unmap(base_ni);
3124		goto undo_alloc_err2;
3125	}
3126	goto restore_ir;
3127bmp_err:
3128	ntfs_attr_search_ctx_reinit(actx);
3129	err2 = ntfs_attr_lookup(AT_INDEX_ALLOCATION, idx_ni->name,
3130			idx_ni->name_len, 0, NULL, 0, actx);
3131	if (err2) {
3132		ntfs_error(vol->mp, "Cannot rollback partial index upgrade "
3133				"because looking up the index allocation "
3134				"attribute failed (error %d).  Leaving "
3135				"inconsistent metadata.  Run chkdsk.", err2);
3136bmp_err_err:
3137		NVolSetErrors(vol);
3138		/*
3139		 * Everything is actually consistent except for the fact that
3140		 * the index bitmap attribute is missing so it is better if we
3141		 * do not do any kind of rollback.  Hopefully, chkdsk will fix
3142		 * this by adding the missing index bitmap attribute.
3143		 */
3144		goto err;
3145	}
3146	/* Remove the added index allocation attribute. */
3147	err2 = ntfs_attr_record_delete(base_ni, actx);
3148	if (err2) {
3149		ntfs_error(vol->mp, "Cannot rollback partial index upgrade "
3150				"because deleting the index allocation "
3151				"attribute failed (error %d).  Leaving "
3152				"inconsistent metadata.  Run chkdsk.", err2);
3153		goto bmp_err_err;
3154	}
3155ia_err:
3156	/* Reset the inode. */
3157	lck_spin_lock(&idx_ni->size_lock);
3158	idx_ni->initialized_size = idx_ni->data_size = idx_ni->allocated_size =
3159			0;
3160	lck_spin_unlock(&idx_ni->size_lock);
3161	if (idx_ni->name == I30) {
3162		lck_spin_lock(&base_ni->size_lock);
3163		base_ni->initialized_size = base_ni->data_size =
3164				base_ni->allocated_size = 0;
3165		lck_spin_unlock(&base_ni->size_lock);
3166	}
3167	NInoClearIndexAllocPresent(idx_ni);
3168	/*
3169	 * Cannot call ubc_setsize() because we have the page pinned so delay
3170	 * until we dump the page later on.
3171	 */
3172	need_ubc_setsize = TRUE;
3173	/*
3174	 * Pretend the @ir_ictx is not locked as we are going to transfer the
3175	 * index root back to @ictx.
3176	 */
3177	ir_ictx->is_locked = 0;
3178	ir_ictx->actx = NULL;
3179	/* Restore the index root attribute. */
3180	ntfs_attr_search_ctx_reinit(actx);
3181restore_ir:
3182	err2 = ntfs_attr_lookup(AT_INDEX_ROOT, idx_ni->name, idx_ni->name_len,
3183			0, NULL, 0, actx);
3184	if (err2) {
3185		ntfs_error(vol->mp, "Cannot rollback partial index upgrade "
3186				"because looking up the index root attribute "
3187				"failed (error %d).  Leaving inconsistent "
3188				"metadata.  Run chkdsk.", err2);
3189		NVolSetErrors(vol);
3190		goto ictx_err;
3191	}
3192	ir = (INDEX_ROOT*)((u8*)actx->a + le16_to_cpu(actx->a->value_offset));
3193	ih = &ir->index;
3194	ie = (INDEX_ENTRY*)((u8*)ih + ir_ie_ofs);
3195	u = ir_ie_ofs + (le32_to_cpu(ia->index.index_length) - ia_ie_ofs);
3196	err2 = ntfs_resident_attr_value_resize(actx->m, actx->a,
3197			offsetof(INDEX_ROOT, index) + u);
3198	if (err2) {
3199		ntfs_error(vol->mp, "Cannot rollback partial index upgrade "
3200				"because resizing the index root attribute "
3201				"failed (error %d).  Leaving inconsistent "
3202				"metadata.  Run chkdsk.", err2);
3203		NVolSetErrors(vol);
3204		goto ictx_err;
3205	}
3206	if (!old.is_large_index)
3207		ih->flags &= ~LARGE_INDEX;
3208	ih->allocated_size = ih->index_length = cpu_to_le32(u);
3209	memcpy(ie, (u8*)&ia->index + ia_ie_ofs,
3210			le32_to_cpu(ia->index.index_length) - ia_ie_ofs);
3211	/* Ensure the restored index root attribute is written to disk. */
3212	NInoSetMrecNeedsDirtying(actx->ni);
3213ictx_err:
3214	/*
3215	 * Reset the index context.  We may be setting @ictx->entry to a bogus
3216	 * value but it does not matter because we are returning an error code
3217	 * thus the caller must not use the index context and while the value
3218	 * may be bogus it is correctly non-NULL thus the index context will be
3219	 * cleaned up correctly when the caller releases it.
3220	 */
3221	ictx->entry = ictx->entries[ictx->entry_nr];
3222	ictx->is_root = 1;
3223	ictx->is_locked = 1;
3224	ictx->ir = ir;
3225	ictx->index = ih;
3226	ictx->actx = actx;
3227	if (!actx->is_error)
3228		ir_ictx->bytes_free = le32_to_cpu(actx->m->bytes_allocated) -
3229				le32_to_cpu(actx->m->bytes_in_use);
3230	if (!upl) {
3231undo_alloc_err:
3232		OSFree(ia, idx_ni->block_size, ntfs_malloc_tag);
3233	} else {
3234		/* Destroy the page. */
3235		ntfs_page_dump(idx_ni, upl, pl);
3236		if (need_ubc_setsize && !ubc_setsize(idx_ni->vn, 0))
3237			panic("%s(): ubc_setsize() failed.\n", __FUNCTION__);
3238page_err:
3239		err2 = ntfs_cluster_free_from_rl(vol, idx_ni->rl.rl, 0, -1,
3240				NULL);
3241		if (err2) {
3242			ntfs_error(vol->mp, "Failed to rollback cluster "
3243					"allocation (error %d).  Run chkdsk "
3244					"to recover the lost space.", err2);
3245			NVolSetErrors(vol);
3246		}
3247		err2 = ntfs_rl_truncate_nolock(vol, &idx_ni->rl, 0);
3248		if (err2)
3249			panic("%s(): err2\n", __FUNCTION__);
3250	}
3251err:
3252	/*
3253	 * Dissociate the allocated index root context from the tree path and
3254	 * throw it away.  We do this here unconditionally as it works both for
3255	 * the case where we have not attached it to the tree path yet and for
3256	 * the case where we have already attached it to the tree path.  We
3257	 * have to deal with the former case here so cannot defer the cleanup
3258	 * to the ntfs_index_ctx_put() of the index context @ictx that will be
3259	 * done by the caller.
3260	 */
3261	ntfs_index_ctx_disconnect_reinit(ir_ictx);
3262	ntfs_index_ctx_put(ir_ictx);
3263	ntfs_error(vol->mp, "Failed (error %d).", err);
3264	return err;
3265}
3266
3267/**
3268 * ntfs_index_root_make_space - make space for an index entry in the index root
3269 * @ictx:	index entry in front of which to make space
3270 * @ie_size:	size of the index entry to make space for
3271 *
3272 * Return 0 on success and ENOSPC if there is not enough space in the mft
3273 * record to insert an index entry of size @ie_size in the index root
3274 * attribute.
3275 *
3276 * Note that we do not update the array of index entry pointers nor the number
3277 * of entries in the array because on success it means that the index entry
3278 * being added will be created and the function is then done, i.e. the array of
3279 * index entry pointers will not be used any more and on error we have not done
3280 * anything at all so there is no need to update any of the pointers.
3281 *
3282 * Locking: - Caller must hold @ictx->idx_ni->lock on the index inode for
3283 *	      writing.
3284 *	    - The index context @ictx must be locked.
3285 */
3286static errno_t ntfs_index_root_make_space(ntfs_index_context *ictx,
3287		const u32 ie_size)
3288{
3289	MFT_RECORD *m = ictx->actx->m;
3290	const u32 muse = le32_to_cpu(m->bytes_in_use);
3291	const u32 new_muse = muse + ie_size;
3292
3293	ntfs_debug("Entering.");
3294	if (new_muse <= le32_to_cpu(m->bytes_allocated)) {
3295		INDEX_ENTRY *ie = ictx->entry;
3296		ATTR_RECORD *a;
3297		INDEX_HEADER *ih;
3298
3299		/*
3300		 * Extend the index root attribute so it has enough space for
3301		 * the new index entry.  As an optimization we combine the
3302		 * resizing of the index root attribute and the moving of index
3303		 * entries within the attribute into a single operation.
3304		 */
3305		memmove((u8*)ie + ie_size, ie, muse - ((u8*)ie - (u8*)m));
3306		/* Adjust the mft record to reflect the change in used space. */
3307		m->bytes_in_use = cpu_to_le32(new_muse);
3308		/*
3309		 * Adjust the attribute record to reflect the changes in the
3310		 * size of the attribute record and in the size of the
3311		 * attribute value.
3312		 */
3313		a = ictx->actx->a;
3314		a->length = cpu_to_le32(le32_to_cpu(a->length) + ie_size);
3315		a->value_length = cpu_to_le32(le32_to_cpu(a->value_length) +
3316				ie_size);
3317		/* Adjust the index header to reflect the change in length. */
3318		ih = ictx->index;
3319		ih->allocated_size = ih->index_length = cpu_to_le32(
3320				le32_to_cpu(ih->index_length) + ie_size);
3321		ntfs_debug("Done.");
3322		return 0;
3323	}
3324	ntfs_debug("Failed (not enough space in mft record to insert index "
3325			"entry into index root attribute).");
3326	return ENOSPC;
3327}
3328
3329/**
3330 * ntfs_index_block_make_space - make space for an index entry in an index block
3331 * @ictx:	index entry in front of which to make space
3332 * @ie_size:	size of the index entry to make space for
3333 *
3334 * Return 0 on success and ENOSPC if there is not enough space in the index
3335 * block to insert an index entry of size @ie_size.
3336 *
3337 * Note that we do not update the array of index entry pointers nor the number
3338 * of entries in the array because on success it means that the index entry
3339 * being added will be created and the function is then done, i.e. the array of
3340 * index entry pointers will not be used any more and on error we have not done
3341 * anything at all so there is no need to update any of the pointers.
3342 *
3343 * Locking: - Caller must hold @ictx->idx_ni->lock on the index inode for
3344 *	      writing.
3345 *	    - The index context @ictx must be locked.
3346 */
3347static errno_t ntfs_index_block_make_space(ntfs_index_context *ictx,
3348		const u32 ie_size)
3349{
3350	INDEX_HEADER *ih = ictx->index;
3351	const u32 ilen = le32_to_cpu(ih->index_length);
3352	const u32 new_ilen = ilen + ie_size;
3353
3354	ntfs_debug("Entering.");
3355	if (new_ilen <= le32_to_cpu(ih->allocated_size)) {
3356		INDEX_ENTRY *ie = ictx->entry;
3357
3358		/* Move the index entries to make space for the new entry. */
3359		memmove((u8*)ie + ie_size, ie, ilen - ((u8*)ie - (u8*)ih));
3360		/* Adjust the index header to reflect the change in length. */
3361		ih->index_length = cpu_to_le32(new_ilen);
3362		ntfs_debug("Done.");
3363		return 0;
3364	}
3365	ntfs_debug("Failed (not enough space in index block to insert index "
3366			"entry).");
3367	return ENOSPC;
3368}
3369
3370/**
3371 * ntfs_index_node_make_space - make space for an index entry in an index node
3372 * @ictx:	index entry in front of which to make space
3373 * @ie_size:	size of the index entry to make space for
3374 *
3375 * Return 0 on success and ENOSPC if there is not enough space in the index
3376 * node to insert an index entry of size @ie_size.
3377 *
3378 * Note that we do not update the array of index entry pointers nor the number
3379 * of entries in the array because on success it means that the index entry
3380 * being added will be created and the function is then done, i.e. the array of
3381 * index entry pointers will not be used any more and on error we have not done
3382 * anything at all so there is no need to update any of the pointers.
3383 *
3384 * Locking: - Caller must hold @ictx->idx_ni->lock on the index inode for
3385 *	      writing.
3386 *	    - The index context @ictx must be locked.
3387 */
3388static errno_t ntfs_index_node_make_space(ntfs_index_context *ictx,
3389		const u32 ie_size)
3390{
3391	errno_t err;
3392
3393	if (ictx->is_root)
3394		err = ntfs_index_root_make_space(ictx, ie_size);
3395	else
3396		err = ntfs_index_block_make_space(ictx, ie_size);
3397	return err;
3398}
3399
3400/**
3401 * ntfs_index_ctx_revalidate - revalidate the pointers in an index context
3402 * @new:	index context with the new mapped virtual address
3403 * @old:	index context to revalidate
3404 *
3405 * Revalidate all pointers in the index context @old and adjust them for the
3406 * changed mapped virtual address in the index context @new.
3407 *
3408 * Note both @new and @old must be in the same VM page.
3409 *
3410 * We need to revalidate all the pointers in @old just as if we were locking it
3411 * because @new may have been mapped into a different virtual address than @old
3412 * was last time thus the pointers in @old may be stale.
3413 *
 * Whether revalidation is needed is determined by comparing @old->addr with
 * @new->addr.  If they match all is ok, and if not we need to revalidate
 * @old using the new address @new->addr.
3417 *
3418 * Locking: - Caller must hold @new->idx_ni->lock on the index inode.
3419 *	    - The index context @old must be locked.
3420 *	    - The index context @new must not be locked.
3421 */
3422static void ntfs_index_ctx_revalidate(ntfs_index_context *new,
3423		ntfs_index_context *old)
3424{
3425	long delta;
3426	INDEX_ENTRY **entries;
3427	unsigned u;
3428
3429	if (!new->is_locked)
3430		panic("%s(): !new->is_locked\n", __FUNCTION__);
3431	if (new->is_root)
3432		panic("%s(): new->is_root\n", __FUNCTION__);
3433	if (old->is_locked)
3434		panic("%s(): old->is_locked\n", __FUNCTION__);
3435	if (old->is_root)
3436		panic("%s(): old->is_root\n", __FUNCTION__);
3437	delta = new->addr - old->addr;
3438	if (!delta)
3439		return;
3440	/* Revalidation is needed. */
3441	old->entry = (INDEX_ENTRY*)((u8*)old->entry + delta);
3442	old->ia = (INDEX_ALLOCATION*)((u8*)old->ia + delta);
3443	old->index = &old->ia->index;
3444	old->addr = new->addr;
3445	entries = old->entries;
3446	for (u = 0; u < old->nr_entries; u++)
3447		entries[u] = (INDEX_ENTRY*)((u8*)entries[u] + delta);
3448}
3449
3450/**
3451 * ntfs_index_ctx_lock_two - lock two index contexts in a deadlock-free fashion
3452 * @a:	first index context to lock (this may be locked)
3453 * @b:	second index context to lock (this must not be locked)
3454 *
3455 * Lock the two index contexts @a and @b.  @a may already be locked and it may
3456 * need to be unlocked if it has to be locked after @b is locked.
3457 *
3458 * The following lock ordering rules are applied:
3459 *
3460 * - An index block node must be locked before an index root node.
3461 *
3462 * - Two index block nodes must be locked in ascending page offset order.
3463 *
3464 * - Two index block nodes that are physically in the same VM page must share
3465 *   the lock.  The way this is implemented is that @a is really locked and @b
3466 *   is not actually locked but all its pointers are revalidated as if it were
3467 *   locked.  This is ok because @a is locked and therefore the VM page is
3468 *   mapped, locked, and pinned and will not go anywhere until we are done.
3469 *   The reason we need to do the revalidation is because @b when it was mapped
3470 *   previously may have been mapped at a different virtual address than the
3471 *   one @a is mapped at now.  This means that when you are unlocking @b you
3472 *   must check if @b is locked and only unlock it if so.
3473 *
3474 * Return 0 on success or errno on error.  On error @a and @b may be both left
3475 * unlocked or one can be left locked and one unlocked.
3476 */
static errno_t ntfs_index_ctx_lock_two(ntfs_index_context *a,
		ntfs_index_context *b)
{
	errno_t err;

	/* The caller must pass @b unlocked (see the lock ordering rules). */
	if (b->is_locked)
		panic("%s(): b->is_locked\n", __FUNCTION__);
	if (a->is_root) {
		/*
		 * @a is the index root so it has to be locked second.
		 *
		 * Unlock @a if it is already locked.
		 */
		if (a->is_locked)
			ntfs_index_ctx_unlock(a);
	} else if (b->is_root) {
		/* @b is the index root.  If @a is not locked, lock it. */
		if (!a->is_locked) {
			err = ntfs_index_ctx_relock(a);
			if (err)
				return err;
		}
	} else {
		/*
		 * Both @a and @b are index blocks.
		 *
		 * Do we need to share the lock because both index nodes are in
		 * the same VM page?
		 */
		if (a->upl_ofs == b->upl_ofs) {
			if (!a->is_locked) {
				err = ntfs_index_ctx_relock(a);
				if (err)
					return err;
			}
			/*
			 * Share @a's lock with @b: @b is left unlocked but
			 * all its pointers are rebased to @a's mapping.
			 */
			ntfs_index_ctx_revalidate(a, b);
			return 0;
		}
		if (a->is_locked) {
			/* Do we have to unlock @a before locking @b? */
			if (a->upl_ofs > b->upl_ofs)
				ntfs_index_ctx_unlock(a);
		} else {
			/* Do we need to lock @a first? */
			if (a->upl_ofs < b->upl_ofs) {
				err = ntfs_index_ctx_relock(a);
				if (err)
					return err;
			}
		}
	}
	/* Lock @b. */
	err = ntfs_index_ctx_relock(b);
	/* If @a is currently locked or there was an error we are done. */
	if (a->is_locked || err)
		return err;
	/*
	 * We unlocked @a so we could lock @b or @a was not locked.
	 *
	 * Lock @a and we are done.
	 */
	return ntfs_index_ctx_relock(a);
}
3540
3541/**
3542 * ntfs_index_entry_add_or_node_split - add a key to an index
3543 * @ictx:	index context specifying the node to split/position to add at
3544 * @split_only:	if true do not insert, only split the index node
3545 * @entry_size:	size of the entry that will be added after the split
3546 * @key:	key to add to the directory or view index
3547 * @key_len:	size of key @key in bytes
3548 * @data:	data to associate with the key @key if a view index
3549 * @data_len:	size of data @data in bytes if a view index
3550 *
3551 * If @split_only is true, split the index node pointed to by @ictx.  @ictx
3552 * also points to the entry in the index node at which an entry will be
3553 * inserted after the split is completed.  @entry_size is the size of the entry
3554 * that will be added at that position.  These are used so that the split is
3555 * performed with consideration with the insertion that is likely to come.  And
3556 * if the insertion does not happen it does not matter much, it just means the
3557 * split was not quite on the median entry but off by one.
3558 *
3559 * In this case @key, @key_len, @data, and @data_len are ignored.
3560 *
3561 * If @split_only is false @entry_size is ignored and @key, @key_len, @data,
3562 * and @data_len are used.
3563 *
3564 * In this case, if @ictx belongs to a directory index, insert the filename
3565 * attribute @key of length @key_len bytes in the directory index at the
3566 * position specified by the index context @ictx and point the inserted index
3567 * entry at the mft reference *@data which is the mft reference of the inode to
 * which the filename @key belongs.  @data_len must be zero in this case.
3569 *
3570 * If @ictx belongs to a view index, insert the key @key of length @key_len
3571 * bytes in the view index at the position specified by the index context @ictx
3572 * and associate the data @data of size @data_len bytes with the key @key.
3573 *
3574 * Return 0 on success and errno on error.  On error the index context is no
3575 * longer usable and must be released or reinitialized.
3576 *
3577 * Note that we do not update the array of index entry pointers nor the number
3578 * of entries in the array because on success it means that the index node has
3579 * been split and the function is done, i.e. the array of index entry pointers
3580 * will not be used any more and on error the index context becomes invalid so
3581 * there is no need to update any of the pointers.  The caller is expected to
3582 * restart its operations by doing a new index lookup if it wants to continue
3583 * working on the index for that reason.
3584 *
3585 * Locking: - Caller must hold @ictx->idx_ni->lock on the index inode for
3586 *	      writing.
3587 *	    - The index context @ictx must be locked.
3588 */
3589errno_t ntfs_index_entry_add_or_node_split(ntfs_index_context *ictx,
3590		const BOOL split_only, u32 entry_size, const void *key,
3591		const u32 key_len, const void *data, const u32 data_len)
3592{
3593	VCN vcn = 0;
3594	ntfs_index_context *cur_ictx, *child_ictx;
3595	ntfs_inode *bmp_ni, *idx_ni = ictx->idx_ni;
3596	u32 data_ofs = 0;
3597	errno_t err, err2;
3598	const BOOL is_view = (idx_ni->name != I30);
3599
3600	ntfs_debug("Entering.");
3601	if (!ictx->is_locked)
3602		panic("%s(): !ictx->is_locked\n", __FUNCTION__);
3603	/*
3604	 * If this is only a split @entry_size contains the size of the entry
3605	 * and the @key and @data are not set (and not needed/used).
3606	 *
3607	 * If this is an addition, @entry_size is not set and we need to work
3608	 * it out from the supplied @key and @data.
3609	 */
3610	if (!split_only) {
3611		/*
3612		 * Calculate the size for the index entry to be added.  For a
3613		 * directory index, the mft reference @data is embedded inside
3614		 * the directory index entry thus we only need space for the
3615		 * index entry header and the filename attribute @key.
3616		 *
3617		 * For a view index, the data follows the key aligned to a
3618		 * 4-byte boundary.
3619		 *
3620		 * As an additional complication, the $SDH index in the $Secure
3621		 * system file contains an extra magic which is not counted in
3622		 * the data length @data_len so we need to add it by hand here.
3623		 */
3624		entry_size = sizeof(INDEX_ENTRY_HEADER) + key_len;
3625		if (is_view) {
3626			data_ofs = (entry_size + 4) & ~4;
3627			entry_size = data_ofs + data_len;
3628			if (idx_ni == idx_ni->vol->secure_sdh_ni)
3629				entry_size += sizeof(((SDH_INDEX_DATA*)NULL)->
3630						magic);
3631		}
3632		/*
3633		 * Align the index entry size to an 8-byte boundary and add
3634		 * another 8 bytes to the entry size if the insertion is to
3635		 * happen in an index node.
3636		 */
3637		entry_size = ((entry_size + 7) & ~7) +
3638				((ictx->index->flags & INDEX_NODE) ?
3639				sizeof(leVCN): 0);
3640	}
3641	/* Set the current entry to be the entry to be added to the index. */
3642	cur_ictx = ictx;
3643	cur_ictx->promote_inserted = 0;
3644	cur_ictx->insert_to_add = 1;
3645	cur_ictx->insert_entry_size = entry_size;
3646	/*
3647	 * We need to loop over each index node on the tree path starting at
3648	 * the bottom.  For each traversed node, we check if the current entry
3649	 * to be inserted fits and if it does not we need to split that node.
3650	 * We make the arrangements to be able to do so and then move onto the
3651	 * next node up the tree and so on until we find a node which does have
3652	 * enough space to fit the index entry to be inserted.  We can then
3653	 * break out of the loop and begin work on the actual addition of the
3654	 * entry and the splitting of the so marked nodes.
3655	 */
3656	while (cur_ictx->insert_entry_size > cur_ictx->bytes_free) {
3657		ntfs_index_context *insert_ictx;
3658		unsigned median_entry_nr, insert_entry_nr, entry_nr;
3659		u32 insert_entry_size;
3660		BOOL insert_to_add;
3661
3662		/*
3663		 * The entry to be inserted into this node @cur_ictx is
3664		 * specified by its @insert_entry and @insert_entry_size.
3665		 */
3666		if (!cur_ictx->is_locked)
3667			panic("%s(): !cur_ictx->is_locked\n", __FUNCTION__);
3668		/*
3669		 * We do not have enough space in the current node to insert
3670		 * the entry to be inserted.
3671		 *
3672		 * If the current node is the index root we need to move the
3673		 * index root to an index block and if successful restart the
3674		 * loop to determine if we now have enough space to insert the
3675		 * entry.
3676		 *
3677		 * This causes the B+tree to grow in height by one and is in
3678		 * fact the only operation that can cause the tree to grow in
3679		 * height.  All other operations can only cause the tree to
3680		 * grow in width.
3681		 */
3682		if (cur_ictx->is_root) {
3683			ntfs_debug("Moving index root into index allocation "
3684					"block.");
3685			/*
3686			 * If we speculatively locked the index node containing
3687			 * the index entry to insert into the index root,
3688			 * unlock it now so we can move the index root to an
3689			 * index block.
3690			 */
3691			insert_ictx = cur_ictx->insert_ictx;
3692			if (insert_ictx && insert_ictx->is_locked) {
3693				if (insert_ictx->is_root)
3694					panic("%s(): insert_ictx->is_root\n",
3695							__FUNCTION__);
3696				ntfs_index_ctx_unlock(insert_ictx);
3697			}
3698			err = ntfs_index_move_root_to_allocation_block(
3699					cur_ictx);
3700			if (!err)
3701				continue;
3702			ntfs_error(idx_ni->vol->mp, "Failed to move index "
3703					"root to index allocation block "
3704					"(error %d).", err);
3705			goto err;
3706		}
3707		ntfs_debug("Need to split index block with VCN 0x%llx.",
3708				(unsigned long long)cur_ictx->vcn);
3709		/*
3710		 * We do not have enough space in the current node which we now
3711		 * know is an index allocation block.  We have to split the
3712		 * node and promote the median entry into the parent node which
3713		 * may in turn involve a split.  We do not perform the split
3714		 * but instead work out what needs to be done and allocate any
3715		 * needed index blocks in advance so we cannot run out of disk
3716		 * space and/or memory half-way through and only then do we do
3717		 * the actual work on the index nodes.  This preparation
3718		 * involves the following steps:
3719		 *
3720		 * 1. Work out the median entry to be promoted into the parent
3721		 *    node of the current node and unlock the current node.
3722		 * 2. If the entry to be promoted is not the entry to be
3723		 *    inserted, make a note of the fact that the entry to be
3724		 *    inserted is to be inserted into the current node in the
3725		 *    index context.
3726		 * 3. Allocate a new index block and make a note of it in the
3727		 *    current index context.
3728		 * 4. Set the parent node as the current node.
3729		 * 5. Set the median entry up as the current entry to be
3730		 *    inserted.
3731		 *
3732		 * Then go round the loop again checking whether there is
3733		 * enough space to insert the now current entry into the now
3734		 * current node...
3735		 *
3736		 * First of all, mark the node as needing to be split.
3737		 */
3738		cur_ictx->split_node = 1;
3739		/*
3740		 * 1. Determine the median index entry.
3741		 *
3742		 * Note we exclude the end entry from the median calculation
3743		 * because it is not eligible for promotion thus we should use
3744		 * @ictx->nr_entries - 1 but we need to take into account the
3745		 * not yet inserted entry for which we are doing the split in
3746		 * the first place so that is a + 1 and we also start counting
3747		 * entries at 0 and not 1 thus we apply a - 1 for that.  Thus
3748		 * we have - 1 + 1 - 1 = -1.
3749		 */
3750		median_entry_nr = (cur_ictx->nr_entries - 1) / 2;
3751		/*
3752		 * @entry_nr is the index into the array of index entry
3753		 * pointers of the index entry in front of which the entry to
3754		 * be inserted @cur_ictx->insert_entry needs to be inserted.
3755		 */
3756		entry_nr = cur_ictx->entry_nr;
3757		/*
3758		 * If the position at which to insert the entry to be inserted
3759		 * is the median promote the entry to be inserted.  If the
3760		 * number of entries is even there are two possible medians.
3761		 * We choose the first one for simplicity.
3762		 *
3763		 * If the entry to be inserted is before the median, subtract 1
3764		 * from the median.
3765		 *
3766		 * Otherwise promote the median entry.
3767		 *
3768		 * The only exception is when @split_only is true and the
3769		 * current node is the node we are meant to split in which
3770		 * case there is no entry to be inserted and thus it cannot be
3771		 * promoted.  In this case we recalculate the median ignoring
3772		 * the entry to be inserted and promote that.
3773		 */
3774		if (entry_nr == median_entry_nr && (!split_only ||
3775				cur_ictx != ictx)) {
3776			insert_to_add = cur_ictx->insert_to_add;
3777			insert_entry_nr = cur_ictx->insert_entry_nr;
3778			insert_entry_size = cur_ictx->insert_entry_size;
3779			insert_ictx = cur_ictx->insert_ictx;
3780			/*
3781			 * 2. The entry to be promoted is the entry to be
3782			 *    inserted, make a note of that fact.
3783			 */
3784			cur_ictx->promote_inserted = 1;
3785		} else {
3786			if (entry_nr < median_entry_nr)
3787				median_entry_nr--;
3788			else if (entry_nr == median_entry_nr) {
3789				if (!split_only)
3790					panic("%s(): !split_only\n",
3791							__FUNCTION__);
3792				if (cur_ictx != ictx)
3793					panic("%s(): cur_ictx != ictx\n",
3794							__FUNCTION__);
3795				/*
3796				 * We must have at least one real entry or
3797				 * there is nothing to promote.  This cannot
3798				 * happen unless something has gone wrong.
3799				 */
3800				if (cur_ictx->nr_entries < 2)
3801					panic("%s(): cur_ictx->nr_entries < "
3802							"2\n", __FUNCTION__);
3803				median_entry_nr = (cur_ictx->nr_entries - 2) /
3804						2;
3805			}
3806			insert_to_add = FALSE;
3807			insert_entry_nr = median_entry_nr;
3808			insert_entry_size = le16_to_cpu(cur_ictx->
3809					entries[median_entry_nr]->length);
3810			insert_ictx = cur_ictx;
3811		}
3812		/*
3813		 * If this is the very first promotion and we are promoting a
3814		 * leaf entry we need to add 8 bytes to the size of the entry
3815		 * being promoted to allow for the VCN sub-node pointer that
3816		 * will be added at the end of the entry when it is inserted
3817		 * into the parent index node.
3818		 */
3819		if (cur_ictx == ictx &&
3820				(!(cur_ictx->index->flags & INDEX_NODE) ?
3821				sizeof(leVCN): 0))
3822			insert_entry_size += sizeof(VCN);
3823		/*
3824		 * Record which entry is being promoted, i.e. where we need to
3825		 * perform the split of the node.
3826		 */
3827		cur_ictx->promote_entry_nr = median_entry_nr;
3828		// TODO: Possible optimization: Allow ntfs_index_block_alloc()
3829		// to do the unlocking and thus to consume the lock if the new
3830		// index block is in the same page as the current index block.
3831		if (!cur_ictx->is_locked)
3832			panic("%s(): !cur_ictx->is_locked\n", __FUNCTION__);
3833		ntfs_index_ctx_unlock(cur_ictx);
3834		/*
3835		 * 3. Allocate a new index block and make a note of it in the
3836		 *    current index context.
3837		 *
3838		 * The call may cause the index root attribute to have its
3839		 * entries moved out to an index block.  That is fine as we
3840		 * have not looked at any of our parent entries yet and the
3841		 * index root must be above us given we are a child node.
3842		 */
3843		err = ntfs_index_block_alloc(cur_ictx, &cur_ictx->right_vcn,
3844				&cur_ictx->right_ia, &cur_ictx->right_upl_ofs,
3845				&cur_ictx->right_upl, &cur_ictx->right_pl,
3846				&cur_ictx->right_addr);
3847		if (err) {
3848			ntfs_error(idx_ni->vol->mp, "Failed to allocate index "
3849					"allocation block (error %d).", err);
3850			goto err;
3851		}
3852		ntfs_debug("Allocated index block for right-hand node with "
3853				"VCN 0x%llx.", (unsigned long long)
3854				cur_ictx->right_vcn);
3855		/* We need to unmap the newly allocated index block. */
3856		ntfs_page_unmap(idx_ni, cur_ictx->right_upl,
3857				cur_ictx->right_pl, TRUE);
3858		cur_ictx->right_upl = NULL;
3859		/* 4. Set the parent node as the current node and lock it. */
3860		cur_ictx = cur_ictx->up;
3861		if (cur_ictx->is_locked)
3862			panic("%s(): cur_ictx->is_locked\n", __FUNCTION__);
3863		if (cur_ictx->is_root) {
3864			/*
3865			 * We have reached the index root.  We speculatively
3866			 * lock the index node containing the index entry to
3867			 * insert into the index root, as we cannot lock it
3868			 * once we have locked the mft record (which happens
3869			 * when we lock the current index context below) as
3870			 * that would cause lock reversal and thus deadlocks.
3871			 */
3872			if (insert_ictx) {
3873				if (insert_ictx->is_root)
3874					panic("%s(): insert_ictx->is_root\n",
3875							__FUNCTION__);
3876				if (insert_ictx->is_locked)
3877					panic("%s(): insert_ictx->is_locked\n",
3878							__FUNCTION__);
3879				err = ntfs_index_ctx_relock(insert_ictx);
3880				if (err)
3881					goto err;
3882			}
3883		}
3884		/* Lock the current index context. */
3885		err = ntfs_index_ctx_relock(cur_ictx);
3886		if (err)
3887			goto err;
3888		/*
3889		 * 5. Set the median entry up as the current entry to be
3890		 *    inserted.
3891		 */
3892		cur_ictx->insert_to_add = insert_to_add;
3893		cur_ictx->insert_entry_nr = insert_entry_nr;
3894		cur_ictx->insert_entry_size = insert_entry_size;
3895		cur_ictx->insert_ictx = insert_ictx;
3896	}
3897	/*
3898	 * Get the child node index context if we are not at the bottom of the
3899	 * tree already or rather at the node for which we were called (which
3900	 * may not actually be at the bottom of the tree).
3901	 */
3902	child_ictx = NULL;
3903	if (cur_ictx != ictx) {
3904		child_ictx = cur_ictx->down;
3905		if (child_ictx->is_root)
3906			panic("%s(): child_ictx->is_root\n", __FUNCTION__);
3907	}
3908	/*
3909	 * If the node we were called for was the index root then it will have
3910	 * been moved to an index allocation block which will likely have
3911	 * created enough space to fit the entry to be inserted thus if this is
3912	 * a split only call nothing further needs to be done.
3913	 */
3914	if (cur_ictx == ictx && split_only) {
3915		ntfs_debug("Done (index root was upgraded to index "
3916				"allocation, no further split needed).");
3917		return 0;
3918	}
3919	/*
3920	 * We now have prepared everything so we are guaranteed not to run out
3921	 * of disk space and can begin doing the work.
3922	 *
3923	 * TODO: We can still fail if relocking of an index context fails but
3924	 * that can really only happen under extreme memory pressure and there
3925	 * is not much we can do about that without <rdar://problem/4992358>
3926	 * being implemented.  Thus for now we simply bomb out with an error
3927	 * message and leave the index potentially badly corrupted and also we
3928	 * potentially leave some allocated index blocks actually unused.  This
3929	 * is highly unsatisfactory but rolling back once the modifications
3930	 * have begun is pretty much impossible without keeping a lot more
3931	 * state so we do not implement rollback.  Once the aforementioned
3932	 * radar is implemented we will no longer suffer from this problem and
3933	 * it will no longer be possible to fail once we get here.
3934	 *
3935	 * We start with the current node which is the top-most node we need to
3936	 * deal with.  We know from above it has enough space and does not need
3937	 * to be split.
3938	 */
3939	do {
3940		ntfs_index_context *insert_ictx;
3941		INDEX_ENTRY *entry, *right_entry, *end_entry;
3942		INDEX_HEADER *right_index;
3943		unsigned u;
3944
3945		if (!cur_ictx->is_locked)
3946			panic("%s(): !cur_ictx->is_locked\n", __FUNCTION__);
3947		/*
3948		 * The current node is either the top-most node we need to deal
3949		 * with thus we know it has enough space or we have just split
3950		 * it and thus also know that it must have enough space.
3951		 *
3952		 * Thus the insertion is now a matter of doing a simple insert
3953		 * into the node unless the current node has had its entry to
3954		 * be inserted promoted into the parent node in which case
3955		 * there is nothing to insert into this node.
3956		 *
3957		 * Note: Use goto to reduce indentation.
3958		 */
3959		insert_ictx = cur_ictx->insert_ictx;
3960		if (cur_ictx->promote_inserted)
3961			goto skip_insert;
3962		/*
3963		 * We now need to begin to do the insertion but there is one
3964		 * complication we need to deal with first and that is that if
3965		 * we are inserting a promoted entry, we have to lock the index
3966		 * node it is in also.
3967		 *
3968		 * If the current node is an index block we may need to unlock
3969		 * it so that we can ensure to always take the node lock in
3970		 * ascending page offset order.  Also we have to consider the
3971		 * case where the two index nodes are in the same page in which
3972		 * case we need to share the index lock.
3973		 *
3974		 * If the current node is the index root and we are inserting a
3975		 * promoted entry, we speculatively locked the index node
3976		 * containing the entry in the above loop thus we do not need
3977		 * to do anything here.
3978		 */
3979		if (!cur_ictx->insert_to_add) {
3980			if (!insert_ictx)
3981				panic("%s(): !insert_ictx\n", __FUNCTION__);
3982			if (cur_ictx->is_root) {
3983				if (!insert_ictx->is_locked)
3984					panic("%s(): !insert_ictx->"
3985							"is_locked\n",
3986							__FUNCTION__);
3987			} else {
3988				err = ntfs_index_ctx_lock_two(cur_ictx,
3989						insert_ictx);
3990				if (err)
3991					goto relock_err;
3992			}
3993		}
3994		entry_size = cur_ictx->insert_entry_size;
3995		/*
3996		 * Everything we need is locked and we know there is enough
3997		 * space in the index node thus make space for the entry to be
3998		 * inserted at the appropriate place and then insert the index
3999		 * entry by either copying it from the appropriate, locked
4000		 * child node or by creating it in place.  The latter happens
4001		 * when the entry being inserted is the entry being added with
4002		 * this call to ntfs_index_entry_add(), i.e. it is the reason
4003		 * we have been called in the first place and thus there is no
4004		 * index entry to copy from.
4005		 */
4006		err = ntfs_index_node_make_space(cur_ictx, entry_size);
4007		if (err)
4008			panic("%s(): err\n", __FUNCTION__);
4009		/*
4010		 * We have modified the index node.  Make sure it is written
4011		 * out later.
4012		 */
4013		ntfs_index_entry_mark_dirty(cur_ictx);
4014		if (!cur_ictx->insert_to_add) {
4015			entry = insert_ictx->entries[
4016					cur_ictx->insert_entry_nr];
4017			/* Copy the index entry into the created space. */
4018			memcpy(cur_ictx->entry, entry,
4019					le16_to_cpu(entry->length));
4020			entry = cur_ictx->entry;
4021		} else {
4022			if (split_only)
4023				panic("%s(): split_only\n", __FUNCTION__);
4024			entry = cur_ictx->entry;
4025			/*
4026			 * Clear the created space so we start with a clean
4027			 * slate and do not need to worry about initializing
4028			 * all the zero fields.
4029			 */
4030			bzero(entry, entry_size);
4031			/* Create the index entry in the created space. */
4032			if (!is_view)
4033				entry->indexed_file = *(leMFT_REF*)data;
4034			else {
4035				u8 *new_data;
4036
4037				new_data = (u8*)entry + data_ofs;
4038				entry->data_offset = cpu_to_le16(data_ofs);
4039				entry->data_length = cpu_to_le16(data_len);
4040				if (data_len)
4041					memcpy(new_data, data, data_len);
4042				/*
4043				 * In the case of $Secure/$SDH we leave the
4044				 * extra magic to zero rather than setting it
4045				 * to "II" in Unicode.  This could easily be
4046				 * changed if deemed better and/or necessary by
4047				 * uncommenting the below code.
4048				 */
4049#if 0
4050				if (idx_ni == idx_ni->vol->secure_sdh_ni) {
4051					static const ntfschar SDH_magic[2] = {
4052							const_cpu_to_le16('I'),
4053							const_cpu_to_le16('I')
4054					};
4055
4056					memcpy(((SDH_INDEX_DATA*)data)->magic,
4057							SDH_magic,
4058							sizeof(SDH_magic));
4059				}
4060#endif
4061			}
4062			entry->key_length = cpu_to_le16(key_len);
4063			memcpy(&entry->key, key, key_len);
4064		}
4065		/*
4066		 * If the copied entry is a leaf entry and it is being inserted
4067		 * into a non-leaf node, we need to rewrite its size with the
4068		 * new size which includes the VCN sub-node pointer.
4069		 *
4070		 * We just overwrite the length unconditionally and use it to
4071		 * set the length of the just created index entry, too.
4072		 *
4073		 * There is no harm in doing this as we are either updating the
4074		 * size which we must do or we are overwriting the size with
4075		 * the same value it already has which has no effect.
4076		 */
4077		entry->length = cpu_to_le16(entry_size);
4078		/*
4079		 * If the current node is not a leaf node, we have to fix up
4080		 * the VCN sub-node pointers both in the entry we just inserted
4081		 * and in the entry in front of which we inserted.
4082		 *
4083		 * The inserted index entry needs to point to what will be the
4084		 * left-hand index node after our child node is split.
4085		 *
4086		 * The index entry in front of which we inserted needs to point
4087		 * to what will be the right-hand index node after our child
4088		 * node is split.
4089		 *
4090		 * The INDEX_NODE check is fine for the index root, too,
4091		 * because as it happens LARGE_INDEX == INDEX_NODE.
4092		 */
4093		if (cur_ictx->index->flags & INDEX_NODE) {
4094			if (!child_ictx)
4095				panic("%s(): !child_ictx\n", __FUNCTION__);
4096			if (entry->flags & INDEX_ENTRY_END)
4097				panic("%s(): entry->flags & INDEX_ENTRY_END\n",
4098						__FUNCTION__);
4099			entry->flags |= INDEX_ENTRY_NODE;
4100			*(leVCN*)((u8*)entry + entry_size - sizeof(VCN)) =
4101					cpu_to_sle64(child_ictx->vcn);
4102			ntfs_debug("Setting VCN sub-node pointer of inserted "
4103					"index entry to 0x%llx.",
4104					(unsigned long long)child_ictx->vcn);
4105			entry = (INDEX_ENTRY*)((u8*)entry + entry_size);
4106			if (!(entry->flags & INDEX_ENTRY_NODE))
4107				panic("%s(): !(entry->flags & "
4108						"INDEX_ENTRY_NODE)\n",
4109						__FUNCTION__);
4110			*(leVCN*)((u8*)entry + le16_to_cpu(entry->length) -
4111					sizeof(VCN)) = cpu_to_sle64(
4112					child_ictx->right_vcn);
4113			ntfs_debug("Setting VCN sub-node pointer of index "
4114					"entry after inserted entry to "
4115					"0x%llx.", (unsigned long long)
4116					child_ictx->right_vcn);
4117		}
4118skip_insert:
4119		/*
4120		 * If there are no child nodes left we are done.  We will dirty
4121		 * the index node once we have broken out of the loop.  When
4122		 * the index context is released later all locked nodes will be
4123		 * unlocked so no need to do it now.
4124		 */
4125		if (!child_ictx)
4126			break;
4127		if (!child_ictx->split_node)
4128			panic("%s(): !child_ictx->split_node\n", __FUNCTION__);
4129		/*
4130		 * TODO: @child_ictx->right_upl and @child_ictx->right_pl are
4131		 * currently not valid as @child_ictx is not locked.  Once
4132		 * <rdar://problem/4992358> is implemented we can re-enable
4133		 * this check and change the code to leave the right_upl mapped
4134		 * at all times.
4135		 */
4136#if 0
4137		if (!child_ictx->right_upl || !child_ictx->right_pl)
4138			panic("%s(): !child_ictx->right_upl ||
4139					!child_ictx->right_pl\n",
4140					__FUNCTION__);
4141#endif
4142		/*
4143		 * We are done with the current node and have a child node.
4144		 * Switch to the child node, and sort out the needed locks.
4145		 *
4146		 * First, unlock the @insert_ictx node if it exists and is
4147		 * locked.
4148		 *
4149		 * Note we do not bother with trying to transfer the lock from
4150		 * @insert_ictx onto @child_ictx or @child_ictx->right_*
4151		 * because index blocks are 4096 bytes in size in majority of
4152		 * cases and the PAGE_SIZE is 4096 bytes both on x86 and PPC
4153		 * thus in majority of cases each page will contain a separate
4154		 * index block thus no sharing will be possible and there is no
4155		 * point in adding extra complexity for an extremely unlikely
4156		 * event.
4157		 */
4158		if (insert_ictx && insert_ictx->is_locked)
4159			ntfs_index_ctx_unlock(insert_ictx);
4160		/*
4161		 * Unlock the current node.  Again we do not bother with trying
4162		 * to share the lock with @child_ictx or @child_ictx->right_*.
4163		 */
4164		ntfs_index_ctx_unlock(cur_ictx);
4165		/* Set the child node to be the current node. */
4166		cur_ictx = child_ictx;
4167		/*
4168		 * We need to ensure both the current node and its right-hand
4169		 * node are locked.  Both are currently unlocked.
4170		 *
4171		 * If both nodes share the same page, lock the current node and
4172		 * share the lock with the right one.
4173		 */
4174		if (cur_ictx->is_locked)
4175			panic("%s(): cur_ictx->is_locked\n", __FUNCTION__);
4176		if (cur_ictx->right_is_locked)
4177			panic("%s(): cur_ictx->right_is_locked\n",
4178					__FUNCTION__);
4179		if (cur_ictx->upl_ofs <= cur_ictx->right_upl_ofs) {
4180			err = ntfs_index_ctx_relock(cur_ictx);
4181			if (err)
4182				goto relock_err;
4183		}
4184		if (cur_ictx->upl_ofs == cur_ictx->right_upl_ofs) {
4185			cur_ictx->right_ia = (INDEX_ALLOCATION*)(
4186					(u8*)cur_ictx->right_ia +
4187					(cur_ictx->addr -
4188					cur_ictx->right_addr));
4189			cur_ictx->right_addr = cur_ictx->addr;
4190		} else {
4191			u8 *addr;
4192
4193			err = ntfs_page_map(idx_ni, cur_ictx->right_upl_ofs,
4194					&cur_ictx->right_upl,
4195					&cur_ictx->right_pl, &addr, TRUE);
4196			if (err) {
4197				ntfs_error(idx_ni->vol->mp, "Failed to map "
4198						"index page (error %d).", err);
4199				cur_ictx->right_upl = NULL;
4200				goto relock_err;
4201			}
4202			cur_ictx->right_is_locked = 1;
4203			cur_ictx->right_ia = (INDEX_ALLOCATION*)(
4204					(u8*)cur_ictx->right_ia + (addr -
4205					cur_ictx->right_addr));
4206			cur_ictx->right_addr = addr;
4207		}
4208		if (!cur_ictx->is_locked) {
4209			err = ntfs_index_ctx_relock(cur_ictx);
4210			if (err) {
4211				if (cur_ictx->right_is_locked) {
4212					ntfs_page_unmap(idx_ni,
4213							cur_ictx->right_upl,
4214							cur_ictx->right_pl,
4215							FALSE);
4216					cur_ictx->right_upl = NULL;
4217					cur_ictx->right_is_locked = 0;
4218				}
4219				goto relock_err;
4220			}
4221		}
4222		/*
4223		 * Having obtained the needed locks, we can now split the
4224		 * current node.
4225		 *
4226		 * We have recorded the split point in @promote_entry_nr and
4227		 * @promote_inserted tells us whether to remove the entry
4228		 * specified by @promote_entry_nr and move all entries after it
4229		 * to the right-hand node (@promote_inserted is 0) or whether
4230		 * to move the entry specified by @promote_entry_nr and all
4231		 * entries after it to the right-hand node (@promote_inserted
4232		 * is 1).
4233		 *
4234		 * The split results in the creation of a new end entry in the
4235		 * left-hand node as its old end entry is moved to the right
4236		 * hand node.  This means we need to determine what we need to
4237		 * set the VCN sub-node pointer to if this is an index node.
4238		 *
	 * If @promote_inserted is 0, i.e. we are promoting an existing
4240		 * entry from this node, we need to use the VCN sub-node
4241		 * pointer of the entry we are about to promote.  Because we
4242		 * are promoting the entry it is going to disappear altogether
4243		 * from this node thus we need to make a note of its sub-node
4244		 * VCN pointer if it has one now before doing the actual split.
4245		 *
4246		 * In this case we do not need to modify the VCN sub-node
4247		 * pointer of the first entry in the (new) right-hand node as
4248		 * it does not change.
4249		 *
4250		 * If @promote_inserted is 1, i.e. we are promoting the entry
4251		 * that was going to be inserted into this node, we need to use
4252		 * the VCN of our child node which is the VCN of the entry in
4253		 * front of which we are inserting and splitting, i.e. the VCN
4254		 * of the first entry we are about to move to the right-hand
4255		 * node.
4256		 *
4257		 * In this case we also need to modify the VCN sub-node pointer
4258		 * of the first entry in the (new) right-hand node to point to
4259		 * the (new) right-hand child node.  We do not know what that
4260		 * is yet so we determine it later once we have obtained our
4261		 * child node index context containing the needed information.
4262		 *
4263		 * First, copy the appropriate entries to the right-hand node
4264		 * and switch the node to be an index, i.e. non-leaf, node if
4265		 * the node being split is an index node.
4266		 */
4267		right_index = &cur_ictx->right_ia->index;
4268		right_entry = (INDEX_ENTRY*)((u8*)right_index +
4269				le32_to_cpu(right_index->entries_offset));
4270		u = cur_ictx->promote_entry_nr;
4271		entry = cur_ictx->entries[u];
4272		if (!cur_ictx->promote_inserted) {
4273			if (entry->flags & INDEX_ENTRY_NODE) {
4274				if (!(cur_ictx->index->flags & INDEX_NODE))
4275					panic("%s(): !(cur_ictx->index->flags "
4276							"& INDEX_NODE)\n",
4277							__FUNCTION__);
4278				vcn = sle64_to_cpu(*(leVCN*)((u8*)entry +
4279						le16_to_cpu(entry->length) -
4280						sizeof(VCN)));
4281			}
4282			u++;
4283			entry = cur_ictx->entries[u];
4284		}
4285		end_entry = cur_ictx->entries[cur_ictx->nr_entries - 1];
4286		if (!(end_entry->flags & INDEX_ENTRY_END))
4287			panic("%s(): !(end_entry->flags & INDEX_ENTRY_END)\n",
4288					__FUNCTION__);
4289		u = (u8*)end_entry - (u8*)entry +
4290				le16_to_cpu(end_entry->length);
4291		memcpy(right_entry, entry, u);
4292		right_index->index_length = cpu_to_le32(
4293				le32_to_cpu(right_index->entries_offset) + u);
4294		right_index->flags = cur_ictx->index->flags;
4295		/*
4296		 * Move the end entry of the left-hand node forward thus
4297		 * truncating the left-hand node.
4298		 */
4299		if (!cur_ictx->promote_inserted)
4300			entry = cur_ictx->entries[cur_ictx->promote_entry_nr];
4301		if (entry != end_entry) {
4302			u = le16_to_cpu(end_entry->length);
4303			memmove(entry, end_entry, u);
4304			u += (u8*)entry - (u8*)cur_ictx->index;
4305			cur_ictx->index->index_length = cpu_to_le32(u);
4306		}
4307		/*
4308		 * If the current, left-hand node is not a leaf node, we have
4309		 * to replace the VCN sub-node pointer in its end entry.
4310		 *
	 * If @promote_inserted is 0, we use the VCN we made a note of
4312		 * above.
4313		 *
4314		 * If @promote_inserted is 1, we take the VCN of the left-hand
4315		 * node of our child node.
4316		 *
4317		 * A side effect of getting the child node in loop scope here
4318		 * is that we do not need to reget it when we go round the loop
4319		 * again which is when we need to know the child node in order
4320		 * to be able to insert the appropriate entry into the current
4321		 * node.
4322		 */
4323		child_ictx = NULL;
4324		if (entry->flags & INDEX_ENTRY_NODE) {
4325			if (cur_ictx != ictx) {
4326				child_ictx = cur_ictx->down;
4327				if (child_ictx->is_root)
4328					panic("%s(): child_ictx->is_root\n",
4329							__FUNCTION__);
4330			}
4331			if (cur_ictx->promote_inserted) {
4332				if (!child_ictx)
4333					panic("%s(): !child_ictx\n",
4334							__FUNCTION__);
4335				/*
4336				 * As described take the VCN of the left-hand
4337				 * node of our child node index context for the
4338				 * new end entry.
4339				 */
4340				vcn = child_ictx->vcn;
4341				/*
4342				 * Again, as described, update the VCN of the
4343				 * first entry we just moved to the (new) right
4344				 * hand node to the right-hand node of our
4345				 * child node index context.
4346				 */
4347				*(leVCN*)((u8*)right_entry + le16_to_cpu(
4348						right_entry->length) -
4349						sizeof(VCN)) = cpu_to_sle64(
4350						child_ictx->right_vcn);
4351				ntfs_debug("Setting VCN sub-node pointer of "
4352						"first index entry of "
4353						"right-hand index block node "
4354						"after splitting it off from "
4355						"the left-hand node to "
4356						"0x%llx.", (unsigned long long)
4357						child_ictx->right_vcn);
4358			}
4359			if (!(cur_ictx->index->flags & INDEX_NODE))
4360				panic("%s(): !(cur_ictx->index->flags & "
4361						"INDEX_NODE)\n", __FUNCTION__);
4362			*(leVCN*)((u8*)entry + le16_to_cpu(entry->length) -
4363					sizeof(VCN)) = cpu_to_sle64(vcn);
4364			ntfs_debug("Setting VCN sub-node pointer of end index "
4365					"entry of left-hand index block node "
4366					"after splitting off the right-hand "
4367					"node to 0x%llx.", (unsigned long long)
4368					vcn);
4369		}
4370		/*
4371		 * The index context still describes a single node thus if we
4372		 * are going to insert an entry into either of the two split
4373		 * nodes we have to update the index context appropriately.
4374		 *
4375		 * If @cur_ictx->entry_nr <= @cur_ictx->promote_entry_nr, we
4376		 * have to insert the entry into the left-hand node and if
4377		 * @cur_ictx->entry_nr > @cur_ictx->promote_entry_nr we have to
4378		 * insert the entry into the right-hand node.
4379		 *
4380		 * If inserting into the left-hand node we do not need to do
4381		 * anything as the insertion process does not make use of
4382		 * anything in the index context that has changed.
4383		 *
4384		 * If inserting into the right-hand node we have to switch the
4385		 * index context to describe the right-hand node and place the
4386		 * left-hand node into the place of the right-hand node, i.e.
4387		 * swap the left- and right-hand nodes in the index context.
4388		 *
4389		 * If we are switching the left- and right-hand nodes, we also
4390		 * have to update the index entry pointer to point to the
4391		 * correct insertion location in the now current page.
4392		 *
4393		 * Note we do not bother to update the array of index entry
4394		 * pointers as that is no longer used.
4395		 */
4396		if (!cur_ictx->promote_inserted && cur_ictx->entry_nr >
4397				cur_ictx->promote_entry_nr) {
4398			union {
4399				VCN vcn;
4400				INDEX_BLOCK *ia;
4401				s64 upl_ofs;
4402				upl_t upl;
4403				upl_page_info_array_t pl;
4404				u8 *addr;
4405			} tmp;
4406
4407			tmp.vcn = cur_ictx->right_vcn;
4408			cur_ictx->right_vcn = cur_ictx->vcn;
4409			cur_ictx->vcn = tmp.vcn;
4410			tmp.ia = cur_ictx->right_ia;
4411			cur_ictx->ia = tmp.ia;
4412			cur_ictx->right_ia = cur_ictx->ia;
4413			cur_ictx->index = right_index = &tmp.ia->index;
4414			if (cur_ictx->right_is_locked) {
4415				tmp.upl_ofs = cur_ictx->right_upl_ofs;
4416				cur_ictx->right_upl_ofs = cur_ictx->upl_ofs;
4417				cur_ictx->upl_ofs = tmp.upl_ofs;
4418				tmp.upl = cur_ictx->right_upl;
4419				cur_ictx->right_upl = cur_ictx->upl;
4420				cur_ictx->upl = tmp.upl;
4421				tmp.pl = cur_ictx->right_pl;
4422				cur_ictx->right_pl = cur_ictx->pl;
4423				cur_ictx->pl = tmp.pl;
4424				tmp.addr = cur_ictx->right_addr;
4425				cur_ictx->right_addr = cur_ictx->addr;
4426				cur_ictx->addr = tmp.addr;
4427			}
4428			/*
4429			 * Get the location in the left-hand page of the first
4430			 * entry that was moved to the right-hand page.
4431			 */
4432			entry = cur_ictx->entries[cur_ictx->promote_entry_nr +
4433					1];
4434			cur_ictx->entry = (INDEX_ENTRY*)((u8*)right_index +
4435					le32_to_cpu(right_index->
4436					entries_offset) +
4437					((u8*)cur_ictx->entry - (u8*)entry));
4438		}
4439		/*
4440		 * We are done with the node that is now set up as the
4441		 * right-hand node.  Unless it is sharing the lock with the
4442		 * left-hand node, unmap/release it marking it dirty so it gets
4443		 * written out later.
4444		 */
4445		if (cur_ictx->right_is_locked) {
4446			ntfs_page_unmap(idx_ni, cur_ictx->right_upl,
4447					cur_ictx->right_pl, TRUE);
4448			cur_ictx->right_upl = NULL;
4449			cur_ictx->right_is_locked = 0;
4450		}
4451		/*
4452		 * We may be done with the current node.  Mark it dirty so it
4453		 * gets written out later.
4454		 */
4455		ntfs_index_entry_mark_dirty(cur_ictx);
4456		/*
4457		 * If we have reached the original node (@child_ictx will be
4458		 * NULL) and we are only splitting it there is nothing to
4459		 * insert and hence nothing at all left to do so we break out
4460		 * of the loop.
4461		 */
4462	} while (child_ictx || !split_only);
4463	ntfs_debug("Done (%s).", cur_ictx->split_node ?
4464			"insert with split" : "simple insert");
4465	return 0;
4466err:
4467	if (!NInoIndexAllocPresent(idx_ni))
4468		goto err_out;
4469	/*
4470	 * Unlock all index contexts in the B+tree path otherwise the call to
4471	 * ntfs_attr_inode_get() can deadlock.
4472	 */
4473	ntfs_index_ctx_path_unlock(ictx);
4474	/* Get the index bitmap inode. */
4475	err2 = ntfs_attr_inode_get(ictx->base_ni, AT_BITMAP, idx_ni->name,
4476			idx_ni->name_len, FALSE, LCK_RW_TYPE_SHARED, &bmp_ni);
4477	if (err2) {
4478		ntfs_error(idx_ni->vol->mp, "Failed to get index bitmap inode "
4479				"(error %d).  Cannot undo index block "
4480				"allocation.  Leaving inconsistent metadata.  "
4481				"Run chkdsk.", err2);
4482		NVolSetErrors(idx_ni->vol);
4483		goto err_out;
4484	}
4485	/* Free all the index block allocations we have done. */
4486	do {
4487		if (cur_ictx->right_addr) {
4488			if (!cur_ictx->right_ia)
4489				panic("%s(): !cur_ictx->right_ia\n",
4490						__FUNCTION__);
4491			if (cur_ictx->right_vcn < 0)
4492				panic("%s(): cur_ictx->right_vcn < 0\n",
4493						__FUNCTION__);
4494			err2 = ntfs_bitmap_clear_bit(bmp_ni,
4495					cur_ictx->right_vcn <<
4496					idx_ni->vcn_size_shift >>
4497					idx_ni->block_size_shift);
4498			if (err2) {
4499				ntfs_error(idx_ni->vol->mp, "Failed to undo "
4500						"index block allocation in "
4501						"(error %d).  Leaving "
4502						"inconsistent metadata.  Run "
4503						"chkdsk.", err2);
4504				NVolSetErrors(idx_ni->vol);
4505			}
4506		}
4507		/* When we have dealt with the bottom entry we are done. */
4508		if (cur_ictx == ictx)
4509			break;
4510		cur_ictx = cur_ictx->down;
4511	} while (1);
4512	lck_rw_unlock_shared(&bmp_ni->lock);
4513	(void)vnode_put(bmp_ni->vn);
4514err_out:
4515	ntfs_error(idx_ni->vol->mp, "Failed (error %d).", err);
4516	return err;
4517relock_err:
4518	ntfs_error(idx_ni->vol->mp, "Failed to relock index context (error "
4519			"%d).  Leaving corrupt index.  Run chkdsk.", err);
4520	NVolSetErrors(idx_ni->vol);
4521	return err;
4522}
4523
4524/**
4525 * ntfs_index_node_split - split an index node
4526 * @ictx:	index context specifying the node to split
4527 * @entry_size:	size of the entry that will be added after the split
4528 *
4529 * Split the index node pointed to by @ictx.  @ictx also points to the entry
4530 * in the index node at which an entry will be inserted after the split is
4531 * completed.  @entry_size is the size of the entry that will be added at that
4532 * position.  These are used so that the split is performed with consideration
4533 * with the insertion that is likely to come.  And if the insertion does not
4534 * happen it does not matter much, it just means the split was not quite on the
4535 * median entry but off by one.
4536 *
4537 * Return 0 on success and errno on error.  On error the index context is no
4538 * longer usable and must be released or reinitialized.
4539 *
4540 * Note that we do not update the array of index entry pointers nor the number
4541 * of entries in the array because on success it means that the index node has
4542 * been split and the function is done, i.e. the array of index entry pointers
4543 * will not be used any more and on error the index context becomes invalid so
4544 * there is no need to update any of the pointers.  The caller is expected to
4545 * restart its operations by doing a new index lookup for that reason.
4546 *
4547 * Locking: - Caller must hold @ictx->idx_ni->lock on the index inode for
4548 *	      writing.
4549 *	    - The index context @ictx must be locked.
4550 */
4551static inline errno_t ntfs_index_node_split(ntfs_index_context *ictx,
4552		const u32 entry_size)
4553{
4554	return ntfs_index_entry_add_or_node_split(ictx, TRUE, entry_size,
4555			NULL, 0, NULL, 0);
4556}
4557
4558/**
4559 * ntfs_index_lookup_predecessor - index node whose predecessor node to return
4560 * @ictx:	index context whose predecessor node to return
4561 * @pred_ictx:	pointer in which to return the found predecessor index context
4562 *
4563 * Descend into the child node pointed to by @ictx->entry and then keep
4564 * descending into the child node of the child node pointed to by the end entry
4565 * of the child node until we reach the bottom of the B+tree.
4566 *
4567 * On success return the predecessor entry, i.e. the last real (non-end) entry
4568 * in the found leaf index node in *@pred_ictx.
4569 *
4570 * Return 0 on success and errno on error.
4571 *
4572 * Locking: - Caller must hold @ictx->idx_ni->lock on the index inode.
4573 *	    - The index context @ictx must be locked.
4574 */
4575static errno_t ntfs_index_lookup_predecessor(ntfs_index_context *ictx,
4576		ntfs_index_context **pred_ictx)
4577{
4578	unsigned entry_nr;
4579	errno_t err;
4580
4581	ntfs_debug("Entering.");
4582	if (!pred_ictx)
4583		panic("%s(): !pred_ictx\n", __FUNCTION__);
4584	if (!(ictx->entry->flags & INDEX_ENTRY_NODE))
4585		panic("%s(): !(ictx->entry->flags & INDEX_ENTRY_NODE)\n",
4586				__FUNCTION__);
4587	/*
4588	 * We must be at the bottom of the current tree path or things will get
4589	 * very confused.
4590	 */
4591	if (!ictx->down->is_root)
4592		panic("%s(): !ictx->down->is_root\n", __FUNCTION__);
4593	do {
4594		err = ntfs_index_descend_into_child_node(&ictx);
4595		if (err)
4596			goto err;
4597		/* If this child node is a leaf node we are done. */
4598		if (!(ictx->index->flags & INDEX_NODE))
4599			break;
4600		/*
4601		 * This child node is an index node, descend into its end
4602		 * entry.
4603		 */
4604		ictx->entry_nr = entry_nr = ictx->nr_entries - 1;
4605		ictx->follow_entry = ictx->entries[entry_nr];
4606	} while (1);
4607	/*
4608	 * We found the leaf node thus the predecessor entry we are looking for
4609	 * is the last entry before the end entry.
4610	 */
4611	if (ictx->nr_entries < 2)
4612		panic("%s(): ictx->nr_entries < 2\n", __FUNCTION__);
4613	ictx->entry_nr = entry_nr = ictx->nr_entries - 2;
4614	ictx->entry = ictx->entries[entry_nr];
4615	ictx->is_match = 1;
4616	*pred_ictx = ictx;
4617	ntfs_debug("Done (found).");
4618	return 0;
4619err:
4620	ntfs_error(ictx->idx_ni->vol->mp, "Failed to descend into child node "
4621			"(error %d).", err);
4622	return err;
4623}
4624
4625/**
4626 * ntfs_index_ctx_move - move an index context from its tree path to another one
4627 * @ictx:	index context to move
4628 * @dst:	destination index context below which to insert @ictx
4629 *
4630 * Disconnect the index context @ictx from its tree path and insert it into the
4631 * tree path to which @dst belongs positioning it immediately below the index
4632 * context @dst.
4633 *
4634 * Locking: - Caller must hold @ictx->idx_ni->lock on the index inode.
4635 */
4636static inline void ntfs_index_ctx_move(ntfs_index_context *ictx,
4637		ntfs_index_context *dst)
4638{
4639	ntfs_index_ctx_disconnect(ictx);
4640	dst->down->up = ictx;
4641	ictx->down = dst->down;
4642	ictx->up = dst;
4643	dst->down = ictx;
4644}
4645
4646/**
4647 * ntfs_index_root_prepare_replace - prepare an index entry to be replaced
4648 * @ictx:		existing index entry that is going to be replaced
4649 * @new_ie_size:	size in bytes of the new index entry
4650 *
4651 * Resize the existing index entry to the size of the new index entry so that
4652 * the index root is all set up and ready to receive the new entry.
4653 *
4654 * Return 0 on success and ENOSPC if there is not enough space in the index mft
4655 * record for the new entry.
4656 *
4657 * Locking: - Caller must hold @ictx->idx_ni->lock on the index inode for
4658 *	      writing.
4659 *	    - The index context @ictx must be locked.
4660 */
4661static errno_t ntfs_index_root_prepare_replace(ntfs_index_context *ictx,
4662		const unsigned new_ie_size)
4663{
4664	MFT_RECORD *m = ictx->actx->m;
4665	INDEX_ENTRY *ie = ictx->entry;
4666	const u32 muse = le32_to_cpu(m->bytes_in_use);
4667	const unsigned ie_size = le16_to_cpu(ie->length);
4668	const int size_change = (int)new_ie_size - (int)ie_size;
4669	const u32 new_muse = muse + size_change;
4670
4671	ntfs_debug("Entering.");
4672	if (new_muse <= le32_to_cpu(m->bytes_allocated)) {
4673		u8 *vcn_addr;
4674		ATTR_RECORD *a;
4675		INDEX_HEADER *ih;
4676
4677		/*
4678		 * Resize the index root attribute so it has the appropriate
4679		 * space for the new index entry to replace the existing entry.
4680		 * As an optimization we combine the resizing of the index root
4681		 * attribute and the moving of index entries within the
4682		 * attribute into a single operation.
4683		 */
4684		vcn_addr = (u8*)ie + ie_size - sizeof(VCN);
4685		memmove((u8*)ie + new_ie_size - sizeof(VCN), vcn_addr,
4686				muse - (vcn_addr - (u8*)m));
4687		/* Adjust the mft record to reflect the change in used space. */
4688		m->bytes_in_use = cpu_to_le32(new_muse);
4689		/*
4690		 * Adjust the attribute record to reflect the changes in the
4691		 * size of the attribute record and in the size of the
4692		 * attribute value.
4693		 */
4694		a = ictx->actx->a;
4695		a->length = cpu_to_le32(le32_to_cpu(a->length) + size_change);
4696		a->value_length = cpu_to_le32(le32_to_cpu(a->value_length) +
4697				size_change);
4698		/* Adjust the index header to reflect the change in length. */
4699		ih = ictx->index;
4700		ih->allocated_size = ih->index_length = cpu_to_le32(
4701				le32_to_cpu(ih->index_length) + size_change);
4702		ntfs_debug("Done.");
4703		return 0;
4704	}
4705	ntfs_debug("Failed (not enough space in mft record to replace index "
4706			"entry in index root attribute).");
4707	return ENOSPC;
4708}
4709
4710/**
4711 * ntfs_index_block_prepare_replace - prepare an index entry to be replaced
4712 * @ictx:		existing index entry that is going to be replaced
4713 * @new_ie_size:	size in bytes of the new index entry
4714 *
4715 * Resize the existing index entry to the size of the new index entry so that
4716 * the index node is all set up and ready to receive the new entry.
4717 *
4718 * Return 0 on success and ENOSPC if there is not enough space in the index
4719 * node for the new entry.
4720 *
4721 * Locking: - Caller must hold @ictx->idx_ni->lock on the index inode for
4722 *	      writing.
4723 *	    - The index context @ictx must be locked.
4724 */
4725static errno_t ntfs_index_block_prepare_replace(ntfs_index_context *ictx,
4726		const unsigned new_ie_size)
4727{
4728	INDEX_HEADER *ih = ictx->index;
4729	INDEX_ENTRY *ie = ictx->entry;
4730	const u32 ilen = le32_to_cpu(ih->index_length);
4731	const unsigned ie_size = le16_to_cpu(ie->length);
4732	const u32 new_ilen = ilen + new_ie_size - ie_size;
4733
4734	ntfs_debug("Entering.");
4735	if (new_ilen <= le32_to_cpu(ih->allocated_size)) {
4736		u8 *vcn_addr;
4737
4738		/*
4739		 * Move the VCN of the index entry to be replaced and
4740		 * everything that follows it to adapt the space for the new
4741		 * entry.
4742		 */
4743		vcn_addr = (u8*)ie + ie_size - sizeof(VCN);
4744		memmove((u8*)ie + new_ie_size - sizeof(VCN), vcn_addr,
4745				ilen - (vcn_addr - (u8*)ih));
4746		/* Adjust the index header to reflect the change in length. */
4747		ih->index_length = cpu_to_le32(new_ilen);
4748		ntfs_debug("Done.");
4749		return 0;
4750	}
4751	ntfs_debug("Failed (not enough space in index block to replace index "
4752			"entry).");
4753	return ENOSPC;
4754}
4755
4756/**
4757 * ntfs_index_entry_replace - replace an existing index entry with a new one
4758 * @ictx:	existing index entry to replace
4759 * @new_ictx:	new index entry to replace the existing entry with
4760 *
4761 * Replace the existing node index entry @ictx->entry with the leaf index entry
4762 * @new_ictx->entry.
4763 *
4764 * Return 0 on success and ENOSPC if there is not enough space for the new
4765 * entry.
4766 *
4767 * Locking: - Caller must hold @ictx->idx_ni->lock on the index inode for
4768 *	      writing.
4769 *	    - The index context @ictx must be locked.
4770 *	    - The index context @new_ictx must be locked.
4771 */
4772static errno_t ntfs_index_entry_replace(ntfs_index_context *ictx,
4773		ntfs_index_context *new_ictx)
4774{
4775	INDEX_ENTRY *new_ie = new_ictx->entry;
4776	INDEX_ENTRY *ie = ictx->entry;
4777	const unsigned new_ie_size = le16_to_cpu(new_ie->length) + sizeof(VCN);
4778	errno_t err;
4779
4780	if (!ictx->is_match || !new_ictx->is_match)
4781		panic("%s(): !ictx->is_match || !new_ictx->is_match\n",
4782				__FUNCTION__);
4783	/* The destination entry is meant to be a node entry. */
4784	if (!(ictx->entry->flags & INDEX_ENTRY_NODE))
4785		panic("%s(): !(ictx->entry->flags & INDEX_ENTRY_NODE)\n",
4786				__FUNCTION__);
4787	/* The new entry is meant to be a leaf entry. */
4788	if (new_ictx->entry->flags & INDEX_ENTRY_NODE)
4789		panic("%s(): new_ictx->entry->flags & INDEX_ENTRY_NODE\n",
4790				__FUNCTION__);
4791	if (ictx->is_root)
4792		err = ntfs_index_root_prepare_replace(ictx, new_ie_size);
4793	else
4794		err = ntfs_index_block_prepare_replace(ictx, new_ie_size);
4795	if (!err) {
4796		/* Copy the new index entry into the adapted space. */
4797		memcpy(ie, new_ie, new_ie_size - sizeof(VCN));
4798		/*
4799		 * Update the copied index entry to reflect the fact that it is
4800		 * now an index node entry and has the VCN sub-node pointer at
4801		 * its tail.
4802		 */
4803		ie->length = cpu_to_le16(new_ie_size);
4804		ie->flags |= INDEX_ENTRY_NODE;
4805		/* Ensure the updates are written to disk. */
4806		ntfs_index_entry_mark_dirty(ictx);
4807	}
4808	return err;
4809}
4810
4811/**
4812 * ntfs_index_block_free - free an index allocation block
4813 * @ictx:	index context of the index block to deallocate
4814 *
4815 * Deallocate the index allocation block for the index described by the index
4816 * context @ictx and invalidate the context so the caller can safely release
4817 * it.
4818 *
4819 * We also check if the index allocation attribute can be shrunk as a
4820 * consequence of the deallocation of the index allocation block and if so and
4821 * that would actually change the on-disk size of the attribute we shrink it
4822 * now.
4823 *
4824 * If we shrunk the index allocation attribute and the index bitmap attribute
4825 * is non-resident we shrink the index bitmap attribute also but again only if
4826 * it would actually change the on-disk size of the attribute.
4827 *
4828 * Return 0 on success and errno on error.
4829 *
4830 * Locking: - Caller must hold @ictx->idx_ni->lock on the index inode for
4831 *	      writing.
4832 *	    - All of the index contexts in the index must be unlocked (this
4833 *	      includes @ictx, i.e. @ictx must not be locked).
4834 */
4835static errno_t ntfs_index_block_free(ntfs_index_context *ictx)
4836{
4837	s64 target_pos, bmp_pos, alloc_size;
4838	ntfs_inode *bmp_ni, *idx_ni = ictx->idx_ni;
4839	ntfs_volume *vol = idx_ni->vol;
4840	unsigned page_ofs;
4841	errno_t err;
4842
4843	ntfs_debug("Entering.");
4844	if (ictx->is_locked)
4845		panic("%s(): ictx->is_locked\n", __FUNCTION__);
4846	/* Get the index bitmap inode. */
4847	err = ntfs_attr_inode_get(ictx->base_ni, AT_BITMAP, idx_ni->name,
4848			idx_ni->name_len, FALSE, LCK_RW_TYPE_EXCLUSIVE,
4849			&bmp_ni);
4850	if (err) {
4851		ntfs_error(vol->mp, "Failed to get index bitmap inode (error "
4852				"%d).", err);
4853		return err;
4854	}
4855	/*
4856	 * Zero the bit in the index bitmap corresponding to the index block
4857	 * being deallocated.
4858	 */
4859	target_pos = ictx->vcn << idx_ni->vcn_size_shift >>
4860			idx_ni->block_size_shift;
4861	err = ntfs_bitmap_clear_bit(bmp_ni, target_pos);
4862	if (err) {
4863		ntfs_error(vol->mp, "Failed to deallocate index block in "
4864				"index bitmap (error %d).", err);
4865		lck_rw_unlock_exclusive(&bmp_ni->lock);
4866		(void)vnode_put(bmp_ni->vn);
4867		return err;
4868	}
4869	/* If this is not the last set bit, we are done. */
4870	if (target_pos < idx_ni->last_set_bit) {
4871done:
4872		ntfs_debug("Done (index records are allocated beyond the "
4873				"deallocated one).");
4874out:
4875		lck_rw_unlock_exclusive(&bmp_ni->lock);
4876		(void)vnode_put(bmp_ni->vn);
4877		return 0;
4878	}
4879	/*
4880	 * Scan backwards through the entire bitmap looking for the last set
4881	 * bit.  If we do not know the old last set bit (@idx_ni->last_set_bit
4882	 * is -1), start at the end of the bitmap and if we do know it but we
4883	 * cleared it just now, start at the old last set bit.
4884	 *
4885	 * Note we ignore any errors as the truncation is just a disk saving
4886	 * optimization and is not actually required.  However if an error
4887	 * occurs we invalidate the last set bit stored in the inode.
4888	 */
4889	if (idx_ni->last_set_bit >= 0)
4890		bmp_pos = idx_ni->last_set_bit >> 3;
4891	else {
4892		lck_spin_lock(&bmp_ni->size_lock);
4893		bmp_pos = bmp_ni->initialized_size - 1;
4894		lck_spin_unlock(&bmp_ni->size_lock);
4895	}
4896	do {
4897		upl_t upl;
4898		upl_page_info_array_t pl;
4899		u8 *bmp_start, *bmp;
4900
4901		err = ntfs_page_map(bmp_ni, bmp_pos & ~PAGE_MASK_64, &upl,
4902				&pl, &bmp_start, FALSE);
4903		if (err) {
4904			ntfs_debug("Failed to read index bitmap (error %d).",
4905					err);
4906			idx_ni->last_set_bit = -1;
4907			goto out;
4908		}
4909		page_ofs = (unsigned)bmp_pos & PAGE_MASK;
4910		bmp = bmp_start + page_ofs;
4911		/* Scan backwards through the page. */
4912		do {
4913			unsigned bit, byte = *bmp;
4914			/* If this byte is zero skip it. */
4915			if (!byte)
4916				continue;
4917			/*
4918			 * Determine the last set bit in the byte.
4919			 *
4920			 * TODO: There does not appear to be a fls() function
4921			 * in the kernel. )-:  If/when the kernel has an fls()
4922			 * function, switch the below code to use it.
4923			 *
4924			 * So we do the "bit = fls(byte) - 1" by hand which is
4925			 * not very efficient but works.
4926			 */
4927			bit = 0;
4928			if (byte & 0xf0) {
4929				byte >>= 4;
4930				bit += 4;
4931			}
4932			if (byte & 0x0c) {
4933				byte >>= 2;
4934				bit += 2;
4935			}
4936			if (byte & 0x02)
4937				bit++;
4938			ntfs_page_unmap(bmp_ni, upl, pl, FALSE);
4939			/*
4940			 * @bit now contains the last set bit in the byte thus
4941			 * we can determine the last set bit in the bitmap.
4942			 */
4943			idx_ni->last_set_bit = (((bmp_pos & ~PAGE_MASK_64) +
4944					(bmp - bmp_start)) << 3) + bit;
4945			if (target_pos < idx_ni->last_set_bit)
4946				goto done;
4947			goto was_last_set_bit;
4948		} while (--bmp >= bmp_start);
4949		ntfs_page_unmap(bmp_ni, upl, pl, FALSE);
4950	} while ((bmp_pos -= page_ofs + 1) >= 0);
4951	/*
4952	 * We scanned the entire bitmap and it was all zero.  We do not do
4953	 * anything because truncation of indexes that become empty is done
4954	 * elsewhere.
4955	 */
4956	idx_ni->last_set_bit = -1;
4957	ntfs_debug("Done (index bitmap has no set bits left).");
4958	goto out;
4959was_last_set_bit:
4960	/*
4961	 * This was the last set bit.  Check if we would save disk space by
4962	 * truncating the index allocation attribute and if so do it.  To do
4963	 * this determine which the first unused cluster is and compare it
4964	 * against the currently allocated last cluster.
4965	 *
4966	 * Note we ignore any errors because it is not essential to resize the
4967	 * index allocation attribute.  In fact Windows and chkdsk are
4968	 * perfectly happy with it remaining allocated.  It just means the
4969	 * index is wasting space on disk and that will be reclaimed when the
4970	 * index is deleted or when the index is filled again with entries.
4971	 */
4972	target_pos = (((idx_ni->last_set_bit + 1) <<
4973			idx_ni->block_size_shift) + vol->cluster_size_mask) &
4974			~(s64)vol->cluster_size_mask;
4975	lck_spin_lock(&idx_ni->size_lock);
4976	alloc_size = idx_ni->allocated_size;
4977	lck_spin_unlock(&idx_ni->size_lock);
4978	if (target_pos >= alloc_size) {
4979		ntfs_debug("Done (no space would be freed on disk by "
4980				"truncating the index allocation attribute).");
4981		goto out;
4982	}
4983	err = ntfs_attr_resize(idx_ni, target_pos, 0, NULL);
4984	if (err) {
4985		ntfs_debug("Failed to truncate index allocation attribute "
4986				"(error %d) thus this index will be wasting "
4987				"space on disk until it is deleted or "
4988				"repopulated with entries.", err);
4989		goto out;
4990	}
4991	ntfs_debug("Truncated index allocation attribute to reclaim 0x%llx "
4992			"bytes of disk space.",
4993			(unsigned long long)(alloc_size - target_pos));
4994	/*
4995	 * If the bitmap attribute is non-resident check if we would save disk
4996	 * space by truncating it, too, and if so do it.  Again we ignore any
4997	 * errors as it is ok for the bitmap attribute to be left as it is.
4998	 */
4999	if (NInoNonResident(bmp_ni)) {
5000		target_pos = ((((target_pos >> idx_ni->block_size_shift) +
5001				7) >> 3) + vol->cluster_size_mask) &
5002				~(s64)vol->cluster_size_mask;
5003		lck_spin_lock(&bmp_ni->size_lock);
5004		alloc_size = bmp_ni->allocated_size;
5005		lck_spin_unlock(&bmp_ni->size_lock);
5006		if (target_pos >= alloc_size) {
5007			ntfs_debug("Done (truncated index allocation to free "
5008					"space on disk but not truncating "
5009					"index bitmap as no space would be "
5010					"freed on disk by doing so).");
5011			goto out;
5012		}
5013		err = ntfs_attr_resize(bmp_ni, target_pos, 0, NULL);
5014		if (err) {
5015			ntfs_debug("Failed to truncate index bitmap attribute "
5016					"(error %d) thus this index will be "
5017					"wasting space on disk until it is "
5018					"deleted or repopulated with entries.",
5019					err);
5020			goto out;
5021		}
5022		ntfs_debug("Truncated index bitmap attribute to reclaim "
5023				"0x%llx bytes of disk space.",
5024				(unsigned long long)(alloc_size - target_pos));
5025	}
5026	ntfs_debug("Done.");
5027	goto out;
5028}
5029
5030/**
5031 * ntfs_index_make_empty - make an index empty discarding all entries in it
5032 * @ictx:	index context describing the index to empty
5033 *
5034 * Empty the index described by the index context @ictx.  On failure, use the
5035 * tree described by @ictx to re-allocate the freed index blocks if any have
5036 * been freed at the point of failure.
5037 *
5038 * This is called when the last index entry in an index is being deleted.
5039 *
5040 * We need to remove the sub-node from the end entry of the index root and
5041 * switch the entry to be a leaf entry.
5042 *
5043 * We also need to deallocate all index blocks and if possible shrink the index
5044 * allocation attribute to zero size as well as the index bitmap attribute if
5045 * it is non-resident.
5046 *
5047 * Return 0 on success and errno on error.
5048 *
5049 * Locking: - Caller must hold @ictx->idx_ni->lock on the index inode for
5050 *	      writing.
5051 *	    - All of the index contexts in the index must be unlocked (this
5052 *	      includes @ictx, i.e. @ictx must not be locked).
5053 */
static errno_t ntfs_index_make_empty(ntfs_index_context *ictx)
{
	s64 data_size;
	ntfs_inode *bmp_ni, *idx_ni = ictx->idx_ni;
	MFT_RECORD *m;
	ntfs_attr_search_ctx *actx;
	INDEX_ROOT *ir;
	INDEX_HEADER *ih;
	INDEX_ENTRY *ie;
	ntfs_index_context *start_ictx;
	u32 new_ilen;
	errno_t err;

	ntfs_debug("Entering.");
	/* Per the locking contract, all index contexts must be unlocked. */
	if (ictx->is_locked)
		panic("%s(): ictx->is_locked\n", __FUNCTION__);
	/*
	 * Start by zeroing the index bitmap bits corresponding to the index
	 * blocks being deallocated.  For simplicity, we just zero the entire
	 * index bitmap attribute.
	 *
	 * The bitmap inode is returned with its lock held exclusive; it is
	 * released on both the success and error exit paths below.
	 */
	err = ntfs_attr_inode_get(ictx->base_ni, AT_BITMAP, idx_ni->name,
			idx_ni->name_len, FALSE, LCK_RW_TYPE_EXCLUSIVE,
			&bmp_ni);
	if (err) {
		ntfs_error(idx_ni->vol->mp, "Failed to get index bitmap inode "
				"(error %d).", err);
		return err;
	}
	/* Snapshot the bitmap data size under the size spin lock. */
	lck_spin_lock(&bmp_ni->size_lock);
	data_size = bmp_ni->data_size;
	lck_spin_unlock(&bmp_ni->size_lock);
	/* Zero the whole bitmap value, deallocating every index block. */
	err = ntfs_attr_set(bmp_ni, 0, data_size, 0);
	if (err) {
		ntfs_error(idx_ni->vol->mp, "Failed to deallocate index "
				"block(s) in index bitmap.");
		goto err;
	}
	/*
	 * We need to get hold of the index root attribute in order to convert
	 * its end entry to a leaf node.
	 */
	err = ntfs_mft_record_map(ictx->base_ni, &m);
	if (err) {
		ntfs_error(idx_ni->vol->mp, "ntfs_mft_record_map() failed "
				"(error %d).", err);
		goto err;
	}
	actx = ntfs_attr_search_ctx_get(ictx->base_ni, m);
	if (!actx) {
		err = ENOMEM;
		goto unm_err;
	}
	/* Find the index root attribute in the mft record. */
	err = ntfs_attr_lookup(AT_INDEX_ROOT, idx_ni->name, idx_ni->name_len,
			0, NULL, 0, actx);
	if (err) {
		if (err == ENOENT) {
			/*
			 * An index must always have an index root attribute
			 * so its absence means on-disk corruption.
			 */
			ntfs_error(idx_ni->vol->mp, "Index root attribute "
					"missing in inode 0x%llx.  Run "
					"chkdsk.",
					(unsigned long long)idx_ni->mft_no);
			err = EIO;
			NVolSetErrors(idx_ni->vol);
		}
		goto put_err;
	}
	/* Get to the index root value. */
	ir = (INDEX_ROOT*)((u8*)actx->a + le16_to_cpu(actx->a->value_offset));
	ih = (INDEX_HEADER*)&ir->index;
	/* The first and last index entry. */
	ie = (INDEX_ENTRY*)((u8*)ih + le32_to_cpu(ih->entries_offset));
	/*
	 * The index is being emptied thus the only entry in the index root
	 * must be the end entry and it must (still) have a sub-node pointer.
	 */
	if (!(ie->flags & INDEX_ENTRY_END))
		panic("%s(): !(ie->flags & INDEX_ENTRY_END)\n", __FUNCTION__);
	if (!(ie->flags & INDEX_ENTRY_NODE))
		panic("%s(): !(ie->flags & INDEX_ENTRY_NODE)\n", __FUNCTION__);
	/*
	 * Remove the sub-node pointer from the index entry and shrink the
	 * index root attribute appropriately.  The sub-node VCN occupies the
	 * trailing sizeof(VCN) bytes of the entry, hence the size reduction.
	 */
	ie->length = cpu_to_le16(le16_to_cpu(ie->length) - sizeof(VCN));
	ie->flags &= ~INDEX_ENTRY_NODE;
	new_ilen = le32_to_cpu(ih->index_length) - sizeof(VCN);
	ih->allocated_size = ih->index_length = cpu_to_le32(new_ilen);
	/* The index now fits entirely inside the index root again. */
	ih->flags &= ~LARGE_INDEX;
	err = ntfs_resident_attr_value_resize(actx->m, actx->a,
			offsetof(INDEX_ROOT, index) + new_ilen);
	/* We are shrinking the index root so the resize cannot fail. */
	if (err)
		panic("%s(): err\n", __FUNCTION__);
	/* Ensure the changes are written to disk. */
	NInoSetMrecNeedsDirtying(actx->ni);
	ntfs_attr_search_ctx_put(actx);
	ntfs_mft_record_unmap(ictx->base_ni);
	/*
	 * Now deal with the index allocation attribute by truncating it to
	 * zero length.
	 *
	 * Note we ignore any errors because it is not essential to resize the
	 * index allocation attribute.  In fact Windows and chkdsk are
	 * perfectly happy with it remaining allocated.  It just means the
	 * index is wasting space on disk and that will be reclaimed when the
	 * index is deleted or when the index is filled again with entries.
	 */
	err = ntfs_attr_resize(idx_ni, 0, 0, ictx);
	if (err)
		ntfs_debug("Failed to truncate index allocation attribute to "
				"zero size thus this index will be wasting "
				"space on disk until it is deleted or "
				"repopulated with entries.");
	/*
	 * Finally, if the index bitmap attribute is non-resident truncate it
	 * to zero length, too.  Again we ignore any errors as it is ok for the
	 * bitmap attribute to be left as it is.
	 *
	 * NOTE(review): the bitmap truncate is only attempted when the index
	 * allocation truncate above succeeded (@err is zero) — presumably to
	 * avoid shrinking the bitmap whilst index allocation blocks remain
	 * allocated; confirm this is the intended behaviour given the
	 * "ignore any errors" wording above.
	 */
	if (!err && NInoNonResident(bmp_ni)) {
		err = ntfs_attr_resize(bmp_ni, 0, 0, ictx);
		if (err)
			ntfs_debug("Failed to truncate index bitmap attribute "
					"to zero size thus this index will be "
					"wasting space on disk until it is "
					"deleted or repopulated with "
					"entries.");
	}
	/* Release the bitmap inode lock and reference taken above. */
	lck_rw_unlock_exclusive(&bmp_ni->lock);
	(void)vnode_put(bmp_ni->vn);
	/*
	 * We no longer have any index blocks allocated so invalidate our cache
	 * of the last set bit.
	 */
	idx_ni->last_set_bit = -1;
	ntfs_debug("Done.");
	return 0;
put_err:
	ntfs_attr_search_ctx_put(actx);
unm_err:
	ntfs_mft_record_unmap(ictx->base_ni);
err:
	/*
	 * Re-allocate the deallocated index block(s).  This is safe because
	 * the index inode mutex is held throughout.
	 *
	 * Walk the circular list of index contexts (via @ictx->up) starting
	 * at @ictx, re-setting the bitmap bit of each index block on the
	 * path, thus undoing the ntfs_attr_set() performed above.
	 */
	start_ictx = ictx;
	do {
		int err2;

		 /*
		  * Skip the index root as it does not have a bit in the
		  * bitmap.
		  */
		if (ictx->is_root)
			continue;
		/*
		 * Convert the node VCN to its bit number in the index bitmap
		 * (bytes per vcn / bytes per index block).
		 */
		err2 = ntfs_bitmap_set_bit(bmp_ni, ictx->vcn <<
				idx_ni->vcn_size_shift >>
				idx_ni->block_size_shift);
		if (err2) {
			ntfs_error(idx_ni->vol->mp, "Failed to undo "
					"deallocation of index block in index "
					"bitmap (error %d) of inode 0x%llx.  "
					"Leaving inconsistent metadata.  Run "
					"chkdsk.", err2,
					(unsigned long long)idx_ni->mft_no);
			NVolSetErrors(idx_ni->vol);
		}
	} while ((ictx = ictx->up) != start_ictx);
	/* Release the bitmap inode lock and reference taken above. */
	lck_rw_unlock_exclusive(&bmp_ni->lock);
	(void)vnode_put(bmp_ni->vn);
	ntfs_error(idx_ni->vol->mp, "Failed (error %d).", err);
	return err;
}
5224
5225/**
5226 * ntfs_index_entry_delete - delete an index entry
5227 * @ictx:	index context describing the index entry to delete
5228 *
5229 * Delete the index entry described by the index context @ictx from the index.
5230 *
5231 * Return 0 on success and errno on error.  A special case is a return code of
5232 * -EAGAIN (negative, not EAGAIN) which means that the B+tree was rearranged
5233 * into a different consistent state to make the deletion possible but now the
5234 * lookup and delete has to be repeated as the index entry to be deleted may
5235 * have changed its position in the tree thus a new lookup is required to be
5236 * able to delete it.  Doing this is not terribly efficient but we are only
5237 * talking about a handful of cases in a single delete of tens of thousands of
5238 * files so it does not matter if that is inefficient.  On the plus side doing
5239 * things this way means we do not need to keep track of the entry to be
5240 * deleted when rearranging the tree which saves time and makes the code much
5241 * simpler.
5242 *
5243 * Note that we do not update the array of index entry pointers nor the number
5244 * of entries in the array because on success it means that the index entry has
5245 * been removed and the function is done, i.e. the array of index entry
 * pointers will not be used any more and on error the index context becomes
5247 * invalid so there is no need to update any of the pointers.
5248 *
5249 * Locking: - Caller must hold @ictx->idx_ni->lock on the index inode for
5250 *	      writing.
5251 *	    - The index context @ictx must be locked.
5252 */
5253int ntfs_index_entry_delete(ntfs_index_context *ictx)
5254{
5255	leVCN vcn;
5256	ntfs_inode *idx_ni = ictx->idx_ni;
5257	ntfs_index_context *pred_ictx, *parent_ictx, *suc_ictx, *put_ictx;
5258	ntfs_index_context *deallocate_ictx;
5259	INDEX_HEADER *ih;
5260	INDEX_ENTRY *ie, *next_ie, *pull_down_entry;
5261	MFT_RECORD *m;
5262	unsigned ie_size, pred_ie_size, pull_down_entry_nr, pull_down_ie_size;
5263	unsigned old_parent_entry_nr, old_parent_ie_size, move_ie_size;
5264	errno_t err;
5265	u32 new_ilen;
5266	BOOL is_root = ictx->is_root;
5267
5268	ntfs_debug("Entering.");
5269	if (!ictx->is_locked)
5270		panic("%s(): !ictx->is_locked\n", __FUNCTION__);
5271	deallocate_ictx = parent_ictx = put_ictx = NULL;
5272	ih = ictx->index;
5273	ie = ictx->entry;
5274	/* We cannot delete end entries as they do not contain any key/data. */
5275	if (ie->flags & INDEX_ENTRY_END)
5276		panic("%s(): ie->flags & INDEX_ENTRY_END\n", __FUNCTION__);
5277	ie_size = le16_to_cpu(ie->length);
5278	next_ie = (INDEX_ENTRY*)((u8*)ie + ie_size);
5279	/*
5280	 * There are two types of index entry: Either it is a leaf entry, i.e.
5281	 * it is in a leaf node and thus has no children or it is in an index
5282	 * node entry, i.e. it is in an index block node and thus has children
5283	 * which we need to deal with.  If it is a leaf entry, skip onto leaf
5284	 * entry deletion.
5285	 */
5286	if (!(ie->flags & INDEX_ENTRY_NODE))
5287		goto delete_leaf_entry;
5288	/*
5289	 * The node the entry to be deleted is in has sub-node(s), i.e. it is
5290	 * not a leaf node, and the entry to be deleted is not a leaf entry.
5291	 * We have to replace it with its predecessor (which by definition is a
5292	 * leaf entry).  There are three cases of increasing complexity:
5293	 *
5294	 * 1) The simplest case is when the predecessor is not the only entry
5295	 * in its (leaf) node thus it can be simply removed without any tree
5296	 * rebalancing needing to be done on its node and in addition the
5297	 * predecessor entry can replace the entry to be deleted without
5298	 * overflowing the node, i.e. there is enough space in the node we are
5299	 * deleting the entry from to fit its predecessor entry thus it is a
5300	 * matter of a simple replace without the need to change anything else
5301	 * in the tree.
5302	 *
5303	 * 2) The slightly more complicated case is the same as above case 1)
5304	 * except that there is not enough space in the node the entry to be
5305	 * deleted is in to fit its predecessor entry, thus we need to split
5306	 * the node the entry is being deleted from and promote the median
5307	 * entry to the parent node which may then overflow the parent node and
5308	 * so on up to the root node.  A particular annoyance here is that
5309	 * depending on the implementation details it is possible for the entry
5310	 * to be deleted to be the median entry and thus be promoted to its
5311	 * parent or alternatively it is possible for its predecessor entry
5312	 * that is to replace the entry to be deleted to have to be promoted.
5313	 * A further pitfall is that if the entry to be deleted is behind the
5314	 * median entry, i.e. it is on the right of the median entry, then it
5315	 * will be moved to a different node (the new right-hand node to the
5316	 * old node being split) thus the replace needs to happen in a
5317	 * different place.  If we use ntfs_index_entry_add() as it is to take
5318	 * care of the split&promote on insertion this last case would cause us
5319	 * the problem that the pointers would be all out of date.  We would
5320	 * need ntfs_index_entry_add() to update the pointers and to switch the
5321	 * right and left nodes in that case.  Perhaps we need an
5322	 * ntfs_index_entry_replace() that can share a lot of code with
5323	 * ntfs_index_entry_add() perhaps even just a flag to
5324	 * ntfs_index_entry_add() to indicate replace?  Then we would not need
5325	 * to worry about the pointers becoming out of date as the replace
5326	 * would be done already for us.
5327	 *
5328	 * 3) The more complicated case is when the predecessor entry is the
5329	 * last entry in its (leaf) node thus we cannot use it to replace the
5330	 * entry to be deleted without rebalancing the tree.  In this case we
5331	 * have to rebalance the tree so that the predecessor entry is no
5332	 * longer the last entry in its (leaf) node.  Once that is successfully
5333	 * done we have reduced the problem to either case 1) or case 2) above
5334	 * or in fact to a completely different case (see below).  The benefit
5335	 * of doing the rebalancing first without regard to the delete and
5336	 * replace that are to take place is that the tree ends up in a
5337	 * consistent state, just a different one to what it was before, thus
5338	 * we do not need to rollback to the original state any more thus error
5339	 * handling is greatly simplified.  The pitfall here is that doing the
5340	 * balancing may profoundly change the tree to the extent that the
5341	 * entry to be deleted may be moved to somewhere completely different
5342	 * which we somehow need to be able to cope with.  This can have
5343	 * positive side effects, too, as it for example can lead to the entry
5344	 * to be deleted being turned into a leaf entry thus its deletion turns
5345	 * into a delete of a leaf entry thus the planned replace does not need
5346	 * to happen at all.  This is the "completely different case" mentioned
5347	 * above.  Usually this case will have been caused by a merge of two
5348	 * neighbouring nodes with pulldown of the parent entry (which happens
5349	 * to be the entry to be deleted) thus the entry to be deleted will not
5350	 * only be a leaf entry but it will also not be the last entry in the
5351	 * leaf node thus it can be deleted with a simple delete.  Otherwise it
5352	 * is not a disaster and we just need to rebalance the tree again as we
5353	 * did just now but this time making sure that the entry to be deleted
5354	 * is not the only entry in the node (compare to above where we were
5355	 * making sure that the leaf predecessor entry is not the only entry in
5356	 * its leaf node) and then we can proceed with a simple delete of the
5357	 * leaf node.  Because the tree is balanced at all steps we do not
5358	 * actually care about handling the delete of the entry in the case
5359	 * that it turned into a leaf entry and we just leave it to the leaf
5360	 * entry deletion code to deal with.
5361	 *
5362	 * That is quite enough description now, let the games begin.  First,
5363	 * investigate the index entry to be deleted and its predecessor to
5364	 * determine which of the above categories the deletion falls into.
5365	 *
5366	 * Locate the predecessor entry.  Because the predecessor may be the
5367	 * only entry in its (leaf) node in which case it would cause the tree
5368	 * to be rebalanced, we record the path taken to the predecessor entry
5369	 * by simply extending the existing tree path that points to the entry
5370	 * to be deleted.
5371	 */
5372	err = ntfs_index_lookup_predecessor(ictx, &pred_ictx);
5373	if (err) {
5374		ntfs_error(idx_ni->vol->mp, "Failed to look up predecessor of "
5375				"index node entry to be deleted (error %d).",
5376				err);
5377		return err;
5378	}
5379	/*
5380	 * If the predecessor is not the only entry other than the end entry in
5381	 * its node we can simply remove it from its node and use it to replace
5382	 * the entry to be deleted.
5383	 */
5384	if (pred_ictx->nr_entries > 2)
5385		goto simple_replace;
5386	if (pred_ictx->nr_entries < 2)
5387		panic("%s(): pred_ictx->nr_entries < 2\n", __FUNCTION__);
5388	/*
5389	 * The predecessor is the only entry (other than the end entry) in its
5390	 * node thus we cannot simply remove it from its node as that would
5391	 * cause the B+tree to become imbalanced.
5392	 *
5393	 * There are two different cases we need to consider for which we need
5394	 * to look up the predecessor of the current predecessor leaf entry.
5395	 * To do this we ascend the tree looking for a node with entries in it.
5396	 * The first node containing any entries is the node containing the
5397	 * predecessor entry (which is not a leaf entry).
5398	 *
5399	 * 1) If we reach the original node containing the entry to be deleted
5400	 * without finding any non-empty nodes on the way the whole sub-tree
5401	 * below the entry to be deleted will become empty after using the
5402	 * predecessor leaf entry to replace the entry to be deleted.  Thus we
5403	 * can deallocate all nodes in this sub-tree and instead of replacing
5404	 * the entry to be deleted we remove it altogether and we then move the
5405	 * predecessor leaf entry that is now homeless in front of its new
5406	 * successor leaf entry, thus effectively merging the node containing
5407	 * the predecessor with its right-hand sibling.
5408	 *
5409	 * Note that the insertion of the predecessor leaf entry into the node
5410	 * containing its successor leaf entry may require a split of the node
	 * we are inserting into.
5412	 *
5413	 * 2) If we find a predecessor for the predecessor entry before we
5414	 * reach the original node containing the entry to be deleted we break
5415	 * what we need to do into two operations.  First, we rebalance the
5416	 * tree so that the predecessor entry of the entry to be deleted is no
5417	 * longer the only entry in its leaf node.  This simplifies the
5418	 * deletion of the entry to a case we have already solved as we can now
5419	 * simply remove the predecessor entry from its node and use it to
5420	 * replace the entry to be deleted.  Thus, second, we use the already
5421	 * existing code to do a simple replace.  Should that fail it does not
5422	 * matter as we are leaving a fully consistent B+tree tree behind.
5423	 * Breaking this case up into two means we incur a little bit of
5424	 * overhead for the extra locking of the predecessor node and for the
5425	 * extra memmove()s and memcpy() of the predecessor entry.  But this
5426	 * simplifies error handling and makes the code so much simpler that it
5427	 * is definitely worth paying the price in overhead.
5428	 *
5429	 * Enough discussion, time to do it.  Start by going up the tree
5430	 * looking for a non-empty node or the original node containing the
5431	 * entry to be deleted so we can determine if we have the above case 1
5432	 * or 2.
5433	 *
5434	 * We must be at the bottom of the current tree path or things will get
5435	 * very confused.
5436	 */
5437	if (!pred_ictx->down->is_root)
5438		panic("%s(): !pred_ictx->down->is_root\n", __FUNCTION__);
5439	parent_ictx = pred_ictx->up;
5440	put_ictx = deallocate_ictx = pred_ictx;
5441	ntfs_index_ctx_disconnect_reinit(pred_ictx);
5442	/*
5443	 * Make a note of the size of the predecessor entry that we are going
5444	 * to move and unlock it for now.
5445	 */
5446	move_ie_size = le16_to_cpu(pred_ictx->entry->length);
5447	ntfs_index_ctx_unlock(pred_ictx);
5448	/* Now ascend the tree until we find a non-empty node. */
5449	while (parent_ictx->nr_entries <= 1) {
5450		if (parent_ictx->nr_entries != 1)
5451			panic("%s(): parent_ictx->nr_entries != 1\n",
5452					__FUNCTION__);
5453		/*
5454		 * Empty index node.  Move it to the list of index nodes to be
5455		 * deallocated and proceed to its parent.
5456		 */
5457		pred_ictx = parent_ictx;
5458		parent_ictx = parent_ictx->up;
5459		ntfs_index_ctx_move(pred_ictx, deallocate_ictx);
5460	}
5461	/*
5462	 * We have a non-empty parent node.  Check whether this is the node
5463	 * containing the entry to be deleted (case 1) or whether it is an
5464	 * intervening node (case 2).  In the latter case we need to rearrange
5465	 * the tree.
5466	 */
5467	if (parent_ictx != ictx)
5468		goto rearrange_tree;
5469	/*
5470	 * Now that we know we have case 1, we need to find the leaf node
5471	 * containing the successor entry of the entry to be deleted.  To do
5472	 * this we repoint the tree path to the entry on the right-hand side of
5473	 * the entry to be deleted and descend down into the first entry of
5474	 * each node until we reach the leaf-node.
5475	 *
5476	 * We need to lock the node containing the entry to be deleted and make
5477	 * a note of it as we are going to delete it later.
5478	 */
5479	err = ntfs_index_ctx_relock(parent_ictx);
5480	if (err) {
5481		ntfs_error(idx_ni->vol->mp, "Failed to relock the parent "
5482				"index node (error %d).", err);
5483		goto put_err;
5484	}
5485	/*
5486	 * NOTE: pull_down_* == to be deleted (we are just reusing existing
5487	 * code and variables hence the (mis)naming...
5488	 */
5489	pull_down_entry = parent_ictx->follow_entry;
5490	pull_down_entry_nr = parent_ictx->entry_nr;
5491	/*
5492	 * Repoint the path to the entry on the right-hand side of the entry to
5493	 * be deleted so we can descend to the leaf node containing the
5494	 * successor entry.
5495	 */
5496	parent_ictx->follow_entry = (INDEX_ENTRY*)((u8*)pull_down_entry +
5497			le16_to_cpu(pull_down_entry->length));
5498	parent_ictx->entry_nr++;
5499	/* Finally descend to the leaf node containing the successor entry. */
5500	suc_ictx = parent_ictx;
5501	do {
5502		err = ntfs_index_descend_into_child_node(&suc_ictx);
5503		if (err) {
5504			ntfs_error(idx_ni->vol->mp, "Failed to descend to "
5505					"node containing successor entry "
5506					"(error %d).", err);
5507			goto put_err;
5508		}
5509		if (!suc_ictx->is_locked)
5510			panic("%s(): !suc_ictx->is_locked\n", __FUNCTION__);
5511		suc_ictx->follow_entry = suc_ictx->entries[0];
5512	} while (suc_ictx->index->flags & INDEX_NODE);
5513	if (suc_ictx->follow_entry->flags & INDEX_ENTRY_NODE)
5514		panic("%s(): suc_ictx->follow_entry->flags & "
5515				"INDEX_ENTRY_NODE\n", __FUNCTION__);
5516	/*
5517	 * Can the predecessor entry simply be inserted in front of the
5518	 * successor leaf entry without overflowing the node?  If not we need
5519	 * to split the node and then restart the delete.
5520	 */
5521	if (move_ie_size > suc_ictx->bytes_free) {
5522split_and_restart_case_1:
5523		ntfs_debug("Splitting index node before add on delete (case "
5524				"1).");
5525		ie_size = move_ie_size;
5526		goto split_and_restart;
5527	}
5528	/*
5529	 * The predecessor entry can simply be inserted in front of the
5530	 * successor leaf entry.
5531	 *
5532	 * We need to have locked both nodes to be able to do the insert thus
5533	 * we may need to unlock the locked successor node to ensure that the
5534	 * node lock is always taken in ascending page offset order so we avoid
5535	 * deadlocks.  Also we have to consider the case where the two index
5536	 * nodes are in the same page in which case we need to share the index
5537	 * lock.
5538	 */
5539	if (deallocate_ictx->is_locked)
5540		panic("%s(): deallocate_ictx->is_locked\n", __FUNCTION__);
5541	err = ntfs_index_ctx_lock_two(suc_ictx, deallocate_ictx);
5542	if (err)
5543		goto put_err;
5544	/*
5545	 * We need to re-check the amount of free space as it can have changed
5546	 * whilst we dropped the lock on @suc_ictx if that was necessary.  When
5547	 * this happens simply drop the lock on @deallocate_ictx if we took it
5548	 * and go back to the previous code dealing with the not enough space
5549	 * case.
5550	 */
5551	if (move_ie_size > suc_ictx->bytes_free) {
5552		if (deallocate_ictx->is_locked)
5553			ntfs_index_ctx_unlock(deallocate_ictx);
5554		goto split_and_restart_case_1;
5555	}
5556	/*
5557	 * Both nodes are locked and this is a simple insert, so go ahead and
5558	 * do it.  We have already checked that there is enough space so it
5559	 * cannot fail.
5560	 *
5561	 * Move all the entries out of the way to make space for the
5562	 * predecessor entry to be copied in.
5563	 */
5564	err = ntfs_index_node_make_space(suc_ictx, move_ie_size);
5565	if (err)
5566		panic("%s(): err\n", __FUNCTION__);
5567	/*
5568	 * Copy the predecessor index entry into the created space in front of
5569	 * the successor entry.
5570	 */
5571	memcpy(suc_ictx->follow_entry, deallocate_ictx->entry, move_ie_size);
5572	/* Ensure the updates are written to disk. */
5573	ntfs_index_entry_mark_dirty(suc_ictx);
5574	/*
5575	 * We are done both with the predecessor and successor nodes so release
5576	 * their locks.
5577	 */
5578	if (deallocate_ictx->is_locked)
5579		ntfs_index_ctx_unlock(deallocate_ictx);
5580	if (!suc_ictx->is_locked)
5581		panic("%s(): !suc_ictx->is_locked\n", __FUNCTION__);
5582	ntfs_index_ctx_unlock(suc_ictx);
5583	/*
5584	 * We now have to lock the original node containing the entry to be
5585	 * deleted and we need to switch it to be the current index context and
5586	 * to point to the entry to be deleted.  Note this is a simple delete
5587	 * just like for a leaf entry as we have taken care of its child leaf
5588	 * node(s) already and will deallocate it(them) later.
5589	 */
5590	if (parent_ictx->is_locked)
5591		panic("%s(): parent_ictx->is_locked\n", __FUNCTION__);
5592	err = ntfs_index_ctx_relock(parent_ictx);
5593	if (err) {
5594		ntfs_error(idx_ni->vol->mp, "Failed to relock the parent "
5595				"index node (error %d).", err);
5596		/*
5597		 * Undo the insertion of the predecessor entry in front of the
5598		 * successor entry by moving the successor and all following
5599		 * entries back into their old place and resetting the index
5600		 * length to the old size.
5601		 */
5602		err = ntfs_index_ctx_relock(suc_ictx);
5603		if (err) {
5604			ntfs_error(idx_ni->vol->mp, "Failed to relock index "
5605					"context (error %d).  Leaving corrupt "
5606					"index.  Run chkdsk.", err);
5607			NVolSetErrors(idx_ni->vol);
5608		} else {
5609			/*
5610			 * @suc_ictx cannot be the index root or the below
5611			 * code would be wrong.
5612			 */
5613			if (suc_ictx->is_root)
5614				panic("%s(): suc_ctx->is_root\n", __FUNCTION__);
5615			ie = suc_ictx->follow_entry;
5616			ie_size = le16_to_cpu(ie->length);
5617			next_ie = (INDEX_ENTRY*)((u8*)ie + ie_size);
5618			ih = suc_ictx->index;
5619			new_ilen = le32_to_cpu(ih->index_length);
5620			memmove(ie, next_ie, new_ilen - ((u8*)next_ie -
5621					(u8*)ih));
5622			ih->index_length = cpu_to_le32(new_ilen - ie_size);
5623			ntfs_index_entry_mark_dirty(suc_ictx);
5624		}
5625		goto put_err;
5626	}
5627	ictx = parent_ictx;
5628	parent_ictx->entry = parent_ictx->entries[pull_down_entry_nr];
5629	parent_ictx->entry_nr = pull_down_entry_nr;
5630	goto prep_simple_delete;
5631rearrange_tree:
5632	/*
5633	 * We now know we have case 2, i.e. we have a non-empty, intervening
5634	 * node containing the predecessor (non-leaf) entry of the predecessor
5635	 * entry of the entry to be deleted and we need to use it to rearrange
5636	 * the tree such that the (leaf) predecessor entry of the entry to be
5637	 * deleted no longer is the only entry in its (leaf) node.
5638	 *
5639	 * To do this we merge the (leaf) node containing the predecessor entry
5640	 * of the entry to be deleted with its left-hand sibling (leaf) node.
5641	 * This involves pulling the (non-leaf) predecessor entry of the (leaf)
5642	 * predecessor entry of the entry to be deleted down to behind its
5643	 * (leaf) predecessor entry.
5644	 *
5645	 * Thus, we need to now locate the leaf node containing the predecessor
5646	 * entry behind which we need to insert the two predecessor entries.
5647	 *
5648	 * To be able to do this we need to lock the node containing the (non-
5649	 * leaf) predecessor of the predecessor of the entry to be deleted and
5650	 * make a note of it as we are going to delete it later.  We also need
5651	 * to repoint the tree path to descend into the (non-leaf) predecessor
5652	 * entry.
5653	 */
5654	if (parent_ictx->is_locked)
5655		panic("%s(): parent_ictx->is_locked\n", __FUNCTION__);
5656	if (parent_ictx->is_root)
5657		panic("%s(): parent_ictx->is_root\n", __FUNCTION__);
5658	err = ntfs_index_ctx_relock(parent_ictx);
5659	if (err)
5660		goto put_err;
5661	parent_ictx->entry_nr--;
5662	parent_ictx->follow_entry = parent_ictx->entries[parent_ictx->entry_nr];
5663	pull_down_ie_size = le16_to_cpu(parent_ictx->follow_entry->length) -
5664			sizeof(VCN);
5665	/* Now descend to the leaf predecessor entry. */
5666	suc_ictx = parent_ictx;
5667	do {
5668		err = ntfs_index_descend_into_child_node(&suc_ictx);
5669		if (err) {
5670			ntfs_error(idx_ni->vol->mp, "Failed to descend to "
5671					"node containing leaf predecessor "
5672					"entry of non-leaf predecessor entry "
5673					"of leaf predecessor entry of entry "
5674					"to be deleted (error %d).", err);
5675			goto put_err;
5676		}
5677		if (!suc_ictx->is_locked)
5678			panic("%s(): !suc_ictx->is_locked\n", __FUNCTION__);
5679		suc_ictx->entry_nr = suc_ictx->nr_entries - 1;
5680		suc_ictx->follow_entry = suc_ictx->entries[suc_ictx->entry_nr];
5681	} while (suc_ictx->index->flags & INDEX_NODE);
5682	if (suc_ictx->follow_entry->flags & INDEX_ENTRY_NODE)
5683		panic("%s(): suc_ictx->follow_entry->flags & "
5684				"INDEX_ENTRY_NODE\n", __FUNCTION__);
5685	/*
5686	 * Can the entry to be pulled down and the original predecessor entry
5687	 * to be merged in simply be inserted after the located predecessor
5688	 * entry without overflowing the node?  If not we need to split the
5689	 * node and then restart the delete.
5690	 */
5691	if (pull_down_ie_size + move_ie_size > suc_ictx->bytes_free) {
5692split_and_restart_case_2:
5693		ntfs_debug("Splitting index node before add on delete (case "
5694				"2).");
5695		ie_size = pull_down_ie_size + move_ie_size;
5696		goto split_and_restart;
5697	}
5698	/*
5699	 * We know we have enough space so we cannot fail.  Need to lock the
5700	 * non-leaf predecessor that we want to pull down.
5701	 */
5702	if (parent_ictx->is_locked)
5703		panic("%s(): parent_ictx->is_locked\n", __FUNCTION__);
5704	err = ntfs_index_ctx_lock_two(suc_ictx, parent_ictx);
5705	if (err)
5706		goto put_err;
5707	/*
5708	 * We need to re-check the amount of free space as it can have changed
5709	 * whilst we dropped the lock on @suc_ictx if that was necessary.  When
5710	 * this happens simply drop the lock on @parent_ictx if we took it and
5711	 * go back to the previous code dealing with the not enough space case.
5712	 */
5713	if (pull_down_ie_size + move_ie_size > suc_ictx->bytes_free) {
5714		if (parent_ictx->is_locked)
5715			ntfs_index_ctx_unlock(parent_ictx);
5716		goto split_and_restart_case_2;
5717	}
5718	/*
5719	 * Make enough space in the destination leaf node to accommodate both
5720	 * the entries.
5721	 */
5722	err = ntfs_index_node_make_space(suc_ictx, pull_down_ie_size +
5723			move_ie_size);
5724	if (err)
5725		panic("%s(): err\n", __FUNCTION__);
5726	/*
5727	 * Copy the non-leaf predecessor entry into place and convert it to a
5728	 * leaf entry.
5729	 */
5730	ie = suc_ictx->follow_entry;
5731	pull_down_entry = parent_ictx->follow_entry;
5732	memcpy(ie, pull_down_entry, pull_down_ie_size);
5733	ie->length = cpu_to_le16(pull_down_ie_size);
5734	ie->flags &= ~INDEX_ENTRY_NODE;
5735	/*
5736	 * Now delete the entry from its original node and transfer its VCN
5737	 * sub-node pointer to the next entry (which is the end entry).
5738	 */
5739	vcn = *(leVCN*)((u8*)pull_down_entry + pull_down_ie_size);
5740	ih = parent_ictx->index;
5741	new_ilen = le32_to_cpu(ih->index_length) - (pull_down_ie_size +
5742			sizeof(VCN));
5743	memmove(pull_down_entry, (u8*)pull_down_entry + pull_down_ie_size +
5744			sizeof(VCN), new_ilen - ((u8*)pull_down_entry -
5745			(u8*)ih));
5746	*(leVCN*)((u8*)pull_down_entry + le16_to_cpu(pull_down_entry->length) -
5747			sizeof(VCN)) = vcn;
5748	/* Update the index size, too. */
5749	ih->index_length = cpu_to_le32(new_ilen);
5750	/* Ensure the updates are written to disk. */
5751	ntfs_index_entry_mark_dirty(parent_ictx);
5752	/* Update the index context as well. */
5753	parent_ictx->bytes_free += pull_down_ie_size + sizeof(VCN);
5754	parent_ictx->nr_entries--;
5755	/* We are done with the non-leaf predecessor node so unlock it. */
5756	if (parent_ictx->is_locked)
5757		ntfs_index_ctx_unlock(parent_ictx);
5758	/*
5759	 * Now need to move over the original leaf predecessor entry.  Lock its
5760	 * node again.
5761	 */
5762	if (deallocate_ictx->is_locked)
5763		panic("%s(): deallocate_ictx->is_locked\n", __FUNCTION__);
5764	err = ntfs_index_ctx_lock_two(suc_ictx, deallocate_ictx);
5765	if (err)
5766		goto put_err;
5767	/* Copy the original leaf predecessor entry into place. */
5768	memcpy((u8*)suc_ictx->follow_entry + pull_down_ie_size,
5769			deallocate_ictx->follow_entry, move_ie_size);
5770	/* Ensure the updates are written to disk. */
5771	ntfs_index_entry_mark_dirty(suc_ictx);
5772	/* We are finished rearranging the tree so unlock both nodes. */
5773	ntfs_index_ctx_unlock(suc_ictx);
5774	if (deallocate_ictx->is_locked)
5775		ntfs_index_ctx_unlock(deallocate_ictx);
5776	/*
5777	 * The last thing to do is to deallocate the disconnected index node(s)
5778	 * and then we can restart the delete operation which now involves a
5779	 * simple replace to complete the delete.
5780	 */
5781	parent_ictx = deallocate_ictx;
5782	do {
5783		if (parent_ictx->is_root)
5784			panic("%s(): parent_ictx->is_root\n", __FUNCTION__);
5785		err = ntfs_index_block_free(parent_ictx);
5786		if (err) {
5787			ntfs_error(idx_ni->vol->mp, "Failed to deallocate no "
5788					"longer used index block vcn 0x%llx "
5789					"(error %d).  Run chkdsk to recover "
5790					"the lost index block.",
5791					(unsigned long long)parent_ictx->vcn,
5792					err);
5793			NVolSetErrors(idx_ni->vol);
5794		}
5795	} while ((parent_ictx = parent_ictx->up) != deallocate_ictx);
5796	/*
5797	 * Release the sub-tree paths that the caller no longer has a reference
5798	 * to.
5799	 */
5800	ntfs_index_ctx_put(put_ictx);
5801	/* Reset the state machine to original values. */
5802	put_ictx = deallocate_ictx = NULL;
5803	/*
5804	 * The tree is now fully consistent, no nodes are locked, and @suc_ictx
5805	 * is the leaf node containing the predecessor entry of the entry to
5806	 * be deleted and the predecessor entry no longer is the only entry
5807	 * other than the end entry thus it can simply be used to replace the
5808	 * entry to be deleted.
5809	 *
5810	 * Setup everything so we can jump straight into the simple replace
5811	 * code.  Note we lock the original entry to be deleted as well so that
5812	 * the simple replace code does not have to drop the lock on the
5813	 * predecessor node in order to lock the original node which it may
5814	 * have to do otherwise.
5815	 *
5816	 * The simple replace code requires the predecessor to be in
5817	 * @pred_ictx.
5818	 */
5819	pred_ictx = suc_ictx;
5820	err = ntfs_index_ctx_lock_two(suc_ictx, ictx);
5821	if (err)
5822		return err;
5823	/*
5824	 * Repoint the predecessor context to point to the correct entry and
5825	 * update it to reflect the addition of the two index entries.
5826	 */
5827	pred_ictx->entry = (INDEX_ENTRY*)((u8*)pred_ictx->entry +
5828			pull_down_ie_size);
5829	pred_ictx->entry_nr++;
5830	pred_ictx->is_match = 1;
5831	pred_ictx->bytes_free -= pull_down_ie_size + move_ie_size;
5832	pred_ictx->nr_entries += 2;
5833	if (pred_ictx->nr_entries > pred_ictx->max_entries)
5834		panic("%s(): pred_ictx->nr_entries > pred_ictx->max_entries\n",
5835				__FUNCTION__);
5836	pred_ictx->entries[pred_ictx->entry_nr] = pred_ictx->entry;
5837	pred_ictx->entries[pred_ictx->entry_nr + 1] = (INDEX_ENTRY*)(
5838			(u8*)pred_ictx->entry + move_ie_size);
5839	/* Everything is set up.  Do the simple replace. */
5840	/* goto simple_replace; */
5841simple_replace:
5842	/*
5843	 * The predecessor can simply be removed from its node.
5844	 *
5845	 * We need to have locked both nodes to be able to do the replace thus
5846	 * we may need to unlock the locked predecessor node to ensure that
5847	 * page locks are only taken in ascending page offset order so we avoid
5848	 * deadlocks.
5849	 */
5850	if (!pred_ictx->is_locked)
5851		panic("%s(): !pred_ictx->is_locked\n", __FUNCTION__);
5852	if (!ictx->is_locked) {
5853		err = ntfs_index_ctx_lock_two(pred_ictx, ictx);
5854		if (err)
5855			return err;
5856	}
5857	/*
5858	 * Can the predecessor simply replace the entry to be deleted without
5859	 * overflowing the node containing the entry to be deleted?  If not we
5860	 * need to split the node and then restart the delete.
5861	 */
5862	pred_ie_size = le16_to_cpu(pred_ictx->entry->length);
5863	ie_size = le16_to_cpu(ictx->entry->length) - sizeof(VCN);
5864	if ((int)pred_ie_size - (int)ie_size > (int)ictx->bytes_free) {
5865		ntfs_debug("Splitting index node before add on delete (case "
5866				"3).");
5867		/*
5868		 * Drop the lock on the predecessor or transfer it to the node
5869		 * containing the entry to be deleted if they share the same
5870		 * page (in which case @ictx is not marked locked).
5871		 */
5872		if (!pred_ictx->is_locked)
5873			panic("%s(): !pred_ictx->is_locked\n", __FUNCTION__);
5874		if (ictx->is_locked) {
5875			/*
5876			 * @ictx is locked thus it cannot be sharing the lock
5877			 * with @pred_ictx.  Unlock @pred_ictx.
5878			 */
5879			ntfs_index_ctx_unlock(pred_ictx);
5880		} else {
5881			/*
5882			 * @ictx is not locked thus it must be sharing the lock
5883			 * with @pred_ictx.  Transfer the lock across.
5884			 */
5885			if (ictx->is_root)
5886				panic("%s(): ictx->is_root\n", __FUNCTION__);
5887			if (ictx->upl_ofs != pred_ictx->upl_ofs)
5888				panic("%s(): ictx->upl_ofs != "
5889						"pred_ictx->upl_ofs\n",
5890						__FUNCTION__);
5891			ictx->is_locked = 1;
5892			pred_ictx->is_locked = 0;
5893		}
5894		if (!ictx->is_locked)
5895			panic("%s(): !ictx->is_locked\n", __FUNCTION__);
5896		suc_ictx = ictx;
5897		ie_size = pred_ie_size - ie_size;
5898		goto split_and_restart;
5899	}
5900	/*
5901	 * Both nodes are locked and this is a simple replace, so go ahead and
5902	 * do it.  We have already checked that there is enough space so it
5903	 * cannot fail.
5904	 */
5905	err = ntfs_index_entry_replace(ictx, pred_ictx);
5906	if (err)
5907		panic("%s(): err\n", __FUNCTION__);
5908	/* We are done with the current entry so release it. */
5909	if (ictx->is_locked)
5910		ntfs_index_ctx_unlock(ictx);
5911	/*
5912	 * We have now replaced the entry to be deleted but at the moment we
5913	 * have two copies of the predecessor entry, so delete the original
5914	 * copy from its (leaf) node.  We need to switch the predecessor index
5915	 * context to be the current context so we can just pretend it is the
5916	 * entry to be deleted.  We only need to setup the fields relevant to
5917	 * index block node deletion as the predecessor cannot be in the index
5918	 * root node.
5919	 */
5920	ictx = pred_ictx;
5921	goto prep_simple_delete;
5922delete_leaf_entry:
5923	/*
5924	 * If the entry is in the index root or it is not the only entry (other
5925	 * than the end entry) in the leaf node it can simply be removed.
5926	 */
5927	if (ictx->nr_entries > 2 || is_root)
5928		goto simple_delete;
5929	/*
5930	 * The entry to be deleted is the only entry and it is not in the index
5931	 * root.  We have to rebalance the tree so that the entry to be deleted
5932	 * is no longer the only entry in its node.  There are several cases we
5933	 * need to consider:
5934	 *
5935	 * 1) If the parent entry is not the end entry of its index node then
5936	 *    simply insert our parent entry at the front of the leaf node on
5937	 *    our right, i.e. the child node of the entry following our parent
5938	 *    entry.  Then remove our parent entry from our parent node and
5939	 *    free the leaf node the entry to be deleted is in (thus deleting
5940	 *    the entry).
5941	 *
5942	 *    What the above describes is effectively merging the leaf node
5943	 *    containing the entry to be deleted with its right-hand sibling
5944	 *    leaf node and in the process pulling down our parent entry into
5945	 *    the merged node and placing it in front of its successor entry.
5946	 *
5947	 *    Our parent node is of course turned from an index node entry to a
5948	 *    leaf entry as it is pulled down.
5949	 *
5950	 *    Note how if our parent entry is the only entry other than the end
5951	 *    entry in its node this results in our parent node being empty,
5952	 *    i.e. containing only the end entry.  This is fine for NTFS
5953	 *    B+trees, i.e. index nodes may be empty.  It is only leaf nodes
5954	 *    that may not be empty.
5955	 *
5956	 * 2) As point 1) above but the parent entry of the entry to be removed
5957	 *    does not fit in the leaf node containing its successor thus that
5958	 *    node needs to be split and the split may need to propagate up the
5959	 *    tree which is of course the job of ntfs_index_entry_add().
5960	 *
5961	 * 3) If the parent entry is the end entry of its index node, i.e.
5962	 *    there is no right-hand sibling leaf node then simply insert the
5963	 *    entry on the left of our parent entry at the end of its own child
5964	 *    node, i.e. the leaf node on our left.  Then remove the entry on
5965	 *    the left of our parent entry that we inserted into its child node
5966	 *    from our parent node and change the VCN sub-node pointer of our
5967	 *    parent node to point to our left-hand sibling.  Finally free the
5968	 *    leaf node the entry to be deleted is in which is no longer
5969	 *    referenced from anywhere (thus deleting the entry).
5970	 *
5971	 *    What the above describes is effectively merging the leaf node
5972	 *    containing the entry to be deleted with its left-hand sibling
5973	 *    leaf node and in the process pulling down the entry on the
5974	 *    left-hand side of our parent into the merged node and placing it
5975	 *    after its predecessor entry.
5976	 *
5977	 *    The left-hand sibling of our parent node is of course turned from
5978	 *    an index node entry to a leaf entry as it is pulled down.
5979	 *
5980	 *    Note how if the left-hand sibling of our parent entry is the only
5981	 *    entry other than the end entry, which is also our parent entry,
5982	 *    in its node this results in our parent node being empty, i.e.
5983	 *    containing only the end entry.  This is fine for NTFS B+trees,
5984	 *    i.e. index nodes may be empty.  It is only leaf nodes that may
5985	 *    not be empty.
5986	 *
5987	 * 4) As point 3) above but the left-hand sibling of the parent entry
5988	 *    of the entry to be removed does not fit in the leaf node
5989	 *    containing its predecessor thus that node needs to be split and
5990	 *    the split may need to propagate up the tree which is of course
5991	 *    the job of ntfs_index_entry_add().
5992	 *
5993	 * 5) If the parent entry is the end entry of its index node, i.e.
5994	 *    there is no right-hand sibling leaf node, and there is no entry
5995	 *    on the left of our parent entry, i.e. there is no left-hand
5996	 *    sibling leaf node, i.e. the parent node is completely empty, we
5997	 *    cannot merge with a sibling as there is none thus we need to
5998	 *    record the current state and repeat a very similar procedure as
5999	 *    above points 1-4 except applied to the parent node which is not
6000	 *    a leaf node so we need to be careful with VCN sub-node pointers.
6001	 *    The aim of doing this is to merge the parent node with one of its
6002	 *    siblings so that it is no longer empty so that our leaf node
6003	 *    containing the entry to be deleted has at least one sibling so
6004	 *    that we can merge the leaf node with one of its siblings.  If the
6005	 *    parent node of the parent node has no siblings either we need to
6006	 *    push the recorded state down into the stack of recorded states,
6007	 *    record the new current state, and then go to the next parent and
6008	 *    try to merge that with its sibling and so on until we find a
6009	 *    parent with a sibling that can be merged.  There are two possible
6010	 *    outcomes (not counting errors):
6011	 *    a) The going up and merging will eventually succeed in which case
6012	 *       we go backwards through the stack of recorded states merging
6013	 *       nodes as we go along until we reach the last recorded state at
6014	 *       which point we have reduced the problem to one of the above
6015	 *       points 1-4 thus we repeat them to finish.
6016	 *    b) We reach the index root which by definition cannot have any
6017	 *       siblings.  Also because we reached the index root it means
6018	 *       that the index root must be empty and the end entry is
6019	 *       pointing to the VCN sub-node we came from and because the
6020	 *       entry to be deleted is the last entry in its leaf-node and all
6021	 *       parent nodes including the index root are empty the entry to
6022	 *       be deleted is the very last entry of the directory thus it can
6023	 *       simply be removed by truncating the index allocation attribute
6024	 *       to zero size and if the index bitmap is non-resident
6025	 *       truncating it to zero size, too, and if the index bitmap is
6026	 *       resident zeroing its contents instead of truncating it and
6027	 *       finally the index root is switched to be a small index, which
6028	 *       is reflected in its index flags and in the removal of the VCN
6029	 *       sub-node pointer in the end entry of the index root.
6030	 *   NOTE: Instead of recursing, we do it in one go by simply freeing
6031	 *   all the index node blocks in the tree path until we find a
6032	 *   non-empty node, i.e. a node that can be merged, and we then
6033	 *   perform the merge by pulling down the appropriate index entry but
6034	 *   instead of pulling it down one level we pull it down to before its
6035	 *   successor (if merging to the right) or to after its predecessor (if
6036	 *   merging on the left) which by definition will be in a leaf node.
6037	 *   That gives the same results as the recursion but it is much less
6038	 *   work to do and it affects fewer index nodes.
6039	 *   NOTE2: After implementing NOTE above, we ditch the code handling
6040	 *   cases 1-4 above as they are just special cases of case 5 when
6041	 *   implemented like NOTE.
6042	 *
6043	 * That is quite enough description now, let the games begin.  First,
6044	 * investigate the parent index entry of the leaf entry to be deleted
6045	 * to see whether it is the end entry or not to determine which of the
6046	 * above categories the deletion falls into.
6047	 *
6048	 * We must be at the bottom of the current tree path or things will get
6049	 * very confused.
6050	 */
6051	if (!ictx->down->is_root)
6052		panic("%s(): !ictx->down->is_root\n", __FUNCTION__);
6053	/*
6054	 * We are going to disconnect @ictx thus the tree starting at the index
6055	 * root is no longer referenced by the caller so we need to free it
6056	 * later.
6057	 */
6058	put_ictx = ictx->down;
6059	/*
6060	 * Obtain the parent node, unlock the current node, disconnect it from
6061	 * the tree, and mark it for deallocation later.
6062	 */
6063	parent_ictx = ictx->up;
6064	ntfs_index_ctx_unlock(ictx);
6065	ntfs_index_ctx_disconnect_reinit(ictx);
6066	deallocate_ictx = ictx;
6067	/*
6068	 * Investigate the parent node.  If it is not empty we can immediately
6069	 * proceed with the merge to the right or left.  If it is empty then we
6070	 * need to ascend the tree until we find a non-empty node or until we
6071	 * reach an empty index root in which case the index is becoming empty
6072	 * so we delete its contents by resetting it to zero size.
6073	 */
6074	while (parent_ictx->nr_entries <= 1) {
6075		if (parent_ictx->nr_entries != 1)
6076			panic("%s(): parent_ictx->nr_entries != 1\n",
6077					__FUNCTION__);
6078		if (parent_ictx->is_root) {
6079			/*
6080			 * The index is becoming empty.  To simplify things
6081			 * merge the two sub-trees back together and then empty
6082			 * the index.
6083			 */
6084			deallocate_ictx->up->down = parent_ictx->down;
6085			parent_ictx->down->up = deallocate_ictx->up;
6086			deallocate_ictx->up = parent_ictx;
6087			parent_ictx->down = deallocate_ictx;
6088			return ntfs_index_make_empty(deallocate_ictx);
6089		}
6090		/*
6091		 * Empty index node.  Move it to the list of index nodes to be
6092		 * deallocated and proceed to its parent.
6093		 */
6094		ictx = parent_ictx;
6095		parent_ictx = parent_ictx->up;
6096		ntfs_index_ctx_move(ictx, deallocate_ictx);
6097	}
6098	/*
6099	 * We have a non-empty parent node which we need to lock as we need to
6100	 * access its contents.
6101	 */
6102	if (parent_ictx->is_locked)
6103		panic("%s(): parent_ictx->is_locked\n", __FUNCTION__);
6104	err = ntfs_index_ctx_relock(parent_ictx);
6105	if (err) {
6106		ntfs_error(idx_ni->vol->mp, "Failed to relock the parent "
6107				"index node (error %d).", err);
6108		goto put_err;
6109	}
6110	/* INDEX_NODE == LARGE_INDEX */
6111	if (!(parent_ictx->follow_entry->flags & INDEX_ENTRY_NODE) ||
6112			!(parent_ictx->index->flags & INDEX_NODE))
6113		panic("%s(): !(parent_ictx->follow_entry->flags & "
6114				"INDEX_ENTRY_NODE) || "
6115				"!(parent_ictx->index->flags & INDEX_NODE)\n",
6116				__FUNCTION__);
6117	/*
6118	 * If the parent entry is the end entry in its node we cannot merge to
6119	 * the right so merge to the left instead.
6120	 */
6121	if (parent_ictx->follow_entry->flags & INDEX_ENTRY_END)
6122		goto merge_left;
6123	/*
6124	 * The parent entry is not the end entry in its node so merge on the
6125	 * right.  To do this we need to find the successor (leaf) entry for
6126	 * which we need to repoint the path so we descend into the sub-node of
6127	 * the entry on the right-hand side of the parent entry.
6128	 *
6129	 * We also make a note of the parent entry as we are going to pull it
6130	 * down in front of its successor entry and we then have to delete it
6131	 * from this node later.
6132	 */
6133	pull_down_entry = parent_ictx->follow_entry;
6134	pull_down_entry_nr = parent_ictx->entry_nr;
6135	pull_down_ie_size = le16_to_cpu(pull_down_entry->length);
6136	parent_ictx->follow_entry = (INDEX_ENTRY*)((u8*)pull_down_entry +
6137			pull_down_ie_size);
6138	pull_down_ie_size -= sizeof(VCN);
6139	parent_ictx->entry_nr++;
6140	suc_ictx = parent_ictx;
6141	do {
6142		err = ntfs_index_descend_into_child_node(&suc_ictx);
6143		if (err) {
6144			ntfs_error(idx_ni->vol->mp, "Failed to descend to "
6145					"node containing successor entry "
6146					"(error %d).", err);
6147			goto put_err;
6148		}
6149		if (!suc_ictx->is_locked)
6150			panic("%s(): !suc_ictx->is_locked\n", __FUNCTION__);
6151		suc_ictx->follow_entry = suc_ictx->entries[0];
6152	} while (suc_ictx->index->flags & INDEX_NODE);
6153	if (suc_ictx->follow_entry->flags & INDEX_ENTRY_NODE)
6154		panic("%s(): suc_ictx->follow_entry->flags & "
6155				"INDEX_ENTRY_NODE\n", __FUNCTION__);
6156	/*
6157	 * Can the parent entry simply be inserted in front of its successor
6158	 * leaf entry without overflowing the node?  If not we need to split
6159	 * the node and then restart the delete.
6160	 */
6161	if (pull_down_ie_size > suc_ictx->bytes_free) {
6162split_and_restart_case_4:
6163		ntfs_debug("Splitting index node before add on delete (case "
6164				"4).");
6165		ie_size = pull_down_ie_size;
6166		goto split_and_restart;
6167	}
6168	/*
6169	 * The parent entry can simply be inserted in front of its successor
6170	 * leaf entry.
6171	 *
6172	 * We need to have locked both nodes to be able to do the insert thus
6173	 * we may need to unlock the locked successor node.
6174	 */
6175	if (parent_ictx->is_locked)
6176		panic("%s(): parent_ictx->is_locked\n", __FUNCTION__);
6177	err = ntfs_index_ctx_lock_two(suc_ictx, parent_ictx);
6178	if (err)
6179		goto put_err;
6180	/*
6181	 * We need to re-check the amount of free space as it can have changed
6182	 * whilst we dropped the lock on @suc_ictx if that was necessary.  When
6183	 * this happens simply drop the lock on @parent_ictx if we took it and
6184	 * go back to the previous code dealing with the not enough space case.
6185	 */
6186	if (pull_down_ie_size > suc_ictx->bytes_free) {
6187		if (parent_ictx->is_locked)
6188			ntfs_index_ctx_unlock(parent_ictx);
6189		goto split_and_restart_case_4;
6190	}
6191	/*
6192	 * Both nodes are locked and this is a simple insert, so go ahead and
6193	 * do it.  We have already checked that there is enough space so it
6194	 * cannot fail.
6195	 *
6196	 * Move all the entries out of the way to make space for the parent
6197	 * entry to be copied in.
6198	 */
6199	err = ntfs_index_node_make_space(suc_ictx, pull_down_ie_size);
6200	if (err)
6201		panic("%s(): err\n", __FUNCTION__);
6202	/*
6203	 * Copy the parent index entry into the created space in front of its
6204	 * successor and switch the inserted entry to being a leaf entry.
6205	 */
6206	ie = suc_ictx->follow_entry;
6207	pull_down_entry = parent_ictx->entries[pull_down_entry_nr];
6208	memcpy(ie, pull_down_entry, pull_down_ie_size);
6209	ie->length = cpu_to_le16(pull_down_ie_size);
6210	ie->flags &= ~INDEX_ENTRY_NODE;
6211	/* Ensure the updates are written to disk. */
6212	ntfs_index_entry_mark_dirty(suc_ictx);
6213	/*
6214	 * We are done with the successor node so release it or transfer it to
6215	 * the parent node if they share the same page (in which case
6216	 * @parent_ictx is not marked locked).
6217	 */
6218	if (!suc_ictx->is_locked)
6219		panic("%s(): !suc_ictx->is_locked\n", __FUNCTION__);
6220	if (parent_ictx->is_locked) {
6221		/*
6222		 * @parent_ictx is locked thus it cannot be sharing the lock
6223		 * with @suc_ictx.  Unlock @suc_ictx.
6224		 */
6225		ntfs_index_ctx_unlock(suc_ictx);
6226	} else {
6227		/*
6228		 * @parent_ictx is not locked thus it must be sharing the lock
6229		 * with @suc_ictx.  Transfer the lock across.
6230		 */
6231		if (parent_ictx->is_root)
6232			panic("%s(): parent_ictx->is_root\n", __FUNCTION__);
6233		if (parent_ictx->upl_ofs != suc_ictx->upl_ofs)
6234			panic("%s(): parent_ictx->upl_ofs != "
6235					"suc_ictx->upl_ofs\n", __FUNCTION__);
6236		parent_ictx->is_locked = 1;
6237		suc_ictx->is_locked = 0;
6238	}
6239	if (!parent_ictx->is_locked)
6240		panic("%s(): parent_ictx->is_locked\n", __FUNCTION__);
6241	/*
6242	 * We have now inserted the parent entry in front of its successor but
6243	 * at the moment we have two copies of the parent entry, so delete the
6244	 * original copy from the parent node.  We need to switch the parent
6245	 * index context to be the current context and to point to the parent
6246	 * entry we pulled down so we can just pretend it is the entry to be
6247	 * deleted.  Note this is a simple delete just like for a leaf entry as
6248	 * we have taken care of its child leaf node(s) already and will
6249	 * deallocate it(them) later.
6250	 */
6251	ictx = parent_ictx;
6252	parent_ictx->entry = pull_down_entry;
6253	parent_ictx->entry_nr = pull_down_entry_nr;
6254	goto prep_simple_delete;
6255merge_left:
6256	/*
6257	 * The parent entry is the end entry in its node so merge on the left.
6258	 * To do this we need to find the predecessor (leaf) entry for which we
6259	 * need to repoint the path so we descend into the sub-node of the
6260	 * entry on the left-hand side of the parent entry.  Once found we need
6261	 * to pull down the new parent entry, i.e. the entry on the left of the
6262	 * current parent entry, behind its predecessor entry.
6263	 *
6264	 * We also make a note of the original parent entry as we have to
6265	 * change its VCN sub-node pointer later.
6266	 */
6267	old_parent_entry_nr = parent_ictx->entry_nr;
6268	old_parent_ie_size = le16_to_cpu(parent_ictx->follow_entry->length) -
6269			sizeof(VCN);
6270	parent_ictx->entry_nr--;
6271	parent_ictx->follow_entry =
6272			parent_ictx->entries[parent_ictx->entry_nr];
6273	pull_down_ie_size = le16_to_cpu(parent_ictx->follow_entry->length) -
6274			sizeof(VCN);
6275	suc_ictx = parent_ictx;
6276	do {
6277		err = ntfs_index_descend_into_child_node(&suc_ictx);
6278		if (err) {
6279			ntfs_error(idx_ni->vol->mp, "Failed to descend to "
6280					"node containing predecessor entry of "
6281					"left-hand sibling entry (error %d).",
6282					err);
6283			goto put_err;
6284		}
6285		if (!suc_ictx->is_locked)
6286			panic("%s(): !suc_ictx->is_locked\n", __FUNCTION__);
6287		suc_ictx->entry_nr = suc_ictx->nr_entries - 1;
6288		suc_ictx->follow_entry = suc_ictx->entries[suc_ictx->entry_nr];
6289	} while (suc_ictx->index->flags & INDEX_NODE);
6290	if (suc_ictx->follow_entry->flags & INDEX_ENTRY_NODE)
6291		panic("%s(): suc_ictx->follow_entry->flags & "
6292				"INDEX_ENTRY_NODE\n", __FUNCTION__);
6293	/*
6294	 * Can the entry on the left-hand side of the parent entry simply be
6295	 * inserted in after its predecessor leaf entry, i.e. before the end
6296	 * entry in the leaf node containing the predecessor leaf entry,
6297	 * without overflowing the node?  If not we need to split the node and
6298	 * then restart the delete.
6299	 */
6300	if (pull_down_ie_size > suc_ictx->bytes_free) {
6301split_and_restart_case_5:
6302		ntfs_debug("Splitting index node before add on delete (case "
6303				"5).");
6304		ie_size = pull_down_ie_size;
6305		goto split_and_restart;
6306	}
6307	/*
6308	 * The entry on the left-hand side of the parent entry can simply be
6309	 * inserted in front of the end entry of the leaf node containing its
6310	 * predecessor.
6311	 *
6312	 * We need to have locked both nodes to be able to do the insert thus
6313	 * we may need to unlock the locked predecessor node to ensure that
6314	 * page locks are only taken in ascending page index order so we avoid
6315	 * deadlocks.
6316	 */
6317	if (parent_ictx->is_locked)
6318		panic("%s(): parent_ictx->is_locked\n", __FUNCTION__);
6319	err = ntfs_index_ctx_lock_two(suc_ictx, parent_ictx);
6320	if (err)
6321		goto put_err;
6322	/*
6323	 * We need to re-check the amount of free space as it can have changed
6324	 * whilst we dropped the lock on @suc_ictx if that was necessary.  When
6325	 * this happens simply drop the lock on @parent_ictx if we took it and
6326	 * go back to the previous code dealing with the not enough space case.
6327	 */
6328	if (pull_down_ie_size > suc_ictx->bytes_free) {
6329		if (parent_ictx->is_locked)
6330			ntfs_index_ctx_unlock(parent_ictx);
6331		goto split_and_restart_case_5;
6332	}
6333	/*
6334	 * Both nodes are locked and this is a simple insert, so go ahead and
6335	 * do it.  We have already checked that there is enough space so it
6336	 * cannot fail.
6337	 *
6338	 * Move the end entry out of the way to make space for the entry to be
6339	 * copied in.
6340	 */
6341	err = ntfs_index_node_make_space(suc_ictx, pull_down_ie_size);
6342	if (err)
6343		panic("%s(): err\n", __FUNCTION__);
6344	/*
6345	 * Copy the parent index entry into the created space in front of its
6346	 * predecessor and switch the inserted entry to being a leaf entry.
6347	 */
6348	ie = suc_ictx->follow_entry;
6349	memcpy(ie, parent_ictx->follow_entry, pull_down_ie_size);
6350	ie->length = cpu_to_le16(pull_down_ie_size);
6351	ie->flags &= ~INDEX_ENTRY_NODE;
6352	/* Ensure the updates are written to disk. */
6353	ntfs_index_entry_mark_dirty(suc_ictx);
6354	/*
6355	 * We are done with the predecessor node so release it or transfer it
6356	 * to the parent node if they share the same page (in which case
6357	 * @parent_ictx is not marked locked).
6358	 */
6359	if (!suc_ictx->is_locked)
6360		panic("%s(): !suc_ictx->is_locked\n", __FUNCTION__);
6361	if (parent_ictx->is_locked) {
6362		/*
6363		 * @parent_ictx is locked thus it cannot be sharing the lock
6364		 * with @suc_ictx.  Unlock @suc_ictx.
6365		 */
6366		ntfs_index_ctx_unlock(suc_ictx);
6367	} else {
6368		/*
6369		 * @parent_ictx is not locked thus it must be sharing the lock
6370		 * with @suc_ictx.  Transfer the lock across.
6371		 */
6372		if (parent_ictx->is_root)
6373			panic("%s(): parent_ictx->is_root\n", __FUNCTION__);
6374		if (parent_ictx->upl_ofs != suc_ictx->upl_ofs)
6375			panic("%s(): parent_ictx->upl_ofs != "
6376					"suc_ictx->upl_ofs\n", __FUNCTION__);
6377		parent_ictx->is_locked = 1;
6378		suc_ictx->is_locked = 0;
6379	}
6380	if (!parent_ictx->is_locked)
6381		panic("%s(): parent_ictx->is_locked\n", __FUNCTION__);
6382	/*
6383	 * We have now inserted the left-hand sibling entry of the parent entry
6384	 * in front of the end entry of the leaf node containing its
6385	 * predecessor entry but at the moment we have two copies of the entry,
6386	 * so delete the original copy from its index node.
6387	 *
6388	 * Before we do that, update the VCN sub-node pointer of the original
6389	 * parent entry to point to the index node that is about to become
6390	 * parentless.  Note we do not mark the index node dirty as that will
6391	 * happen when the original copy of the entry we pulled down is
6392	 * deleted.
6393	 */
6394	*(leVCN*)((u8*)parent_ictx->entries[old_parent_entry_nr] +
6395			old_parent_ie_size) = *(leVCN*)(
6396			(u8*)parent_ictx->follow_entry + pull_down_ie_size);
6397	/*
6398	 * All that is left now is to delete the original copy of the entry we
6399	 * pulled down.  We need to switch the parent index context to be the
6400	 * current context as it already points to the entry we pulled down
6401	 * thus we can just pretend it is the entry to be deleted.  Note this
6402	 * is a simple delete just like for a leaf entry as we have taken care
6403	 * of transfering its child leaf node(s) to the original parent entry
6404	 * already and will deallocate the origial parent entry's child leaf
6405	 * node(s) later.
6406	 */
6407	ictx = parent_ictx;
6408prep_simple_delete:
6409	ie = ictx->entry;
6410	is_root = ictx->is_root;
6411	ictx->is_match = 1;
6412	ih = ictx->index;
6413	ie_size = le16_to_cpu(ie->length);
6414	next_ie = (INDEX_ENTRY*)((u8*)ie + ie_size);
6415	/* goto simple_delete; */
6416simple_delete:
6417	new_ilen = le32_to_cpu(ih->index_length) - ie_size;
6418	if (is_root) {
6419		ATTR_RECORD *a;
6420		u32 muse;
6421
6422		m = ictx->actx->m;
6423		muse = le32_to_cpu(m->bytes_in_use);
6424		/*
6425		 * Move the index entries following @ie into @ie's position.
6426		 * As an optimization we combine the moving of the index
6427		 * entries and the resizing of the index root attribute into a
6428		 * single operation.
6429		 */
6430		memmove(ie, next_ie, muse - ((u8*)next_ie - (u8*)m));
6431		/* Adjust the mft record to reflect the change in used space. */
6432		m->bytes_in_use = cpu_to_le32(muse - ie_size);
6433		/*
6434		 * Adjust the attribute record to reflect the change in
6435		 * attribute value and attribute record size.
6436		 */
6437		a = ictx->actx->a;
6438		a->length = cpu_to_le32(le32_to_cpu(a->length) - ie_size);
6439		a->value_length = cpu_to_le32(le32_to_cpu(a->value_length) -
6440				ie_size);
6441		/* Adjust the index header to reflect the change in length. */
6442		ih->allocated_size = ih->index_length = cpu_to_le32(new_ilen);
6443	} else {
6444		/* Move index entries following @ie into @ie's position. */
6445		memmove(ie, next_ie, new_ilen - ((u8*)ie - (u8*)ih));
6446		/* Adjust the index header to reflect the change in length. */
6447		ih->index_length = cpu_to_le32(new_ilen);
6448	}
6449	/* Ensure the updates are written to disk. */
6450	ntfs_index_entry_mark_dirty(ictx);
6451	/*
6452	 * If we have scheduled any index block nodes to be deallocated do it
6453	 * now but first unlock the node we just deleted an entry from so we do
6454	 * not deadlock.
6455	 */
6456	if (deallocate_ictx) {
6457		ntfs_index_ctx_unlock(ictx);
6458		ictx = deallocate_ictx;
6459		do {
6460			if (ictx->is_root)
6461				panic("%s(): ictx->is_root\n", __FUNCTION__);
6462			err = ntfs_index_block_free(ictx);
6463			if (err) {
6464				ntfs_error(idx_ni->vol->mp, "Failed to "
6465						"deallocate no longer used "
6466						"index block VCN 0x%llx "
6467						"(error %d).  Run chkdsk to "
6468						"recover the lost index "
6469						"block.",
6470						(unsigned long long)ictx->vcn,
6471						err);
6472				NVolSetErrors(idx_ni->vol);
6473			}
6474		} while ((ictx = ictx->up) != deallocate_ictx);
6475	}
6476	/*
6477	 * Release any sub-tree paths that the caller no longer has a reference
6478	 * to.
6479	 */
6480	if (put_ictx)
6481		ntfs_index_ctx_put(put_ictx);
6482	ntfs_debug("Done.");
6483	return 0;
6484split_and_restart:
6485	/*
6486	 * Split the index node described by @suc_ictx taking into account the
6487	 * fact that it is very likely that an entry of @ie_size will be added
6488	 * to the index in front of the position pointed to by @suc_ictx.
6489	 */
6490	err = ntfs_index_node_split(suc_ictx, ie_size);
6491	if (!err) {
6492		/*
6493		 * Signal to the caller that they need to restart the delete
6494		 * from scratch.
6495		 */
6496		err = -EAGAIN;
6497	}
6498put_err:
6499	if (put_ictx)
6500		ntfs_index_ctx_put(put_ictx);
6501	return err;
6502}
6503