1/*
2 * ntfs_attr.c - NTFS kernel attribute operations.
3 *
4 * Copyright (c) 2006-2011 Anton Altaparmakov.  All Rights Reserved.
5 * Portions Copyright (c) 2006-2011 Apple Inc.  All Rights Reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright notice,
11 *    this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright notice,
13 *    this list of conditions and the following disclaimer in the documentation
14 *    and/or other materials provided with the distribution.
15 * 3. Neither the name of Apple Inc. ("Apple") nor the names of its
16 *    contributors may be used to endorse or promote products derived from this
17 *    software without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
20 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 * DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
23 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
24 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
26 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
28 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 *
30 * ALTERNATIVELY, provided that this notice and licensing terms are retained in
31 * full, this file may be redistributed and/or modified under the terms of the
32 * GNU General Public License (GPL) Version 2, in which case the provisions of
33 * that version of the GPL will apply to you instead of the license terms
34 * above.  You can obtain a copy of the GPL Version 2 at
35 * http://developer.apple.com/opensource/licenses/gpl-2.txt.
36 */
37
38#include <sys/errno.h>
39#include <sys/stat.h>
40#include <sys/ucred.h>
41#include <sys/ubc.h>
42
43#include <string.h>
44
45#include <libkern/libkern.h>
46#include <libkern/OSMalloc.h>
47
48#include <kern/debug.h>
49#include <kern/sched_prim.h>
50
51#include "ntfs.h"
52#include "ntfs_attr.h"
53#include "ntfs_attr_list.h"
54#include "ntfs_debug.h"
55#include "ntfs_dir.h"
56#include "ntfs_endian.h"
57#include "ntfs_index.h"
58#include "ntfs_inode.h"
59#include "ntfs_layout.h"
60#include "ntfs_lcnalloc.h"
61#include "ntfs_mft.h"
62#include "ntfs_page.h"
63#include "ntfs_runlist.h"
64#include "ntfs_time.h"
65#include "ntfs_types.h"
66#include "ntfs_unistr.h"
67
/*
 * The global empty/unnamed attribute name: a single NUL ntfschar.  Passed as
 * the @name argument of attribute lookups to request an unnamed attribute.
 */
ntfschar AT_UNNAMED[1] = { 0 };
69
70/**
71 * ntfs_attr_map_runlist - map the whole runlist of an ntfs inode
72 * @ni:		ntfs inode for which to map the whole runlist
73 *
74 * Map the whole runlist of the ntfs inode @ni.
75 *
76 * Return 0 on success and errno on error.
77 *
78 * Note this function requires the runlist not to be mapped yet at all.  This
79 * limitation is ok because we only use this function at mount time to map the
80 * runlist of some system files thus we are guaranteed that they will not have
81 * any runlist fragments mapped yet.
82 *
83 * Note the runlist can be NULL after this function returns if the attribute
84 * has zero allocated size, i.e. there simply is no runlist.
85 */
86errno_t ntfs_attr_map_runlist(ntfs_inode *ni)
87{
88	VCN vcn, end_vcn;
89	ntfs_inode *base_ni;
90	MFT_RECORD *m;
91	ntfs_attr_search_ctx *ctx;
92	ATTR_RECORD *a;
93	errno_t err = 0;
94
95	ntfs_debug("Entering for mft_no 0x%llx, type 0x%x.",
96			(unsigned long long)ni->mft_no,
97			(unsigned)le32_to_cpu(ni->type));
98	/* If the attribute is resident there is nothing to do. */
99	if (!NInoNonResident(ni)) {
100		ntfs_debug("Done (resident, nothing to do).");
101		return 0;
102	}
103	lck_rw_lock_exclusive(&ni->rl.lock);
104	/* Verify that the runlist is not mapped yet. */
105	if (ni->rl.alloc && ni->rl.elements)
106		panic("%s(): ni->rl.alloc && ni->rl.elements\n", __FUNCTION__);
107	base_ni = ni;
108	if (NInoAttr(ni))
109		base_ni = ni->base_ni;
110	err = ntfs_mft_record_map(base_ni, &m);
111	if (err)
112		goto err;
113	ctx = ntfs_attr_search_ctx_get(base_ni, m);
114	if (!ctx) {
115		err = ENOMEM;
116		goto unm_err;
117	}
118	vcn = 0;
119	end_vcn = ni->allocated_size >> ni->vol->cluster_size_shift;
120	do {
121		err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, vcn,
122				NULL, 0, ctx);
123		if (err) {
124			if (err == ENOENT)
125				err = EIO;
126			break;
127		}
128		a = ctx->a;
129		if (!a->non_resident) {
130corrupt_err:
131			ntfs_error(ni->vol->mp, "Inode 0x%llx contains corrupt "
132					"attribute extent, run chkdsk.",
133					(unsigned long long)base_ni->mft_no);
134			NVolSetErrors(ni->vol);
135			err = EIO;
136			break;
137		}
138		/*
139		 * If we are in the first attribute extent, verify the cached
140		 * allocated size is correct.
141		 */
142		if (!a->lowest_vcn)
143			if (sle64_to_cpu(a->allocated_size) !=
144					ni->allocated_size)
145				panic("%s(): sle64_to_cpu(a->allocated_size) "
146						"!= ni->allocated_size\n",
147						__FUNCTION__);
148		/*
149		 * Sanity check the lowest_vcn of the attribute is equal to the
150		 * vcn we looked up and that the highest_vcn of the attribute
151		 * is above the current vcn.
152		 */
153		if (sle64_to_cpu(a->lowest_vcn) != vcn || (vcn &&
154				sle64_to_cpu(a->highest_vcn) < vcn))
155			goto corrupt_err;
156		/* Determine the next vcn. */
157		vcn = sle64_to_cpu(a->highest_vcn) + 1;
158		/*
159		 * Finally, map the runlist fragment contained in this
160		 * attribute extent.
161		 */
162		err = ntfs_mapping_pairs_decompress(ni->vol, a, &ni->rl);
163	} while (!err && vcn < end_vcn);
164unm_err:
165	ntfs_attr_search_ctx_put(ctx);
166	ntfs_mft_record_unmap(base_ni);
167err:
168	lck_rw_unlock_exclusive(&ni->rl.lock);
169	if (!err)
170		ntfs_debug("Done.");
171	else
172		ntfs_error(ni->vol->mp, "Failed (error %d).", (int)err);
173	return err;
174}
175
176/**
177 * ntfs_map_runlist_nolock - map (a part of) a runlist of an ntfs inode
178 * @ni:		ntfs inode for which to map (part of) a runlist
179 * @vcn:	map runlist part containing this vcn
180 * @ctx:	active attribute search context if present or NULL if not
181 *
182 * Map the part of a runlist containing the @vcn of the ntfs inode @ni.
183 *
184 * If @ctx is specified, it is an active search context of @ni and its base mft
185 * record.  This is needed when ntfs_map_runlist_nolock() encounters unmapped
186 * runlist fragments and allows their mapping.  If you do not have the mft
187 * record mapped, you can specify @ctx as NULL and ntfs_map_runlist_nolock()
188 * will perform the necessary mapping and unmapping.
189 *
190 * Note, ntfs_map_runlist_nolock() saves the state of @ctx on entry and
191 * restores it before returning.  Thus, @ctx will be left pointing to the same
192 * attribute on return as on entry.  However, the actual pointers in @ctx may
193 * point to different memory locations on return, so you must remember to reset
194 * any cached pointers from the @ctx, i.e. after the call to
195 * ntfs_map_runlist_nolock(), you will probably want to do:
196 *	m = ctx->m;
197 *	a = ctx->a;
198 * Assuming you cache ctx->a in a variable @a of type ATTR_RECORD * and that
199 * you cache ctx->m in a variable @m of type MFT_RECORD *.
200 *
201 * Return 0 on success and errno on error.  There is one special error code
202 * which is not an error as such.  This is ENOENT.  It means that @vcn is out
203 * of bounds of the runlist.
204 *
205 * Note the runlist can be NULL after this function returns if @vcn is zero and
206 * the attribute has zero allocated size, i.e. there simply is no runlist.
207 *
208 * WARNING: If @ctx is supplied, regardless of whether success or failure is
209 *	    returned, you need to check @ctx->is_error and if 1 the @ctx is no
210 *	    longer valid, i.e. you need to either call
211 *	    ntfs_attr_search_ctx_reinit() or ntfs_attr_search_ctx_put() on it.
212 *	    In that case @ctx->error will give you the error code for why the
213 *	    mapping of the old inode failed.
214 *	    Also if @ctx is supplied and the current attribute (or the mft
215 *	    record it is in) has been modified then the caller must call
216 *	    NInoSetMrecNeedsDirtying(ctx->ni); before calling
217 *	    ntfs_map_runlist_nolock() or the changes may be lost.
218 *
219 * Locking: - The runlist described by @ni must be locked for writing on entry
220 *	      and is locked on return.  Note the runlist will be modified.
221 *	    - If @ctx is NULL, the base mft record of @ni must not be mapped on
222 *	      entry and it will be left unmapped on return.
223 *	    - If @ctx is not NULL, the base mft record must be mapped on entry
224 *	      and it will be left mapped on return.
225 */
226errno_t ntfs_map_runlist_nolock(ntfs_inode *ni, VCN vcn,
227		ntfs_attr_search_ctx *ctx)
228{
229	VCN end_vcn;
230	ntfs_inode *base_ni;
231	MFT_RECORD *m;
232	ATTR_RECORD *a;
233	errno_t err = 0;
234	BOOL ctx_is_temporary, ctx_needs_reset;
235	ntfs_attr_search_ctx old_ctx = { { NULL, }, };
236
237	ntfs_debug("Entering for mft_no 0x%llx, vcn 0x%llx.",
238			(unsigned long long)ni->mft_no,
239			(unsigned long long)vcn);
240	base_ni = ni;
241	if (NInoAttr(ni))
242		base_ni = ni->base_ni;
243	if (!ctx) {
244		ctx_is_temporary = ctx_needs_reset = TRUE;
245		err = ntfs_mft_record_map(base_ni, &m);
246		if (err)
247			goto done;
248		ctx = ntfs_attr_search_ctx_get(base_ni, m);
249		if (!ctx) {
250			err = ENOMEM;
251			goto err;
252		}
253	} else {
254		VCN allocated_size_vcn;
255
256		if (ctx->is_error)
257			panic("%s(): ctx->is_error\n", __FUNCTION__);
258		a = ctx->a;
259		if (!a->non_resident)
260			panic("%s(): !a->non_resident\n", __FUNCTION__);
261		ctx_is_temporary = FALSE;
262		end_vcn = sle64_to_cpu(a->highest_vcn);
263		lck_spin_lock(&ni->size_lock);
264		allocated_size_vcn = ni->allocated_size >>
265				ni->vol->cluster_size_shift;
266		lck_spin_unlock(&ni->size_lock);
267		/*
268		 * If we already have the attribute extent containing @vcn in
269		 * @ctx, no need to look it up again.  We slightly cheat in
270		 * that if vcn exceeds the allocated size, we will refuse to
271		 * map the runlist below, so there is definitely no need to get
272		 * the right attribute extent.
273		 */
274		if (vcn >= allocated_size_vcn || (a->type == ni->type &&
275				a->name_length == ni->name_len &&
276				!bcmp((u8*)a + le16_to_cpu(a->name_offset),
277				ni->name, ni->name_len) &&
278				sle64_to_cpu(a->lowest_vcn) <= vcn &&
279				end_vcn >= vcn))
280			ctx_needs_reset = FALSE;
281		else {
282			/* Save the old search context. */
283			old_ctx = *ctx;
284			/*
285			 * Reinitialize the search context so we can lookup the
286			 * needed attribute extent.
287			 */
288			ntfs_attr_search_ctx_reinit(ctx);
289			ctx_needs_reset = TRUE;
290		}
291	}
292	if (ctx_needs_reset) {
293		err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, vcn,
294				NULL, 0, ctx);
295		if (err) {
296			if (err == ENOENT)
297				err = EIO;
298			goto err;
299		}
300		if (!ctx->a->non_resident)
301			panic("%s(): !a->non_resident!\n", __FUNCTION__);
302	}
303	a = ctx->a;
304	/*
305	 * Only decompress the mapping pairs if @vcn is inside it.  Otherwise
306	 * we get into problems when we try to map an out of bounds vcn because
307	 * we then try to map the already mapped runlist fragment and
308	 * ntfs_mapping_pairs_decompress() fails.
309	 */
310	end_vcn = sle64_to_cpu(a->highest_vcn) + 1;
311	if (vcn && vcn >= end_vcn) {
312		err = ENOENT;
313		goto err;
314	}
315	err = ntfs_mapping_pairs_decompress(ni->vol, a, &ni->rl);
316err:
317	if (ctx_is_temporary) {
318		if (ctx)
319			ntfs_attr_search_ctx_put(ctx);
320		ntfs_mft_record_unmap(base_ni);
321	} else if (ctx_needs_reset) {
322		/*
323		 * If there is no attribute list, restoring the search context
324		 * is acomplished simply by copying the saved context back over
325		 * the caller supplied context.  If there is an attribute list,
326		 * things are more complicated as we need to deal with mapping
327		 * of mft records and resulting potential changes in pointers.
328		 */
329		if (NInoAttrList(base_ni)) {
330			/*
331			 * If the currently mapped (extent) inode is not the
332			 * one we had before, we need to unmap it and map the
333			 * old one.
334			 */
335			if (ctx->ni != old_ctx.ni) {
336				/*
337				 * If the currently mapped inode is not the
338				 * base inode, unmap it.
339				 */
340				if (ctx->base_ni && ctx->ni != ctx->base_ni) {
341					ntfs_extent_mft_record_unmap(ctx->ni);
342					ctx->m = ctx->base_m;
343					if (!ctx->m)
344						panic("%s(): !ctx->m\n",
345								__FUNCTION__);
346				}
347				/*
348				 * If the old mapped inode is not the base
349				 * inode, map it.
350				 */
351				if (old_ctx.base_ni && old_ctx.ni !=
352						old_ctx.base_ni) {
353					errno_t err2;
354retry_map:
355					err2 = ntfs_mft_record_map(old_ctx.ni,
356							&ctx->m);
357					/*
358					 * Something bad has happened.  If out
359					 * of memory retry till it succeeds.
360					 * Any other errors are fatal and we
361					 * return the error code in ctx->m.
362					 * Let the caller deal with it...  We
363					 * just need to fudge things so the
364					 * caller can reinit and/or put the
365					 * search context safely.
366					 */
367					if (err2) {
368						if (err2 == ENOMEM) {
369							(void)thread_block(
370							THREAD_CONTINUE_NULL);
371							goto retry_map;
372						}
373						ctx->is_error = 1;
374						ctx->error = err2;
375						old_ctx.ni = old_ctx.base_ni;
376					}
377				}
378			}
379			if (ctx->is_error) {
380				old_ctx.is_error = 1;
381				old_ctx.error = ctx->error;
382			} else if (ctx->m != old_ctx.m) {
383				/*
384				 * Update the changed pointers in the saved
385				 * context.
386				 */
387				old_ctx.a = (ATTR_RECORD*)((u8*)ctx->m +
388						((u8*)old_ctx.a -
389						(u8*)old_ctx.m));
390				old_ctx.m = ctx->m;
391			}
392		}
393		/* Restore the search context to the saved one. */
394		*ctx = old_ctx;
395	}
396done:
397	ntfs_debug("Done (error %d).", (int)err);
398	return err;
399}
400
401/**
402 * ntfs_attr_vcn_to_lcn_nolock - convert a vcn into a lcn given an ntfs inode
403 * @ni:			ntfs inode of the attribute whose runlist to search
404 * @vcn:		vcn to convert
405 * @write_locked:	true if the runlist is locked for writing
406 * @clusters:		optional destination for number of contiguous clusters
407 *
408 * Find the virtual cluster number @vcn in the runlist of the ntfs attribute
409 * described by the ntfs inode @ni and return the corresponding logical cluster
410 * number (lcn).
411 *
412 * If the @vcn is not mapped yet, the attempt is made to map the attribute
413 * extent containing the @vcn and the vcn to lcn conversion is retried.
414 *
415 * If @write_locked is true the caller has locked the runlist for writing and
416 * if false for reading.
417 *
418 * If @clusters is not NULL, on success (i.e. we return >= LCN_HOLE) we return
419 * the number of contiguous clusters after the returned lcn in *@clusters.
420 *
421 * Since lcns must be >= 0, we use negative return codes with special meaning:
422 *
423 * Return code	Meaning / Description
424 * ==========================================
425 *  LCN_HOLE	Hole / not allocated on disk.
426 *  LCN_ENOENT	There is no such vcn in the runlist, i.e. @vcn is out of bounds.
427 *  LCN_ENOMEM	Not enough memory to map runlist.
428 *  LCN_EIO	Critical error (runlist/file is corrupt, i/o error, etc).
429 *
430 * Locking: - The runlist must be locked on entry and is left locked on return.
431 *	    - If @write_locked is FALSE, i.e. the runlist is locked for reading,
432 *	      the lock may be dropped inside the function so you cannot rely on
433 *	      the runlist still being the same when this function returns.
434 */
435LCN ntfs_attr_vcn_to_lcn_nolock(ntfs_inode *ni, const VCN vcn,
436		const BOOL write_locked, s64 *clusters)
437{
438	LCN lcn;
439	BOOL need_lock_switch = FALSE;
440	BOOL is_retry = FALSE;
441
442	ntfs_debug("Entering for mft_no 0x%llx, vcn 0x%llx, %s_locked.",
443			(unsigned long long)ni->mft_no,
444			(unsigned long long)vcn,
445			write_locked ? "write" : "read");
446	if (!NInoNonResident(ni))
447		panic("%s(): !NInoNonResident(ni)\n", __FUNCTION__);
448	if (vcn < 0)
449		panic("%s(): vcn < 0\n", __FUNCTION__);
450retry_remap:
451	if (!ni->rl.elements) {
452		lck_spin_lock(&ni->size_lock);
453		if (!ni->allocated_size) {
454			lck_spin_unlock(&ni->size_lock);
455			lcn = LCN_ENOENT;
456			goto lcn_enoent;
457		}
458		lck_spin_unlock(&ni->size_lock);
459		if (!is_retry)
460			goto try_to_map;
461		lcn = LCN_EIO;
462		goto lcn_eio;
463	}
464	/* Convert vcn to lcn.  If that fails map the runlist and retry once. */
465	lcn = ntfs_rl_vcn_to_lcn(ni->rl.rl, vcn, clusters);
466	if (lcn >= LCN_HOLE) {
467		if (need_lock_switch)
468			lck_rw_lock_exclusive_to_shared(&ni->rl.lock);
469		ntfs_debug("Done (lcn 0x%llx, clusters 0x%llx).",
470				(unsigned long long)lcn,
471				clusters ? (unsigned long long)*clusters : 0);
472		return lcn;
473	}
474	if (lcn != LCN_RL_NOT_MAPPED) {
475		if (lcn != LCN_ENOENT)
476			lcn = LCN_EIO;
477	} else if (!is_retry) {
478		errno_t err;
479
480try_to_map:
481		if (!write_locked && !need_lock_switch) {
482			need_lock_switch = TRUE;
483			/*
484			 * If converting the lock from shared to exclusive
485			 * fails, need to take the lock for writing and retry
486			 * in case the racing process did the mapping for us.
487			 */
488			if (!lck_rw_lock_shared_to_exclusive(&ni->rl.lock)) {
489				lck_rw_lock_exclusive(&ni->rl.lock);
490				goto retry_remap;
491			}
492		}
493		err = ntfs_map_runlist_nolock(ni, vcn, NULL);
494		if (!err) {
495			is_retry = TRUE;
496			goto retry_remap;
497		}
498		switch (err) {
499		case ENOENT:
500			lcn = LCN_ENOENT;
501			break;
502		case ENOMEM:
503			lcn = LCN_ENOMEM;
504			break;
505		default:
506			lcn = LCN_EIO;
507		}
508	}
509lcn_eio:
510	if (need_lock_switch)
511		lck_rw_lock_exclusive_to_shared(&ni->rl.lock);
512	if (lcn == LCN_ENOENT) {
513lcn_enoent:
514		ntfs_debug("Done (LCN_ENOENT).");
515	} else
516		ntfs_error(ni->vol->mp, "Failed (error %lld).", (long long)lcn);
517	return lcn;
518}
519
520/**
521 * ntfs_attr_find_vcn_nolock - find a vcn in the runlist of an ntfs inode
522 * @ni:		ntfs inode of the attribute whose runlist to search
523 * @vcn:	vcn to find
524 * @run:	return pointer for the found runlist element
525 * @ctx:	active attribute search context if present or NULL if not
526 *
527 * Find the virtual cluster number @vcn in the runlist of the ntfs attribute
528 * described by the ntfs inode @ni and return the address of the runlist
529 * element containing the @vcn in *@run.
530 *
531 * If the @vcn is not mapped yet, the attempt is made to map the attribute
532 * extent containing the @vcn and the vcn to lcn conversion is retried.
533 *
534 * If @ctx is specified, it is an active search context of @ni and its base mft
535 * record.  This is needed when ntfs_attr_find_vcn_nolock() encounters unmapped
536 * runlist fragments and allows their mapping.  If you do not have the mft
537 * record mapped, you can specify @ctx as NULL and ntfs_attr_find_vcn_nolock()
538 * will perform the necessary mapping and unmapping.
539 *
540 * Note, ntfs_attr_find_vcn_nolock() saves the state of @ctx on entry and
541 * restores it before returning.  Thus, @ctx will be left pointing to the same
542 * attribute on return as on entry.  However, the actual pointers in @ctx may
543 * point to different memory locations on return, so you must remember to reset
544 * any cached pointers from the @ctx, i.e. after the call to
545 * ntfs_attr_find_vcn_nolock(), you will probably want to do:
546 *	m = ctx->m;
547 *	a = ctx->a;
548 * Assuming you cache ctx->a in a variable @a of type ATTR_RECORD * and that
549 * you cache ctx->m in a variable @m of type MFT_RECORD *.
550 * Note you need to distinguish between the lcn of the returned runlist element
551 * being >= 0 and LCN_HOLE.  In the later case you have to return zeroes on
552 * read and allocate clusters on write.
553 *
554 * Return 0 on success and errno on error.
555 *
556 * The possible error return codes are:
557 *	ENOENT	- No such vcn in the runlist, i.e. @vcn is out of bounds.
558 *	ENOMEM	- Not enough memory to map runlist.
559 *	EIO	- Critical error (runlist/file is corrupt, i/o error, etc).
560 *
561 * WARNING: If @ctx is supplied, regardless of whether success or failure is
562 *	    returned, you need to check @ctx->is_error and if 1 the @ctx is no
563 *	    longer valid, i.e. you need to either call
564 *	    ntfs_attr_search_ctx_reinit() or ntfs_attr_search_ctx_put() on it.
565 *	    In that case @ctx->error will give you the error code for why the
566 *	    mapping of the old inode failed.
567 *	    Also if @ctx is supplied and the current attribute (or the mft
568 *	    record it is in) has been modified then the caller must call
569 *	    NInoSetMrecNeedsDirtying(ctx->ni); before calling
570 *	    ntfs_map_runlist_nolock() or the changes may be lost.
571 *
572 * Locking: - The runlist described by @ni must be locked for writing on entry
573 *	      and is locked on return.  Note the runlist may be modified when
574 *	      needed runlist fragments need to be mapped.
575 *	    - If @ctx is NULL, the base mft record of @ni must not be mapped on
576 *	      entry and it will be left unmapped on return.
577 *	    - If @ctx is not NULL, the base mft record must be mapped on entry
578 *	      and it will be left mapped on return.
579 */
580errno_t ntfs_attr_find_vcn_nolock(ntfs_inode *ni, const VCN vcn,
581		ntfs_rl_element **run, ntfs_attr_search_ctx *ctx)
582{
583	ntfs_rl_element *rl;
584	errno_t err = 0;
585	BOOL is_retry = FALSE;
586
587	ntfs_debug("Entering for mft_no 0x%llx, vcn 0x%llx, with%s ctx.",
588			(unsigned long long)ni->mft_no,
589			(unsigned long long)vcn, ctx ? "" : "out");
590	if (!NInoNonResident(ni))
591		panic("%s(): !NInoNonResident(ni)\n", __FUNCTION__);
592	if (vcn < 0)
593		panic("%s(): vcn < 0\n", __FUNCTION__);
594retry_remap:
595	if (!ni->rl.elements) {
596		lck_spin_lock(&ni->size_lock);
597		if (!ni->allocated_size) {
598			lck_spin_unlock(&ni->size_lock);
599			return LCN_ENOENT;
600		}
601		lck_spin_unlock(&ni->size_lock);
602		if (!is_retry)
603			goto try_to_map;
604		err = EIO;
605		goto err;
606	}
607	rl = ni->rl.rl;
608	if (vcn >= rl[0].vcn) {
609		while (rl->length) {
610			if (vcn < rl[1].vcn) {
611				if (rl->lcn >= LCN_HOLE) {
612					ntfs_debug("Done.");
613					*run = rl;
614					return 0;
615				}
616				break;
617			}
618			rl++;
619		}
620		if (rl->lcn != LCN_RL_NOT_MAPPED) {
621			if (rl->lcn == LCN_ENOENT)
622				err = ENOENT;
623			else
624				err = EIO;
625		}
626	}
627	if (!err && !is_retry) {
628		/*
629		 * If the search context is invalid we cannot map the unmapped
630		 * region.
631		 */
632		if (ctx->is_error)
633			err = ctx->error;
634		else {
635try_to_map:
636			/*
637			 * The @vcn is in an unmapped region, map the runlist
638			 * and retry.
639			 */
640			err = ntfs_map_runlist_nolock(ni, vcn, ctx);
641			if (!err) {
642				is_retry = TRUE;
643				goto retry_remap;
644			}
645		}
646		if (err == EINVAL)
647			err = EIO;
648	} else if (!err)
649		err = EIO;
650err:
651	if (err != ENOENT)
652		ntfs_error(ni->vol->mp, "Failed (error %d).", err);
653	return err;
654}
655
656/**
657 * ntfs_attr_search_ctx_reinit - reinitialize an attribute search context
658 * @ctx:	attribute search context to reinitialize
659 *
660 * Reinitialize the attribute search context @ctx, unmapping an associated
661 * extent mft record if present, and initialize the search context again.
662 *
663 * This is used when a search for a new attribute is being started to reset
664 * the search context to the beginning.
665 *
666 * Note: We preserve the content of @ctx->is_mft_locked so that reinitializing
667 * a search context can also be done when dealing with the mft itself.
668 */
669void ntfs_attr_search_ctx_reinit(ntfs_attr_search_ctx *ctx)
670{
671	const BOOL mft_is_locked = ctx->is_mft_locked;
672
673	if (!ctx->base_ni) {
674		/* No attribute list. */
675		ctx->is_first = 1;
676		ctx->is_iteration = 0;
677		/* Sanity checks are performed elsewhere. */
678		ctx->a = (ATTR_RECORD*)((u8*)ctx->m +
679				le16_to_cpu(ctx->m->attrs_offset));
680		/*
681		 * This needs resetting due to
682		 * ntfs_attr_find_in_attribute_list() which can leave it set
683		 * despite having zeroed ctx->base_ni.
684		 */
685		ctx->al_entry = NULL;
686		return;
687	}
688	/* Attribute list. */
689	if (ctx->ni != ctx->base_ni)
690		ntfs_extent_mft_record_unmap(ctx->ni);
691	ntfs_attr_search_ctx_init(ctx, ctx->base_ni, ctx->base_m);
692	if (mft_is_locked)
693		ctx->is_mft_locked = 1;
694}
695
696/**
697 * ntfs_attr_search_ctx_get - allocate and init a new attribute search context
698 * @ni:		ntfs inode with which to initialize the search context
699 * @m:		mft record with which to initialize the search context
700 *
701 * Allocate a new attribute search context, initialize it with @ni and @m, and
702 * return it.  Return NULL if allocation failed.
703 */
704ntfs_attr_search_ctx *ntfs_attr_search_ctx_get(ntfs_inode *ni, MFT_RECORD *m)
705{
706	ntfs_attr_search_ctx *ctx;
707
708	ctx = OSMalloc(sizeof(ntfs_attr_search_ctx), ntfs_malloc_tag);
709	if (ctx)
710		ntfs_attr_search_ctx_init(ctx, ni, m);
711	return ctx;
712}
713
714/**
715 * ntfs_attr_search_ctx_put - release an attribute search context
716 * @ctx:	attribute search context to free
717 *
718 * Release the attribute search context @ctx, unmapping an associated extent
719 * mft record if present.
720 */
721void ntfs_attr_search_ctx_put(ntfs_attr_search_ctx *ctx)
722{
723	if (ctx->base_ni && ctx->ni != ctx->base_ni)
724		ntfs_extent_mft_record_unmap(ctx->ni);
725	OSFree(ctx, sizeof(ntfs_attr_search_ctx), ntfs_malloc_tag);
726}
727
728/**
729 * ntfs_attr_find_in_mft_record - find (next) attribute in mft record
730 * @type:	attribute type to find
731 * @name:	attribute name to find (optional, i.e. NULL means do not care)
732 * @name_len:	attribute name length (only needed if @name present)
733 * @val:	attribute value to find (optional, resident attributes only)
734 * @val_len:	attribute value length (only needed if @val present)
735 * @ctx:	search context with mft record and attribute to search from
736 *
737 * You should not need to call this function directly.  Use ntfs_attr_lookup()
738 * instead.
739 *
740 * ntfs_attr_find_in_mft_record() takes a search context @ctx as parameter and
741 * searches the mft record specified by @ctx->m, beginning at @ctx->a, for an
742 * attribute of @type, optionally @name and @val.
743 *
744 * If the attribute is found, ntfs_attr_find_in_mft_record() returns 0 and
745 * @ctx->a is set to point to the found attribute.
746 *
747 * If the attribute is not found, ENOENT is returned and @ctx->a is set to
748 * point to the attribute before which the attribute being searched for would
749 * need to be inserted if such an action were to be desired.
750 *
751 * On actual error, ntfs_attr_find_in_mft_record() returns EIO.  In this case
752 * @ctx->a is undefined and in particular do not rely on it not having changed.
753 *
754 * If @ctx->is_first is 1, the search begins with @ctx->a itself.  If it is 0,
755 * the search begins after @ctx->a.
756 *
757 * If @ctx->is_iteration is 1 and @type is AT_UNUSED this is not a search but
758 * an iteration in which case each attribute in the mft record is returned in
759 * turn with each call to ntfs_attr_find_in_mft_record().  Note all attributes
760 * are returned including the attribute list attribute, unlike when
761 * @ctx->is_iteration is 0 when it is not returned unless it is specifically
762 * looked for.
763 *
764 * Similarly to the above, when @ctx->is_iterations is 1 and @type is not
765 * AT_UNUSED all attributes of type @type are returned one after the other.
766 *
767 * If @name is AT_UNNAMED search for an unnamed attribute.  If @name is present
768 * but not AT_UNNAMED search for a named attribute matching @name.  Otherwise,
769 * match both named and unnamed attributes.
770 *
771 * Finally, the resident attribute value @val is looked for, if present.  If
772 * @val is not present (NULL), @val_len is ignored.
773 *
774 * ntfs_attr_find_in_mft_record() only searches the specified mft record and it
775 * ignores the presence of an attribute list attribute (unless it is the one
776 * being searched for, obviously).  If you need to take attribute lists into
777 * consideration, use ntfs_attr_lookup() instead (see below).  This also means
778 * that you cannot use ntfs_attr_find_in_mft_record() to search for extent
779 * records of non-resident attributes, as extents with lowest_vcn != 0 are
780 * usually described by the attribute list attribute only.  Note that it is
781 * possible that the first extent is only in the attribute list while the last
782 * extent is in the base mft record, so do not rely on being able to find the
783 * first extent in the base mft record.
784 *
785 * Warning: Never use @val when looking for attribute types which can be
786 *	    non-resident as this most likely will result in a crash!
787 *
788 * Note if the volume is mounted case sensitive we treat attribute names as
789 * being case sensitive and vice versa if the volume is not mounted case
790 * sensitive we treat attribute names as being case insensitive also.
791 */
792errno_t ntfs_attr_find_in_mft_record(const ATTR_TYPE type,
793		const ntfschar *name, const u32 name_len,
794		const void *val, const u32 val_len, ntfs_attr_search_ctx *ctx)
795{
796	ATTR_RECORD *a;
797	ntfs_volume *vol = ctx->ni->vol;
798	const ntfschar *upcase = vol->upcase;
799	const u32 upcase_len = vol->upcase_len;
800	const BOOL case_sensitive = NVolCaseSensitive(vol);
801	const BOOL is_iteration = ctx->is_iteration;
802
803	/*
804	 * Iterate over attributes in mft record starting at @ctx->a, or the
805	 * attribute following that, if @ctx->is_first is true.
806	 */
807	if (ctx->is_first) {
808		a = ctx->a;
809		ctx->is_first = 0;
810	} else
811		a = (ATTR_RECORD*)((u8*)ctx->a + le32_to_cpu(ctx->a->length));
812	for (;;	a = (ATTR_RECORD*)((u8*)a + le32_to_cpu(a->length))) {
813		if ((u8*)a < (u8*)ctx->m || (u8*)a > (u8*)ctx->m +
814				le32_to_cpu(ctx->m->bytes_allocated))
815			break;
816		ctx->a = a;
817		if (((!is_iteration || type != AT_UNUSED) &&
818				le32_to_cpu(a->type) > le32_to_cpu(type)) ||
819				a->type == AT_END)
820			return ENOENT;
821		if (!a->length)
822			break;
823		if (is_iteration) {
824			if (type == AT_UNUSED || type == a->type)
825				return 0;
826		}
827		if (a->type != type)
828			continue;
829		/*
830		 * If @name is AT_UNNAMED we want an unnamed attribute.
831		 * If @name is present, compare the two names.
832		 * Otherwise, match any attribute.
833		 */
834		if (name == AT_UNNAMED) {
835			/* The search failed if the found attribute is named. */
836			if (a->name_length)
837				return ENOENT;
838		} else if (name) {
839			unsigned len, ofs;
840
841			len = a->name_length;
842			ofs = le16_to_cpu(a->name_offset);
843			if (ofs + (len * sizeof(ntfschar)) >
844					le32_to_cpu(a->length))
845				break;
846			if (!ntfs_are_names_equal(name, name_len,
847					(ntfschar*)((u8*)a + ofs), len,
848					case_sensitive, upcase, upcase_len)) {
849				int rc;
850
851				rc = ntfs_collate_names(name, name_len,
852						(ntfschar*)((u8*)a + ofs), len,
853						1, FALSE, upcase, upcase_len);
854				/*
855				 * If @name collates before a->name, there is
856				 * no matching attribute.
857				 */
858				if (rc == -1)
859					return ENOENT;
860				/*
861				 * If the strings are not equal, continue
862				 * searching.
863				 */
864				if (rc)
865					continue;
866				rc = ntfs_collate_names(name, name_len,
867						(ntfschar*)((u8*)a + ofs), len,
868						1, TRUE, upcase, upcase_len);
869				if (rc == -1)
870					return ENOENT;
871				if (rc)
872					continue;
873			}
874		}
875		/*
876		 * The names match or @name not present and attribute is
877		 * unnamed.  If no @val specified, we have found the attribute
878		 * and are done.
879		 */
880		if (!val)
881			return 0;
882		/* @val is present; compare values. */
883		else {
884			unsigned len, ofs;
885			int rc;
886
887			len = le32_to_cpu(a->value_length);
888			ofs = le16_to_cpu(a->value_offset);
889			if (ofs + len > le32_to_cpu(a->length))
890				break;
891			rc = memcmp(val, (u8*)a + ofs,
892					len <= val_len ? len : val_len);
893			/*
894			 * If @val collates before the value of the current
895			 * attribute, there is no matching attribute.
896			 */
897			if (!rc) {
898				if (val_len == len)
899					return 0;
900				if (val_len < len)
901					return ENOENT;
902			} else if (rc < 0)
903				return ENOENT;
904		}
905	}
906	ntfs_error(vol->mp, "Inode is corrupt.  Run chkdsk.");
907	NVolSetErrors(vol);
908	return EIO;
909}
910
911/**
912 * ntfs_attr_find_in_attribute_list - find an attribute in the attribute list
913 * @type:	attribute type to find
914 * @name:	attribute name to find (optional, i.e. NULL means do not care)
915 * @name_len:	attribute name length (only needed if @name present)
916 * @lowest_vcn:	lowest vcn to find (optional, non-resident attributes only)
917 * @val:	attribute value to find (optional, resident attributes only)
918 * @val_len:	attribute value length (only needed if @val present)
919 * @ctx:	search context with mft record and attribute to search from
920 *
921 * You should not need to call this function directly.  Use ntfs_attr_lookup()
922 * instead.
923 *
924 * Find an attribute by searching the attribute list for the corresponding
925 * attribute list entry.  Having found the entry, map the mft record if the
926 * attribute is in a different mft record/inode, ntfs_attr_find_in_mft_record()
927 * the attribute in there and return it.
928 *
929 * On first search @ctx->ni must be the base mft record and @ctx must have been
930 * obtained from a call to ntfs_attr_search_ctx_get().  On subsequent calls
931 * @ctx->ni can be any extent inode, too (@ctx->base_ni is then the base
932 * inode).
933 *
934 * After finishing with the attribute/mft record you need to call
935 * ntfs_attr_search_ctx_put() to clean up the search context (unmapping any
936 * mapped mft records, etc).
937 *
938 * If the attribute is found, ntfs_attr_find_in_attribute_list() returns 0 and
939 * @ctx->a is set to point to the found attribute.  @ctx->m is set to point to
940 * the mft record in which @ctx->a is located and @ctx->al_entry is set to
941 * point to the attribute list entry for the attribute.
942 *
943 * If the attribute is not found, ENOENT is returned and @ctx->a is set to
944 * point to the attribute in the base mft record before which the attribute
945 * being searched for would need to be inserted if such an action were to be
946 * desired.  @ctx->m is set to point to the mft record in which @ctx->a is
947 * located, i.e. the base mft record, and @ctx->al_entry is set to point to the
948 * attribute list entry of the attribute before which the attribute being
949 * searched for would need to be inserted if such an action were to be desired.
950 *
951 * Thus to insert the not found attribute, one wants to add the attribute to
952 * @ctx->m (the base mft record) and if there is not enough space, the
953 * attribute should be placed in a newly allocated extent mft record.  The
954 * attribute list entry for the inserted attribute should be inserted in the
955 * attribute list attribute at @ctx->al_entry.
956 *
957 * On actual error, ntfs_attr_find_in_attribute_list() returns EIO.  In this
958 * case @ctx->a is undefined and in particular do not rely on it not having
959 * changed.
960 *
961 * If @ctx->is_first is 1, the search begins with @ctx->a itself.  If it is 0,
962 * the search begins after @ctx->a.
963 *
964 * If @name is AT_UNNAMED search for an unnamed attribute.  If @name is present
965 * but not AT_UNNAMED search for a named attribute matching @name.  Otherwise,
966 * match both named and unnamed attributes.
967 *
968 * Finally, the resident attribute value @val is looked for, if present.  If
969 * @val is not present (NULL), @val_len is ignored.
970 *
971 * Warning: Never use @val when looking for attribute types which can be
972 *	    non-resident as this most likely will result in a crash!
973 */
static errno_t ntfs_attr_find_in_attribute_list(const ATTR_TYPE type,
		const ntfschar *name, const u32 name_len, const VCN lowest_vcn,
		const void *val, const u32 val_len, ntfs_attr_search_ctx *ctx)
{
	ntfs_inode *base_ni, *ni = ctx->ni;
	ntfs_volume *vol = ni->vol;
	ATTR_LIST_ENTRY *al_entry, *next_al_entry;
	u8 *al_start, *al_end;
	ATTR_RECORD *a;
	ntfschar *al_name;
	const ntfschar *upcase = vol->upcase;
	const u32 upcase_len = vol->upcase_len;
	u32 al_name_len;
	errno_t err = 0;
	static const char es[] = " Unmount and run chkdsk.";
	const BOOL case_sensitive = NVolCaseSensitive(vol);

	/*
	 * Iteration mode is only supported by
	 * ntfs_attr_find_in_mft_record(), never via the attribute list.
	 */
	if (ctx->is_iteration)
		panic("%s(): ctx->is_iteration\n", __FUNCTION__);
	base_ni = ctx->base_ni;
	ntfs_debug("Entering for mft_no 0x%llx, type 0x%x.",
			(unsigned long long)ni->mft_no, le32_to_cpu(type));
	if (!base_ni) {
		/* First call happens with the base mft record. */
		base_ni = ctx->base_ni = ctx->ni;
		ctx->base_m = ctx->m;
	}
	/* While positioned in the base mft record, track @base_a, too. */
	if (ni == base_ni)
		ctx->base_a = ctx->a;
	/* AT_END has no attribute list entry; it is handled at not_found. */
	if (type == AT_END)
		goto not_found;
	al_start = base_ni->attr_list;
	al_end = al_start + base_ni->attr_list_size;
	if (!ctx->al_entry)
		ctx->al_entry = (ATTR_LIST_ENTRY*)al_start;
	/*
	 * Iterate over entries in attribute list starting at @ctx->al_entry,
	 * or the entry following that, depending on the value of
	 * @ctx->is_first.
	 */
	if (ctx->is_first) {
		al_entry = ctx->al_entry;
		ctx->is_first = 0;
	} else
		al_entry = (ATTR_LIST_ENTRY*)((u8*)ctx->al_entry +
				le16_to_cpu(ctx->al_entry->length));
	for (;; al_entry = next_al_entry) {
		/* Out of bounds check. */
		if ((u8*)al_entry < base_ni->attr_list ||
				(u8*)al_entry > al_end)
			break;	/* Inode is corrupt. */
		ctx->al_entry = al_entry;
		/* Catch the end of the attribute list. */
		if ((u8*)al_entry == al_end)
			goto not_found;
		if (!al_entry->length)
			break;
		/*
		 * Both the first 6 bytes of the entry (the type and length
		 * fields) and the whole entry as given by its length field
		 * must lie inside the attribute list or the list is corrupt.
		 */
		if ((u8*)al_entry + 6 > al_end || (u8*)al_entry +
				le16_to_cpu(al_entry->length) > al_end)
			break;
		next_al_entry = (ATTR_LIST_ENTRY*)((u8*)al_entry +
				le16_to_cpu(al_entry->length));
		if (al_entry->type != type) {
			/* Entries are sorted by type; skip smaller types. */
			if (le32_to_cpu(al_entry->type) < le32_to_cpu(type))
				continue;
			goto not_found;
		}
		/*
		 * If @name is AT_UNNAMED we want an unnamed attribute.
		 * If @name is present, compare the two names.
		 * Otherwise, match any attribute.
		 */
		al_name_len = al_entry->name_length;
		al_name = (ntfschar*)((u8*)al_entry + al_entry->name_offset);
		if (name == AT_UNNAMED) {
			if (al_name_len)
				goto not_found;
		} else if (name && !ntfs_are_names_equal(al_name, al_name_len,
				name, name_len, case_sensitive, upcase,
				upcase_len)) {
			int rc;

			rc = ntfs_collate_names(name, name_len, al_name,
					al_name_len, 1, FALSE,
					upcase, upcase_len);
			/*
			 * If @name collates before al_name, there is no
			 * matching attribute.
			 */
			if (rc == -1)
				goto not_found;
			/* If the strings are not equal, continue search. */
			if (rc)
				continue;
			/*
			 * FIXME: Reverse engineering showed 0, IGNORE_CASE but
			 * that would be inconsistent with
			 * ntfs_attr_find_in_mft_record().  The subsequent rc
			 * checks were also different.  Perhaps I made a
			 * mistake in one of the two.  Need to recheck which is
			 * correct or at least see what is going on...
			 */
			rc = ntfs_collate_names(name, name_len, al_name,
					al_name_len, 1, TRUE,
					vol->upcase, vol->upcase_len);
			if (rc == -1)
				goto not_found;
			if (rc)
				continue;
		}
		/*
		 * The names match or @name not present and attribute is
		 * unnamed.  Now check @lowest_vcn.  Continue search if the
		 * next attribute list entry still fits @lowest_vcn.  Otherwise
		 * we have reached the right one or the search has failed.
		 */
		if (lowest_vcn && (u8*)next_al_entry >= al_start &&
				(u8*)next_al_entry + 6 < al_end &&
				(u8*)next_al_entry + le16_to_cpu(
					next_al_entry->length) <= al_end &&
				sle64_to_cpu(next_al_entry->lowest_vcn) <=
					lowest_vcn &&
				next_al_entry->type == al_entry->type &&
				next_al_entry->name_length == al_name_len &&
				ntfs_are_names_equal((ntfschar*)((u8*)
					next_al_entry +
					next_al_entry->name_offset),
					next_al_entry->name_length,
					al_name, al_name_len, case_sensitive,
					vol->upcase, vol->upcase_len))
			continue;
		/*
		 * Found the right entry.  Now map the mft record it points
		 * to, unless it is the one already mapped in @ctx.
		 */
		if (MREF_LE(al_entry->mft_reference) == ni->mft_no) {
			if (MSEQNO_LE(al_entry->mft_reference) != ni->seq_no) {
				ntfs_error(vol->mp, "Found stale mft "
						"reference in attribute list "
						"of base inode 0x%llx.%s",
						(unsigned long long)
						base_ni->mft_no, es);
				err = EIO;
				break;
			}
		} else { /* Mft references do not match. */
			/* If there is a mapped record unmap it first. */
			if (ni != base_ni)
				ntfs_extent_mft_record_unmap(ni);
			/* Do we want the base record back? */
			if (MREF_LE(al_entry->mft_reference) ==
					base_ni->mft_no) {
				ni = ctx->ni = base_ni;
				ctx->m = ctx->base_m;
			} else {
				/* We want an extent record. */
				err = ntfs_extent_mft_record_map_ext(base_ni,
						le64_to_cpu(
						al_entry->mft_reference), &ni,
						&ctx->m, ctx->is_mft_locked);
				if (err) {
					ntfs_error(vol->mp, "Failed to map "
							"extent mft record "
							"0x%llx of base inode "
							"0x%llx.%s",
							(unsigned long long)
							MREF_LE(al_entry->
							mft_reference),
							(unsigned long long)
							base_ni->mft_no, es);
					if (err == ENOENT)
						err = EIO;
					/* Cause @ctx to be sanitized below. */
					ni = NULL;
					break;
				}
				ctx->ni = ni;
			}
		}
		a = ctx->a = (ATTR_RECORD*)((u8*)ctx->m +
				le16_to_cpu(ctx->m->attrs_offset));
		/*
		 * ctx->ni, ctx->m, and ctx->a now point to the mft record
		 * containing the attribute represented by the current
		 * al_entry.
		 *
		 * We could call into ntfs_attr_find_in_mft_record() to find
		 * the right attribute in this mft record but this would be
		 * less efficient and not quite accurate as it ignores the
		 * attribute instance numbers for example which become
		 * important when one plays with attribute lists.  Also,
		 * because a proper match has been found in the attribute list
		 * entry above, the comparison can now be optimized.  So it is
		 * worth re-implementing a simplified
		 * ntfs_attr_find_in_mft_record() here.
		 *
		 * Use a manual loop so we can still use break and continue
		 * with the same meanings as above.
		 */
do_next_attr_loop:
		if ((u8*)a < (u8*)ctx->m || (u8*)a > (u8*)ctx->m +
				le32_to_cpu(ctx->m->bytes_allocated))
			break;
		if (a->type == AT_END)
			continue;
		if (!a->length)
			break;
		/*
		 * The instance number uniquely identifies the attribute
		 * record within its mft record; it must match the one given
		 * in the attribute list entry.
		 */
		if (al_entry->instance != a->instance)
			goto do_next_attr;
		/*
		 * If the type and/or the name are mismatched between the
		 * attribute list entry and the attribute record, there is
		 * corruption so we break and return error EIO.
		 */
		if (al_entry->type != a->type)
			break;
		if (!ntfs_are_names_equal((ntfschar*)((u8*)a +
				le16_to_cpu(a->name_offset)), a->name_length,
				al_name, al_name_len, case_sensitive,
				vol->upcase, vol->upcase_len))
			break;
		ctx->a = a;
		/*
		 * If no @val specified or @val specified and it matches, we
		 * have found it!
		 */
		if (!val || (!a->non_resident &&
				le32_to_cpu(a->value_length) == val_len &&
				!bcmp((u8*)a + le16_to_cpu(a->value_offset),
				val, val_len))) {
			ntfs_debug("Done, found.");
			return 0;
		}
do_next_attr:
		/* Proceed to the next attribute in the current mft record. */
		a = (ATTR_RECORD*)((u8*)a + le32_to_cpu(a->length));
		goto do_next_attr_loop;
	}
	if (!err) {
		ntfs_error(vol->mp, "Base inode 0x%llx contains corrupt "
				"attribute list attribute.%s",
				(unsigned long long)base_ni->mft_no, es);
		err = EIO;
	}
	/* Sanitize @ctx so it points back at the base mft record. */
	if (ni != base_ni) {
		if (ni)
			ntfs_extent_mft_record_unmap(ni);
		ctx->ni = base_ni;
		ctx->m = ctx->base_m;
		ctx->a = ctx->base_a;
	}
	if (err != ENOMEM)
		NVolSetErrors(vol);
	return err;
not_found:
	/*
	 * If we were looking for AT_END, we reset the search context @ctx and
	 * use ntfs_attr_find_in_mft_record() to seek to the end of the base
	 * mft record.
	 */
	if (type == AT_END) {
		ntfs_attr_search_ctx_reinit(ctx);
		return ntfs_attr_find_in_mft_record(AT_END, NULL, 0, NULL, 0,
				ctx);
	}
	/*
	 * The attribute was not found.  Before we return, we want to ensure
	 * @ctx->m and @ctx->a indicate the position at which the attribute
	 * should be inserted in the base mft record.  Since we also want to
	 * preserve @ctx->al_entry we cannot reinitialize the search context
	 * using ntfs_attr_search_ctx_reinit() as this would set @ctx->al_entry
	 * to NULL.  Thus we do the necessary bits manually (see
	 * ntfs_attr_search_ctx_init() above).  Note, we postpone setting
	 * @base_a until after the call to ntfs_attr_find_in_mft_record() as we
	 * do not know the correct value yet.
	 */
	if (ni != base_ni)
		ntfs_extent_mft_record_unmap(ni);
	ctx->m = ctx->base_m;
	ctx->a = (ATTR_RECORD*)((u8*)ctx->m +
			le16_to_cpu(ctx->m->attrs_offset));
	ctx->is_first = 1;
	ctx->ni = base_ni;
	/*
	 * In case there are multiple matches in the base mft record, need to
	 * keep enumerating until we get an attribute not found response (or
	 * another error), otherwise we would keep returning the same attribute
	 * over and over again and all programs using us for enumeration would
	 * lock up in a tight loop.
	 */
	do {
		err = ntfs_attr_find_in_mft_record(type, name, name_len,
				val, val_len, ctx);
	} while (!err);
	ctx->base_a = ctx->a;
	ntfs_debug("Done, not found.");
	return err;
}
1268
1269/**
1270 * ntfs_attr_lookup - find an attribute in an ntfs inode
1271 * @type:	attribute type to find
1272 * @name:	attribute name to find (optional, i.e. NULL means do not care)
1273 * @name_len:	attribute name length (only needed if @name present)
1274 * @lowest_vcn:	lowest vcn to find (optional, non-resident attributes only)
1275 * @val:	attribute value to find (optional, resident attributes only)
1276 * @val_len:	attribute value length (only needed if @val present)
1277 * @ctx:	search context with mft record and attribute to search from
1278 *
1279 * Find an attribute in an ntfs inode.  On first search @ctx->ni must be the
1280 * base mft record and @ctx must have been obtained from a call to
1281 * ntfs_attr_search_ctx_get().
1282 *
1283 * This function transparently handles attribute lists and @ctx is used to
1284 * continue searches where they were left off at.
1285 *
1286 * After finishing with the attribute/mft record you need to call
1287 * ntfs_attr_search_ctx_put() to clean up the search context (unmapping any
1288 * mapped mft records, etc).
1289 *
1290 * Return 0 if the search was successful and errno if not.
1291 *
1292 * On success, @ctx->a is the found attribute and it is in mft record @ctx->m.
1293 * If an attribute list attribute is present, @ctx->al_entry is the attribute
1294 * list entry of the found attribute.
1295 *
1296 * On error ENOENT, @ctx->a is the attribute which collates just after the
1297 * attribute being searched for, i.e. if one wants to add the attribute to the
1298 * mft record this is the correct place to insert it into.  If an attribute
1299 * list attribute is present, @ctx->al_entry is the attribute list entry which
1300 * collates just after the attribute list entry of the attribute being searched
1301 * for, i.e. if one wants to add the attribute to the mft record this is the
1302 * correct place to insert its attribute list entry into.
1303 *
 * When errno != ENOENT, an error occurred during the lookup.  @ctx->a is then
 * undefined and in particular you should not rely on it not having changed.
1306 *
1307 * Warning: Never use @val when looking for attribute types which can be
1308 *	    non-resident as this most likely will result in a crash!
1309 */
1310errno_t ntfs_attr_lookup(const ATTR_TYPE type,
1311		const ntfschar *name, const u32 name_len, const VCN lowest_vcn,
1312		const void *val, const u32 val_len, ntfs_attr_search_ctx *ctx)
1313{
1314	ntfs_inode *base_ni;
1315
1316	ntfs_debug("Entering.");
1317	if (ctx->base_ni)
1318		base_ni = ctx->base_ni;
1319	else
1320		base_ni = ctx->ni;
1321	/* Sanity check, just for debugging really. */
1322	if (!base_ni)
1323		panic("%s(): !base_ni\n", __FUNCTION__);
1324	if (!NInoAttrList(base_ni) || type == AT_ATTRIBUTE_LIST)
1325		return ntfs_attr_find_in_mft_record(type, name, name_len,
1326				val, val_len, ctx);
1327	if (ctx->is_iteration)
1328		panic("%s(): ctx->is_iteration\n", __FUNCTION__);
1329	return ntfs_attr_find_in_attribute_list(type, name, name_len,
1330			lowest_vcn, val, val_len, ctx);
1331}
1332
1333/**
1334 * ntfs_attr_find_in_attrdef - find an attribute in the $AttrDef system file
1335 * @vol:	ntfs volume to which the attribute belongs
1336 * @type:	attribute type which to find
1337 *
1338 * Search for the attribute definition record corresponding to the attribute
1339 * @type in the $AttrDef system file.
1340 *
1341 * Return the attribute type definition record if found and NULL if not found.
1342 */
1343static ATTR_DEF *ntfs_attr_find_in_attrdef(const ntfs_volume *vol,
1344		const ATTR_TYPE type)
1345{
1346	ATTR_DEF *ad;
1347
1348	if (!vol->attrdef)
1349		panic("%s(): !vol->attrdef\n", __FUNCTION__);
1350	if (!type)
1351		panic("%s(): !type\n", __FUNCTION__);
1352	for (ad = vol->attrdef; (u8*)ad - (u8*)vol->attrdef <
1353			vol->attrdef_size && ad->type; ++ad) {
1354		/* If we have not found it yet, carry on searching. */
1355		if (le32_to_cpu(type) > le32_to_cpu(ad->type))
1356			continue;
1357		/* If we have found the attribute, return it. */
1358		if (type == ad->type)
1359			return ad;
1360		/* We have gone too far already.  No point in continuing. */
1361		break;
1362	}
1363	/* Attribute not found. */
1364	ntfs_debug("Attribute type 0x%x not found in $AttrDef.",
1365			le32_to_cpu(type));
1366	return NULL;
1367}
1368
1369/**
1370 * ntfs_attr_size_bounds_check - check a size of an attribute type for validity
1371 * @vol:	ntfs volume to which the attribute belongs
1372 * @type:	attribute type which to check
1373 * @size:	size which to check
1374 *
1375 * Check whether the @size in bytes is valid for an attribute of @type on the
1376 * ntfs volume @vol.  This information is obtained from $AttrDef system file.
1377 *
1378 * Return 0 if valid, ERANGE if not valid, and ENOENT if the attribute is not
1379 * listed in $AttrDef.
1380 */
1381errno_t ntfs_attr_size_bounds_check(const ntfs_volume *vol,
1382		const ATTR_TYPE type, const s64 size)
1383{
1384	ATTR_DEF *ad;
1385
1386	if (size < 0)
1387		panic("%s(): size < 0\n", __FUNCTION__);
1388	/*
1389	 * $ATTRIBUTE_LIST has a maximum size of 256kiB, but this is not
1390	 * listed in $AttrDef.
1391	 */
1392	if (type == AT_ATTRIBUTE_LIST && size > NTFS_MAX_ATTR_LIST_SIZE)
1393		return ERANGE;
1394	/* Get the $AttrDef entry for the attribute @type. */
1395	ad = ntfs_attr_find_in_attrdef(vol, type);
1396	if (!ad)
1397		return ENOENT;
1398	/* Do the bounds check. */
1399	if ((sle64_to_cpu(ad->min_size) > 0 &&
1400			size < sle64_to_cpu(ad->min_size)) ||
1401			(sle64_to_cpu(ad->max_size) > 0 &&
1402			size > sle64_to_cpu(ad->max_size)) ||
1403			(u64)size > NTFS_MAX_ATTRIBUTE_SIZE)
1404		return ERANGE;
1405	return 0;
1406}
1407
1408/**
1409 * ntfs_attr_can_be_non_resident - check if an attribute can be non-resident
1410 * @vol:	ntfs volume to which the attribute belongs
1411 * @type:	attribute type which to check
1412 *
1413 * Check whether the attribute of @type on the ntfs volume @vol is allowed to
1414 * be non-resident.  This information is obtained from $AttrDef system file.
1415 *
1416 * Return 0 if the attribute is allowed to be non-resident, EPERM if not, and
1417 * ENOENT if the attribute is not listed in $AttrDef.
1418 */
1419static errno_t ntfs_attr_can_be_non_resident(const ntfs_volume *vol,
1420		const ATTR_TYPE type)
1421{
1422	ATTR_DEF *ad;
1423
1424	/* Find the attribute definition record in $AttrDef. */
1425	ad = ntfs_attr_find_in_attrdef(vol, type);
1426	if (!ad)
1427		return ENOENT;
1428	/* Check the flags and return the result. */
1429	if (ad->flags & ATTR_DEF_RESIDENT)
1430		return EPERM;
1431	return 0;
1432}
1433
1434/**
1435 * ntfs_attr_can_be_resident - check if an attribute can be resident
1436 * @vol:	ntfs volume to which the attribute belongs
1437 * @type:	attribute type which to check
1438 *
1439 * Check whether the attribute of @type on the ntfs volume @vol is allowed to
1440 * be resident.  This information is derived from our ntfs knowledge and may
1441 * not be completely accurate, especially when user defined attributes are
1442 * present.  Basically we allow everything to be resident except for index
1443 * allocation attributes.
1444 *
1445 * Return 0 if the attribute is allowed to be resident and EPERM if not.
1446 *
1447 * Warning: In the system file $MFT the attribute $Bitmap must be non-resident
1448 *	    otherwise windows will not boot (blue screen of death)!  We cannot
1449 *	    check for this here as we do not know which inode's $Bitmap is
1450 *	    being asked about so the caller needs to special case this.
1451 */
1452errno_t ntfs_attr_can_be_resident(const ntfs_volume *vol, const ATTR_TYPE type)
1453{
1454	if (type == AT_INDEX_ALLOCATION)
1455		return EPERM;
1456	return 0;
1457}
1458
1459/**
1460 * ntfs_attr_record_is_only_one - check if an attribute is the only one
1461 * @m:		the mft record in which the attribute to check resides
1462 * @a:		the attribute to check
1463 *
1464 * Check if the attribute @a is the only attribute record in its mft record @m.
1465 *
1466 * Return true if @a is the only attribute record in its mft record @m and
1467 * false if @a is not the only attribute record in its mft record @m.
1468 */
1469BOOL ntfs_attr_record_is_only_one(MFT_RECORD *m, ATTR_RECORD *a)
1470{
1471	ATTR_RECORD *first_a, *next_a;
1472
1473	first_a = (ATTR_RECORD*)((u8*)m + le16_to_cpu(m->attrs_offset));
1474	next_a = (ATTR_RECORD*)((u8*)a + le32_to_cpu(a->length));
1475	return (first_a == a && next_a->type == AT_END);
1476}
1477
1478/**
1479 * ntfs_attr_record_delete_internal - delete attribute record from mft record
1480 * @m:		mft record containing attribute record to delete
1481 * @a:		attribute record to delete
1482 *
1483 * Delete the attribute record @a, i.e. the resident part of the attribute,
1484 * from the mft record @m.
1485 *
1486 * This function cannot fail.
1487 *
1488 * Note the caller is responsible for marking the mft record dirty after
1489 * calling this function.
1490 */
1491void ntfs_attr_record_delete_internal(MFT_RECORD *m, ATTR_RECORD *a)
1492{
1493	const u32 new_muse = le32_to_cpu(m->bytes_in_use) -
1494			le32_to_cpu(a->length);
1495	/* Move attributes following @a into the position of @a. */
1496	memmove(a, (u8*)a + le32_to_cpu(a->length),
1497			new_muse - ((u8*)a - (u8*)m));
1498	/* Adjust @m to reflect the change in used space. */
1499	m->bytes_in_use = cpu_to_le32(new_muse);
1500}
1501
1502/**
1503 * ntfs_attr_record_delete - delete an attribute record from its mft record
1504 * @base_ni:	base ntfs inode from which to delete the attribute
1505 * @ctx:	attribute search context describing attribute record to delete
1506 *
1507 * Delete the attribute record, i.e. the resident part of the attribute,
1508 * described by @ctx->a from its mft record @ctx->m and mark the mft record
1509 * dirty so it gets written out later.
1510 *
1511 * In an attribute list attribute is present also remove the attribute list
1512 * attribute entry corresponding to the attribute being deleted and update
1513 * the attribute list attribute record accordingly.
1514 *
1515 * If the only attribute in the mft record is the attribute being deleted then
1516 * instead of deleting the attribute we free the extent mft record altogether
1517 * taking care to disconnect it from the base ntfs inode in the process.  As
1518 * above we update the attribute list attribute accordingly.
1519 *
1520 * If we end up freeing the extent mft record we go on to check the attribute
1521 * list attribute and if it no longer references any extent mft records we
1522 * remove the attribute list attribute altogether and update the base ntfs
1523 * inode to reflect the changed inode state.
1524 *
1525 * Return 0 on success and the error code on error.
1526 *
1527 * Note that on success the attribute search record is no longer valid and the
1528 * caller must either release it by calling ntfs_attr_search_ctx_put() or
1529 * reinitialize it by calling ntfs_attr_search_ctx_reinit().  Looking at the
1530 * search context or using it to call other functions would have unpredictable
1531 * results and could lead to crashes and file system corruption.
1532 */
1533errno_t ntfs_attr_record_delete(ntfs_inode *base_ni, ntfs_attr_search_ctx *ctx)
1534{
1535	ntfs_inode *ni;
1536	MFT_RECORD *m;
1537	ATTR_RECORD *a;
1538	ATTR_LIST_ENTRY *al_entry;
1539	errno_t err;
1540	unsigned al_ofs;
1541	BOOL al_needed;
1542
1543	ni = ctx->ni;
1544	m = ctx->m;
1545	a = ctx->a;
1546	ntfs_debug("Entering for attribute type 0x%x located in %s mft "
1547			"record 0x%llx.  Attribute list attribute is "
1548			"%spresent.", (unsigned)le32_to_cpu(a->type),
1549			(base_ni == ni) ? "base" : "extent",
1550			(unsigned long long)ni->mft_no,
1551			NInoAttrList(base_ni) ? "" : "not ");
1552	/*
1553	 * If there is no attribute list attribute, the mft record must be a
1554	 * base mft record and thus it cannot be becoming empty as a
1555	 * consequence of deleting the attribute record.  Thus for inodes
1556	 * without an attribute list attribute we have a fast path of simply
1557	 * going ahead and deleting the attribute record and returning.
1558	 */
1559	if (!NInoAttrList(base_ni)) {
1560		ntfs_attr_record_delete_internal(m, a);
1561		NInoSetMrecNeedsDirtying(base_ni);
1562		ntfs_debug("Done (no attribute list attribute).");
1563		return 0;
1564	}
1565	if (a->type == AT_ATTRIBUTE_LIST)
1566		panic("%s(): a->type == AT_ATTRIBUTE_LIST\n", __FUNCTION__);
1567	al_entry = ctx->al_entry;
1568	if (!al_entry)
1569		panic("%s(): !al_entry\n", __FUNCTION__);
1570	/*
1571	 * We have an attribute list attribute.  To begin with check if the
1572	 * attribute to be deleted is in the base mft record or if it is not
1573	 * the only attribute in the extent mft record.  In both of these cases
1574	 * we need to delete the attribute record from its mft record.
1575	 *
1576	 * Otherwise the attribute to be deleted is in an extent mft record and
1577	 * it is the only attribute in the extent mft record thus we need to
1578	 * free the extent mft record instead of deleting the attribute record.
1579	 */
1580	if (base_ni == ni || (u8*)m + le16_to_cpu(m->attrs_offset) != (u8*)a ||
1581			((ATTR_RECORD*)((u8*)a +
1582			le32_to_cpu(a->length)))->type != AT_END) {
1583		ntfs_attr_record_delete_internal(m, a);
1584		/*
1585		 * If the attribute was not in the base mft record mark the
1586		 * extent mft record dirty so it gets written out later.  If
1587		 * the attribute was in the base mft record it will be marked
1588		 * dirty later when the attribute list attribute record is
1589		 * updated which is in the base mft record by definition.
1590		 *
1591		 * We also unmap the extent mft record so we get to the same
1592		 * state as in the above case where we freed the extent mft
1593		 * record and we set @ctx->ni to equal the base inode @base_ni
1594		 * so that the search context is initialized from scratch or
1595		 * simply freed if the caller reinitializes or releases the
1596		 * search context respectively.
1597		 */
1598		if (base_ni != ni) {
1599			NInoSetMrecNeedsDirtying(ni);
1600			ntfs_extent_mft_record_unmap(ni);
1601			ctx->ni = base_ni;
1602		}
1603	} else {
1604		err = ntfs_extent_mft_record_free(base_ni, ni, m);
1605		if (err) {
1606			/*
1607			 * Ignore the error as we just end up with an unused
1608			 * mft record that is marked in use.
1609			 */
1610			ntfs_error(ni->vol->mp, "Failed to free extent mft_no "
1611					"0x%llx (error %d).  Unmount and run "
1612					"chkdsk to recover the lost inode.",
1613					(unsigned long long)ni->mft_no, err);
1614			NVolSetErrors(ni->vol);
1615			/*
1616			 * Relese the extent mft record after dirtying it thus
1617			 * simulating the effect of freeing it.
1618			 */
1619			NInoSetMrecNeedsDirtying(ni);
1620			ntfs_extent_mft_record_unmap(ni);
1621		}
1622		/*
1623		 * The attribute search context still points to the no longer
1624		 * mapped extent inode thus we need to change it to point to
1625		 * the base inode instead so the context can be reinitialized
1626		 * or released safely.
1627		 */
1628		ctx->ni = base_ni;
1629		/*
1630		 * Check the attribute list attribute.  If there are no other
1631		 * attribute list attribute entries referencing extent mft
1632		 * records delete the attribute list attribute altogether.
1633		 *
1634		 * If this fails it does not matter as we simply retain the
1635		 * attribute list attribute so we ignore the error and go on to
1636		 * delete the attribute list attribute entry instead.
1637		 *
1638		 * If there are other attribute list attribute entries
1639		 * referencing extent mft records we still need the attribute
1640		 * list attribute thus we go on to delete the attribute list
1641		 * entry corresponding to the attribute record we just deleted
1642		 * by freeing its extent mft record.
1643		 */
1644		err = ntfs_attr_list_is_needed(base_ni, al_entry, &al_needed);
1645		if (err)
1646			ntfs_warning(ni->vol->mp, "Failed to determine if "
1647					"attribute list attribute of mft_no "
1648					"0x%llx if still needed (error %d).  "
1649					"Assuming it is still needed and "
1650					"continuing.",
1651					(unsigned long long)base_ni->mft_no,
1652					err);
1653		else if (!al_needed) {
1654			/*
1655			 * No more extent mft records are in use.  Delete the
1656			 * attribute list attribute.
1657			 */
1658			ntfs_attr_search_ctx_reinit(ctx);
1659			err = ntfs_attr_list_delete(base_ni, ctx);
1660			if (!err) {
1661				/*
1662				 * We deleted the attribute list attribute and
1663				 * this will have updated the base inode
1664				 * appropriately thus we are done.
1665				 */
1666				ntfs_debug("Done (deleted attribute list "
1667						"attribute).");
1668				return 0;
1669			}
1670			ntfs_warning(ni->vol->mp, "Failed to delete attribute "
1671					"list attribute of mft_no 0x%llx "
1672					"(error %d).  Continuing by trying to "
1673					"delete the attribute list entry of "
1674					"the deleted attribute instead.",
1675					(unsigned long long)base_ni->mft_no,
1676					err);
1677		}
1678	}
1679	/*
1680	 * Both @ctx and @ni are now invalid and cannot be used any more which
1681	 * is fine as we have finished dealing with the attribute record.
1682	 *
1683	 * We now need to delete the corresponding attribute list attribute
1684	 * entry.
1685	 */
1686	al_ofs = (u8*)al_entry - base_ni->attr_list;
1687	ntfs_attr_list_entry_delete(base_ni, al_entry);
1688	ntfs_attr_search_ctx_reinit(ctx);
1689	err = ntfs_attr_list_sync_shrink(base_ni, al_ofs, ctx);
1690	if (!err) {
1691		ntfs_debug("Done (deleted attribute list attribute entry).");
1692		return 0;
1693	}
1694	NInoSetMrecNeedsDirtying(base_ni);
1695	ntfs_error(ni->vol->mp, "Failed to delete attribute list attribute "
1696			"entry in base mft_no 0x%llx (error %d).  Leaving "
1697			"inconsistent metadata.  Unmount and run chkdsk.",
1698			(unsigned long long)base_ni->mft_no, err);
1699	NVolSetErrors(ni->vol);
1700	return err;
1701}
1702
1703/**
1704 * ntfs_attr_record_make_space - make space for a new attribute record
1705 * @m:		mft record in which to make space for the new attribute record
1706 * @a:		attribute record in front of which to make space
1707 * @size:	byte size of the new attribute record for which to make space
1708 *
1709 * Make space for a new attribute record of size @size in the mft record @m, in
1710 * front of the existing attribute record @a.
1711 *
1712 * Return 0 on success and errno on error.  The following error codes are
1713 * defined:
1714 *	ENOSPC - Not enough space in the mft record @m.
1715 *
1716 * Note: On error, no modifications have been performed whatsoever.
1717 */
1718errno_t ntfs_attr_record_make_space(MFT_RECORD *m, ATTR_RECORD *a, u32 size)
1719{
1720	u32 new_muse;
1721	const u32 muse = le32_to_cpu(m->bytes_in_use);
1722	/* Align to 8 bytes if it is not already done. */
1723	if (size & 7)
1724		size = (size + 7) & ~7;
1725	new_muse = muse + size;
1726	/* Not enough space in this mft record. */
1727	if (new_muse > le32_to_cpu(m->bytes_allocated))
1728		return ENOSPC;
1729	/* Move attributes starting with @a to make space of @size bytes. */
1730	memmove((u8*)a + size, a, muse - ((u8*)a - (u8*)m));
1731	/* Adjust @m to reflect the change in used space. */
1732	m->bytes_in_use = cpu_to_le32(new_muse);
1733	/* Clear the created space so we start with a clean slate. */
1734	bzero(a, size);
1735	/*
1736	 * Set the attribute size in the newly created attribute, now at @a.
1737	 * We do this here so that the caller does not need to worry about
1738	 * rounding up the size to set the attribute length.
1739	 */
1740	a->length = cpu_to_le32(size);
1741	return 0;
1742}
1743
1744/**
1745 * ntfs_attr_record_resize - resize an attribute record
1746 * @m:		mft record containing attribute record
1747 * @a:		attribute record to resize
1748 * @new_size:	new size in bytes to which to resize the attribute record @a
1749 *
1750 * Resize the attribute record @a, i.e. the resident part of the attribute, in
1751 * the mft record @m to @new_size bytes.
1752 *
1753 * Return 0 on success and errno on error.  The following error codes are
1754 * defined:
1755 *	ENOSPC	- Not enough space in the mft record @m to perform the resize.
1756 *
1757 * Note: On error, no modifications have been performed whatsoever.
1758 *
1759 * Warning: If you make a record smaller without having copied all the data you
1760 *	    are interested in the data may be overwritten.
1761 */
1762errno_t ntfs_attr_record_resize(MFT_RECORD *m, ATTR_RECORD *a, u32 new_size)
1763{
1764	const u32 old_size = le32_to_cpu(a->length);
1765
1766	ntfs_debug("Entering for new_size %u.", new_size);
1767	/* Align to 8 bytes if it is not already done. */
1768	if (new_size & 7)
1769		new_size = (new_size + 7) & ~7;
1770	/* If the actual attribute length has changed, move things around. */
1771	if (new_size != old_size) {
1772		const u32 muse = le32_to_cpu(m->bytes_in_use);
1773		const u32 new_muse = muse - old_size + new_size;
1774		/* Not enough space in this mft record. */
1775		if (new_muse > le32_to_cpu(m->bytes_allocated))
1776			return ENOSPC;
1777		/* Move attributes following @a to their new location. */
1778		memmove((u8*)a + new_size, (u8*)a + old_size,
1779				muse - ((u8*)a - (u8*)m) - old_size);
1780		/* Adjust @m to reflect the change in used space. */
1781		m->bytes_in_use = cpu_to_le32(new_muse);
1782		/* Adjust @a to reflect the new size. */
1783		if (new_size >= offsetof(ATTR_REC, length) + sizeof(a->length))
1784			a->length = cpu_to_le32(new_size);
1785	}
1786	return 0;
1787}
1788
1789/**
1790 * ntfs_attr_mapping_pairs_update - update an attribute's mapping pairs array
1791 * @base_ni:	base ntfs inode to which the attribute belongs
1792 * @ni:		ntfs inode of attribute whose mapping pairs array to update
1793 * @first_vcn:	first vcn which to update in the mapping pairs array
1794 * @last_vcn:	last vcn which to update in the mapping pairs array
1795 * @ctx:	search context describing the attribute to work on or NULL
1796 *
1797 * Create or update the mapping pairs arrays from the locked runlist of the
1798 * attribute @ni, i.e. @ni->rl, starting at vcn @first_vcn and finishing with
1799 * vcn @last_vcn.  The update can actually start before @first_vcn and finish
1800 * after @last_vcn but guarantees to at least include the range between
1801 * @first_vcn and @last_vcn, inclusive.
1802 *
1803 * This function is called from a variety of places after clusters have been
1804 * allocated to and/or freed from an attribute.  The runlist has already been
1805 * updated to reflect the allocated/freed clusters.  This functions takes the
1806 * modified runlist range and syncs it to the attribute record(s) by
1807 * compressing the runlist into mapping pairs array fragments and writing them
1808 * into the attribute record(s) of the attribute.
1809 *
1810 * This function also updates the attribute sizes using the values from the
1811 * ntfs inode @ni and syncs them to the base attribute record and if the
1812 * attribute has become sparse but the attribute record is not marked sparse or
1813 * the attribute is no longer sparse but the attribute record is marked sparse
1814 * the base attribute record is updated to reflect the changed state which
1815 * involves setting/clearing the sparse flag as well as the addition/removal of
1816 * the compressed size to the attribute record.  When the compressed size is
1817 * added this can lead to a larger portion of the mapping pairs array being
1818 * updated because there may not be enough space in the mft record to extend
1819 * the base attribute record to fit the compressed size.  When updating the
1820 * attribute record the compression state of the attribute is also taken into
1821 * consideration as the compressed size is used both with compressed and sparse
1822 * attributes.
1823 *
1824 * The update can involve the allocation/freeing of extent mft records and/or
1825 * extent attribute records.  If this happens the attribute list attribute in
1826 * the base ntfs inode @base_ni is updated appropriately both in memory and in
1827 * the attribute list attribute record in the base mft record.
1828 *
1829 * A @last_vcn of -1 means end of runlist and in that case the mapping pairs
1830 * array corresponding to the runlist starting at vcn @first_vcn and finishing
1831 * at the end of the runlist is updated.
1832 *
1833 * If @ctx is NULL, it is assumed that the attribute mft record is not mapped
1834 * and hence a new search context is allocated, the mft record is mapped, and
1835 * the attribute is looked up.  On completion the allocated search context is
1836 * released if it was allocated by ntfs_attr_mapping_pairs_update().
1837 *
1838 * Return 0 on success and errno on error.
1839 *
1840 * Locking: The runlist @ni->rl must be locked for writing, it remains locked
1841 *	    throughout, and is left locked upon return.
1842 */
#if 0
/*
 * NOTE(review): This function is compiled out (#if 0) and unfinished -- see
 * the "I AM HERE" marker near the end.  It does not build as-is: it uses an
 * undeclared variable 'mp_ofs' and references a bare 'm' where 'ctx->m'
 * appears to be meant.  Both spots are flagged inline below; resolve them
 * before re-enabling this code.
 */
errno_t ntfs_attr_mapping_pairs_update(ntfs_inode *base_ni, ntfs_inode *ni,
		VCN first_vcn, VCN last_vcn, ntfs_attr_search_ctx *ctx)
{
	VCN lowest_vcn, highest_vcn, stop_vcn;
	ntfs_volume *vol;
	ATTR_RECORD *a;
	errno_t err;
	BOOL mpa_is_valid, was_sparse, is_sparse;
	/* Local context used when the caller did not supply @ctx. */
	ntfs_attr_search_ctx attr_ctx;

	ntfs_debug("Entering for base mft_no 0x%llx, attribute type 0x%x, "
			"name len 0x%x, first_vcn 0x%llx, last_vcn 0x%llx, "
			"ctx is %spresent.",
			(unsigned long long)base_ni->mft_no,
			(unsigned)le32_to_cpu(ni->type), ni->name_len,
			(unsigned long long)first_vcn,
			(unsigned long long)last_vcn,
			ctx ? "" : "not ");
	vol = base_ni->vol;
	/*
	 * If no search context was specified use ours, initialize it, and look
	 * up the base attribute record so we can update the sizes, flags, and
	 * add/remove the compressed size if needed.
	 *
	 * We also need to look up the base attribute record if a search
	 * context was specified but it points to an extent attribute record.
	 */
	if (!ctx || ctx->a->lowest_vcn) {
		if (!ctx) {
			MFT_RECORD *base_m;

			err = ntfs_mft_record_map(base_ni, &base_m);
			if (err) {
				ntfs_error(vol->mp, "Failed to map mft_no "
						"0x%llx (error %d).",
						(unsigned long long)
						base_ni->mft_no, err);
				return err;
			}
			ctx = &attr_ctx;
			ntfs_attr_search_ctx_init(ctx, base_ni, base_m);
		}
		err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, 0,
				NULL, 0, ctx);
		if (err) {
			if (err == ENOENT)
				err = EIO;
			ntfs_error(vol->mp, "Failed to look up base attribute "
					"record in mft_no 0x%llx (error %d).",
					(unsigned long long)base_ni->mft_no,
					err);
			goto err;
		}
	}
	a = ctx->a;
	if (!NInoNonResident(ni) || !a->non_resident)
		panic("%s(): !NInoNonResident(ni) || !a->non_resident\n",
				__FUNCTION__);
	mpa_is_valid = TRUE;
	/*
	 * If the attribute was sparse and is no longer sparse or it was not
	 * sparse and is now sparse, update the sparse state and add/remove the
	 * compressed size.
	 */
	was_sparse = a->flags & ATTR_IS_SPARSE;
	is_sparse = NInoSparse(ni);
	if (was_sparse == is_sparse)
		goto sparse_done;
	if (is_sparse) {
		a->flags |= ATTR_IS_SPARSE;
		if (NInoCompressed(ni))
			goto sparse_done;
		if (a->flags & ATTR_IS_COMPRESSED)
			panic("%s(): a->flags & ATTR_IS_COMPRESSED\n",
					__FUNCTION__);
		/*
		 * Add the compressed size and set up the relevant fields in
		 * the attribute record.
		 *
		 * If there is enough space in the mft record and we do not
		 * need to rewrite the mapping pairs array in this attribute
		 * record, resize the attribute record and move the mapping
		 * pairs array.
		 *
		 * If there is not enough space to perform the resize then do
		 * not preserve the mapping pairs array in this attribute
		 * record.
		 *
		 * If there still is not enough space to add the compressed
		 * size move the attribute record to an extent mft record (this
		 * cannot be the only attribute record in the current mft
		 * record).  If we do this do not preserve the mapping pairs
		 * array so we can make better use of the extent mft record.
		 *
		 * Note we need to ensure we have already mapped the runlist
		 * fragment described by the current mapping pairs array if we
		 * are not going to preserve it or we would lose the data.
		 */
		a->compression_unit = 0;
		if (vol->major_ver <= 1)
			a->compression_unit = NTFS_COMPRESSION_UNIT;
restart_compressed_size_add:
		if ((first_vcn > sle64_to_cpu(a->highest_vcn) + 1) &&
				!(err = ntfs_attr_record_resize(ctx->m, a,
				le32_to_cpu(a->length) +
				sizeof(a->compressed_size)))) {
			/*
			 * Move everything at the offset of the compressed size
			 * to make space for the compressed size.
			 */
			memmove((u8*)a + offsetof(ATTR_RECORD,
					compressed_size) +
					sizeof(a->compressed_size), (u8*)a +
					offsetof(ATTR_RECORD, compressed_size),
					le32_to_cpu(a->length) - offsetof(
					ATTR_RECORD, compressed_size));
			/*
			 * Update the name offset to match the moved data.  If
			 * there is no name then set the name offset to the
			 * correct position instead of adding to a potentially
			 * incorrect value.
			 */
			if (a->name_length)
				a->name_offset = cpu_to_le16(le16_to_cpu(
						a->name_offset) +
						sizeof(a->compressed_size));
			else
				a->name_offset = const_cpu_to_le16(offsetof(
						ATTR_RECORD, compressed_size) +
						sizeof(a->compressed_size));
			/* Update the mapping pairs offset. */
			/*
			 * NOTE(review): 'mp_ofs' is never declared anywhere in
			 * this function -- TODO fix when re-enabling this code.
			 */
			mp_ofs = le16_to_cpu(a->mapping_pairs_offset) +
					sizeof(a->compressed_size);
			goto sparse_done;
		}
		/* Ensure this runlist fragment is mapped. */
		if (ni->allocated_size && (!ni->rl.elements ||
				ni->rl.rl->lcn == LCN_RL_NOT_MAPPED)) {
			err = ntfs_mapping_pairs_decompress(vol, a, &ni->rl);
			if (err) {
				ntfs_error(vol->mp, "Failed to decompress "
						"mapping pairs array (error "
						"%d).", err);
				goto err;
			}
		}
		/*
		 * Check whether the attribute is big enough to have the
		 * compressed size added to it.  We need at the very least
		 * space for the record header, the name, and a zero byte for
		 * an empty mapping pairs array and we need to allow for all
		 * the needed alignment padding.
		 */
		if (((sizeof(ATTR_RECORD) + a->name_length * sizeof(ntfschar) +
				7) & ~7) + 8 <= le32_to_cpu(a->length)) {
add_compressed_size:
			/*
			 * Move the name back to the new end of the attribute
			 * record header thus adding the compressed size.
			 */
			if (a->name_length)
				memmove((u8*)a + sizeof(ATTR_RECORD), (u8*)a +
						le16_to_cpu(a->name_offset),
						a->name_length *
						sizeof(ntfschar));
			/*
			 * Update the name offset and the mapping pairs offset
			 * to match the moved name.
			 */
			a->name_offset = const_cpu_to_le16(sizeof(ATTR_RECORD));
			a->mapping_pairs_offset = cpu_to_le16(
					(sizeof(ATTR_RECORD) + a->name_length *
					sizeof(ntfschar) + 7) & ~7);
			/*
			 * We no longer have a valid mapping pairs array in the
			 * current attribute record.
			 */
			mpa_is_valid = FALSE;
			goto sparse_done;
		}
		/*
		 * The attribute record is not big enough so try to extend it
		 * (in case we did not try to extend it above).
		 */
		err = ntfs_attr_record_resize(ctx->m, a,
				((sizeof(ATTR_RECORD) + a->name_length *
				sizeof(ntfschar) + 7) & ~7) + 8);
		if (!err)
			goto add_compressed_size;
		/*
		 * The attribute record cannot be the only one in the mft
		 * record if it is not large enough to hold an empty attribute
		 * record and there is not enough space to grow it.
		 */
		if (ntfs_attr_record_is_only_one(ctx->m, a))
			panic("%s(): ntfs_attr_is_only_one(ctx->m, a)\n",
					__FUNCTION__);
		/*
		 * This is our last resort.  Move the attribute to an extent
		 * mft record.
		 *
		 * First, add the attribute list attribute if it is not already
		 * present.
		 */
		if (!NInoAttrList(base_ni)) {
			err = ntfs_attr_list_add(base_ni, ctx->m, ctx);
			if (err || ctx->is_error) {
				if (!err)
					err = ctx->error;
				ntfs_error(vol->mp, "Failed to %s mft_no "
						"0x%llx (error %d).",
						ctx->is_error ?
						"remap extent mft record of" :
						"add attribute list attribute "
						"to", (unsigned long long)
						base_ni->mft_no, err);
				goto err;
			}
			/*
			 * The attribute location will have changed so update
			 * it from the search context.
			 */
			a = ctx->a;
			/*
			 * Retry the attribute record resize as we may now have
			 * enough space to add the compressed size.
			 *
			 * This can for example happen when the attribute was
			 * moved out to an extent mft record which has much
			 * more free space than the base mft record had or of
			 * course other attributes may have been moved out to
			 * extent mft records which has created enough space in
			 * the base mft record.
			 *
			 * If the attribute record was moved to an empty extent
			 * mft record this is the same case as if we moved the
			 * attribute record below so treat it the same, i.e. we
			 * do not preserve the mapping pairs array and use the
			 * maximum possible size for the mft record to allow us
			 * to consolidate the mapping pairs arrays.
			 */
			if (ntfs_attr_record_is_only_one(ctx->m, a))
				goto attr_is_only_one;
			goto restart_compressed_size_add;
		}
		/* Move the attribute to an extent mft record. */
		lck_rw_lock_shared(&base_ni->attr_list_rl.lock);
		err = ntfs_attr_record_move(ctx);
		lck_rw_unlock_shared(&base_ni->attr_list_rl.lock);
		if (err) {
			ntfs_error(vol->mp, "Failed to move attribute extent "
					"from mft record 0x%llx to an extent "
					"mft record (error %d).",
					(unsigned long long)ctx->ni->mft_no,
					err);
			/*
			 * We could try to remove the attribute list attribute
			 * if we added it above but this will require
			 * attributes to be moved back into the base mft record
			 * from extent mft records so is a lot of work and
			 * given we are in an error code path and given that it
			 * is ok to just leave the inode with an attribute list
			 * attribute we do not bother and just bail out.
			 */
			goto err;
		}
		/*
		 * The attribute location will have changed so update it from
		 * the search context.
		 */
		a = ctx->a;
attr_is_only_one:
		/*
		 * We now have enough space to add the compressed size so
		 * resize the attribute record.  Note we do not want to
		 * preserve the mapping pairs array as we will have
		 * significanly more space in the extent mft record thus we
		 * want to consolidate the mapping pairs arrays which is why we
		 * resize straight to the maximum possible size for the mft
		 * record.
		 */
		/*
		 * NOTE(review): the bare 'm' below is undeclared in this
		 * function; 'ctx->m' appears to be intended -- verify before
		 * re-enabling this code.
		 */
		err = ntfs_attr_record_resize(ctx->m, a,
				le32_to_cpu(m->bytes_allocated) -
				le32_to_cpu(m->bytes_in_use) +
				le32_to_cpu(a->length));
		if (err)
			panic("%s(): err - resize failed\n", __FUNCTION__);
		if (((sizeof(ATTR_RECORD) + a->name_length * sizeof(ntfschar) +
				7) & ~7) + 8 > le32_to_cpu(a->length))
			panic("%s(): attribute record is still too small\n",
					__FUNCTION__);
		goto add_compressed_size;
	}
	/* The attribute is becoming non-sparse. */
	a->flags &= ~ATTR_IS_SPARSE;
	if (NInoCompressed(ni))
		goto sparse_done;
	if (a->flags & ATTR_IS_COMPRESSED)
		panic("%s(): a->flags & ATTR_IS_COMPRESSED\n", __FUNCTION__);
	/*
	 * Remove the compressed size and set up the relevant fields in the
	 * attribute record.
	 *
	 * If we do not need to rewrite the mapping pairs array in this
	 * attribute record, move the mapping pairs array and then resize the
	 * attribute record.
	 *
	 * Note we need to ensure we have already mapped the runlist fragment
	 * described by the current mapping pairs array if we are not going to
	 * preserve it or we would lose the data.
	 */
	a->compression_unit = 0;
	if (first_vcn > sle64_to_cpu(a->highest_vcn) + 1) {
		/*
		 * Move everything after the compressed size forward to the
		 * offset of the compressed size thus deleting the compressed
		 * size.
		 */
		memmove((u8*)a + offsetof(ATTR_RECORD, compressed_size),
				(u8*)a + offsetof(ATTR_RECORD,
				compressed_size) + sizeof(a->compressed_size),
				le32_to_cpu(a->length) - (offsetof(ATTR_RECORD,
				compressed_size) + sizeof(a->compressed_size)));
		/*
		 * Update the name offset and the mapping pairs offset to match
		 * the moved data.  If there is no name then set the name
		 * offset to the correct position instead of subtracting from a
		 * potentially incorrect value.
		 */
		if (!a->name_length)
			a->name_offset = const_cpu_to_le16(offsetof(ATTR_RECORD,
					compressed_size));
		else
			a->name_offset = cpu_to_le16(
					le16_to_cpu(a->name_offset) -
					sizeof(a->compressed_size));
		a->mapping_pairs_offset = cpu_to_le16(
				le16_to_cpu(a->mapping_pairs_offset) -
				sizeof(a->compressed_size));
		/*
		 * Shrink the attribute record to reflect the removal of the
		 * compressed size.  Note this cannot fail since we are making
		 * the attribute smaller thus by definition there there is
		 * enough space to do so.
		 */
		err = ntfs_attr_record_resize(ctx->m, a, le32_to_cpu(
				a->length) - sizeof(a->compressed_size));
		if (err)
			panic("%s(): err\n", __FUNCTION__);
		goto sparse_done;
	}
	/* Ensure this runlist fragment is mapped. */
	if (ni->allocated_size && (!ni->rl.elements ||
			ni->rl.rl->lcn == LCN_RL_NOT_MAPPED)) {
		err = ntfs_mapping_pairs_decompress(vol, a, &ni->rl);
		if (err) {
			ntfs_error(vol->mp, "Failed to decompress mapping "
					"pairs array (error %d).", err);
			goto err;
		}
	}
	mpa_is_valid = FALSE;
	/*
	 * Move the name forward to the offset of the compressed size thus
	 * deleting the compressed size.
	 */
	if (a->name_length)
		memmove((u8*)a + offsetof(ATTR_RECORD, compressed_size),
				(u8*)a + le16_to_cpu(a->name_offset),
				a->name_length * sizeof(ntfschar));
	/*
	 * Update the name offset and the mapping pairs offset to match the
	 * moved name.
	 */
	a->name_offset = const_cpu_to_le16(
			offsetof(ATTR_RECORD, compressed_size));
	a->mapping_pairs_offset = cpu_to_le16(
			(offsetof(ATTR_RECORD, compressed_size) +
			(a->name_length * sizeof(ntfschar)) + 7) & ~7);
sparse_done:
	/*
	 * Update the attribute sizes.
	 *
	 * TODO: Need to figure out whether we really need to update the data
	 * and initialized sizes or whether updating just the allocated and
	 * compressed sizes is sufficient in which case we can save a few CPU
	 * cycles by not updating the data and initialized sizes here.
	 */
	lck_spin_lock(&ni->size_lock);
	a->allocated_size = cpu_to_sle64(ni->allocated_size);
	a->data_size = cpu_to_sle64(ni->data_size);
	a->initialized_size = cpu_to_sle64(ni->initialized_size);
	if (a->flags & (ATTR_IS_COMPRESSED | ATTR_IS_SPARSE))
		a->compressed_size = cpu_to_sle64(ni->compressed_size);
	lck_spin_unlock(&ni->size_lock);
	/*
	 * If the current mapping pairs array is valid and the first vcn at
	 * which we need to update the mapping pairs array is not in this
	 * attribute extent, look up the attribute extent containing the first
	 * vcn.
	 */
	if (mpa_is_valid && first_vcn > sle64_to_cpu(a->highest_vcn) + 1) {
		err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
				first_vcn, NULL, 0, ctx);
		if (err) {
			if (err == ENOENT)
				err = EIO;
			ntfs_error(vol->mp, "Failed to look up extent "
					"attribute record containing VCN "
					"0x%llx in mft_no 0x%llx (error %d).",
					(unsigned long long)first_vcn,
					(unsigned long long)base_ni->mft_no,
					err);
			goto err;
		}
		a = ctx->a;
	}
	/*
	 * We need to rebuild the mapping pairs array in this attribute extent.
	 * But first, check if we can grow the attribute extent.  If this is
	 * the base extent and the attribute is not sparse nor compressed and
	 * it is allowed to be sparse then reserve the size of the compressed
	 * size field in the mft record so it is easier to make the attribute
	 * sparse later on.
	 *
	 * FIXME: But we don't want to do that if the attribute extent is in
	 * the base mft record and the attribute is $DATA or $INDEX_ALLOCATION,
	 * etc as we want to keep the first extent of theese base attribute
	 * extents in the base mft record thus we have to keep them small to
	 * allow the attribute list attribute to grow over time.
	 *
	 * FIXME: Need to make sure we map any unmapped regions of the runlist
	 * when determining the size of the mapping pairs array.
	 *
	 * FIXME: If we don't impose a last vcn when getting the size it would
	 * just cause the entirety of the mapping pairs array starting with the
	 * current extent to be mapped in, which is not necessarilly a bad
	 * thing as it will then be already mapped for all subsequent writes.
	 *
	 * FIXME: We do not want to keep rewriting the entire mapping pairs
	 * array every time we fill a hole so need to be careful when
	 * consolidating the mapping pairs array fragments.  OTOH we do not
	 * want to end up with millions of very short attribute extents so need
	 * to be careful about that, too.
	 */
// TODO: I AM HERE:
	ntfs_error(vol->mp, "FIXME: TODO...");
	return ENOTSUP;
	/* NOTE(review): unreachable until the TODO above is implemented. */
	ntfs_debug("Done.");
	return 0;
err:
	/*
	 * If we mapped the mft record and looked up the attribute, release the
	 * mapped mft record(s) here.
	 */
	if (ctx == &attr_ctx) {
		if (ctx->ni != base_ni)
			ntfs_extent_mft_record_unmap(ctx->ni);
		ntfs_mft_record_unmap(base_ni);
	}
	return err;
}
#endif
2307
2308/**
2309 * ntfs_resident_attr_record_insert_internal - insert a resident attribute
2310 * @m:		mft record in which to insert the resident attribute
2311 * @a:		attribute in front of which to insert the new attribute
2312 * @type:	attribute type of new attribute
2313 * @name:	Unicode name of new attribute
2314 * @name_len:	Unicode character size of name of new attribute
2315 * @val_len:	byte size of attribute value of new attribute
2316 *
2317 * Insert a new resident attribute in the mft record @m, in front of the
2318 * existing attribute record @a.  The new attribute is of type @type, and has a
2319 * name of @name which is @name_len Unicode characters long.  The new attribute
2320 * value is @val_len bytes and is initialized to zero.
2321 *
2322 * Note: If the inode uses the attribute list attribute the caller is
2323 * responsible for adding an entry for the inserted attribute to the attribute
2324 * list attribute.
2325 *
2326 * Return 0 on success and errno on error.  The following error codes are
2327 * defined:
2328 *	ENOSPC - Not enough space in the mft record @m.
2329 *
2330 * Note: On error, no modifications have been performed whatsoever.
2331 */
2332errno_t ntfs_resident_attr_record_insert_internal(MFT_RECORD *m,
2333		ATTR_RECORD *a, const ATTR_TYPE type, const ntfschar *name,
2334		const u8 name_len, const u32 val_len)
2335{
2336	unsigned name_ofs, val_ofs;
2337
2338	/*
2339	 * Calculate the offset into the new attribute at which the attribute
2340	 * name begins.  The name is placed directly after the resident
2341	 * attribute record itself.
2342	 */
2343	name_ofs = offsetof(ATTR_RECORD, reservedR) + sizeof(a->reservedR);
2344	/*
2345	 * Calculate the offset into the new attribute at which the attribute
2346	 * value begins.  The attribute value is placed after the name aligned
2347	 * to an 8-byte boundary.
2348	 */
2349	val_ofs = name_ofs + (((name_len << NTFSCHAR_SIZE_SHIFT) + 7) & ~7);
2350	/*
2351	 * Work out the size for the attribute record.  We simply take the
2352	 * offset to the attribute value we worked out above and add the size
2353	 * of the attribute value in bytes aligned to an 8-byte boundary.  Note
2354	 * we do not need to do the alignment as ntfs_attr_record_make_space()
2355	 * does it anyway.
2356	 */
2357	if (ntfs_attr_record_make_space(m, a, val_ofs + val_len))
2358		return ENOSPC;
2359	/*
2360	 * Now setup the new attribute record.  The entire attribute has been
2361	 * zeroed and the length of the attribute record has been set up by
2362	 * ntfs_attr_record_make_space().
2363	 */
2364	a->type = type;
2365	a->name_length = name_len;
2366	a->name_offset = cpu_to_le16(name_ofs);
2367	a->instance = m->next_attr_instance;
2368	/*
2369	 * Increment the next attribute instance number in the mft record as we
2370	 * consumed the old one.
2371	 */
2372	m->next_attr_instance = cpu_to_le16(
2373			(le16_to_cpu(m->next_attr_instance) + 1) & 0xffff);
2374	a->value_length = cpu_to_le32(val_len);
2375	a->value_offset = cpu_to_le16(val_ofs);
2376	if (type == AT_FILENAME)
2377		a->resident_flags = RESIDENT_ATTR_IS_INDEXED;
2378	/* Copy the attribute name into place. */
2379	if (name_len)
2380		memcpy((u8*)a + name_ofs, name,
2381				name_len << NTFSCHAR_SIZE_SHIFT);
2382	return 0;
2383}
2384
2385/**
2386 * ntfs_resident_attr_record_insert - insert a resident attribute record
2387 * @ni:		base ntfs inode to which the attribute is being added
2388 * @ctx:	search context describing where to insert the resident attribute
2389 * @type:	attribute type of new attribute
2390 * @name:	Unicode name of new attribute
2391 * @name_len:	Unicode character size of name of new attribute
2392 * @val:	attribute value of new attribute (optional, can be NULL)
2393 * @val_len:	byte size of attribute value of new attribute
2394 *
2395 * Insert a new resident attribute in the base ntfs inode @ni at the position
2396 * indicated by the attribute search context @ctx and add an attribute list
2397 * attribute entry for it if the inode uses the attribute list attribute.
2398 *
2399 * The new attribute is of type @type, has a name of @name which is @name_len
2400 * Unicode characters long, and has a value of @val with size @val_len bytes.
2401 * If @val is NULL, the value of size @val_len is zeroed.
2402 *
2403 * If @val is NULL, the caller is responsible for marking the extent mft record
2404 * the attribute is in dirty.  We do it this way because we assume the caller
2405 * is going to modify the attribute further and will then mark it dirty.
2406 *
2407 * If the attribute is in the base mft record then the caller is always
2408 * responsible for marking the mft record dirty.
2409 *
2410 * Return 0 on success and errno on error.
2411 *
2412 * WARNING: Regardless of whether success or failure is returned, you need to
2413 *	    check @ctx->is_error and if 1 the @ctx is no longer valid, i.e. you
2414 *	    need to either call ntfs_attr_search_ctx_reinit() or
2415 *	    ntfs_attr_search_ctx_put() on it.  In that case @ctx->error will
2416 *	    give you the error code for why the mapping of the inode failed.
2417 */
2418errno_t ntfs_resident_attr_record_insert(ntfs_inode *ni,
2419		ntfs_attr_search_ctx *ctx, const ATTR_TYPE type,
2420		const ntfschar *name, const u8 name_len,
2421		const void *val, const u32 val_len)
2422{
2423	ntfs_volume *vol;
2424	MFT_RECORD *base_m, *m;
2425	ATTR_RECORD *a;
2426	ATTR_LIST_ENTRY *al_entry;
2427	unsigned name_ofs, val_ofs, al_entry_used, al_entry_len, new_al_size;
2428	unsigned new_al_alloc;
2429	errno_t err;
2430	BOOL al_entry_added;
2431
2432	ntfs_debug("Entering for mft_no 0x%llx, attribute type 0x%x, name_len "
2433			"0x%x, val_len 0x%x.", (unsigned long long)ni->mft_no,
2434			(unsigned)le32_to_cpu(type), name_len, val_len);
2435	vol = ni->vol;
2436	/*
2437	 * Calculate the offset into the new attribute at which the attribute
2438	 * name begins.  The name is placed directly after the resident
2439	 * attribute record itself.
2440	 */
2441	name_ofs = offsetof(ATTR_RECORD, reservedR) + sizeof(a->reservedR);
2442	/*
2443	 * Calculate the offset into the new attribute at which the attribute
2444	 * value begins.  The attribute value is placed after the name aligned
2445	 * to an 8-byte boundary.
2446	 */
2447	val_ofs = name_ofs + (((name_len << NTFSCHAR_SIZE_SHIFT) + 7) & ~7);
2448	/*
2449	 * Work out the size for the attribute record.  We simply take the
2450	 * offset to the attribute value we worked out above and add the size
2451	 * of the attribute value in bytes aligned to an 8-byte boundary.  Note
2452	 * we do not need to do the alignment as ntfs_attr_record_make_space()
2453	 * does it anyway.
2454	 */
2455	/*
2456	 * The current implementation of ntfs_attr_lookup() will always return
2457	 * pointing into the base mft record when an attribute is not found.
2458	 */
2459	base_m = ctx->m;
2460retry:
2461	if (ni != ctx->ni)
2462		panic("%s(): ni != ctx->ni\n", __FUNCTION__);
2463	m = ctx->m;
2464	a = ctx->a;
2465	err = ntfs_attr_record_make_space(m, a, val_ofs + val_len);
2466	if (err) {
2467		ntfs_inode *eni;
2468
2469		if (err != ENOSPC)
2470			panic("%s(): err != ENOSPC\n", __FUNCTION__);
2471		/*
2472		 * There was not enough space in the mft record to insert the
2473		 * new attribute record which means we will need to insert it
2474		 * into an extent mft record.
2475		 *
2476		 * To avoid bugs and impossible situations, check that the
2477		 * attribute is not already the only attribute in the mft
2478		 * record otherwise moving it would not give us anything.
2479		 */
2480		if (ntfs_attr_record_is_only_one(m, a))
2481			panic("%s(): ntfs_attr_record_is_only_one(m, a)\n",
2482					__FUNCTION__);
2483		/*
2484		 * Before we can allocate an extent mft record, we need to
2485		 * ensure that the inode has an attribute list attribute.
2486		 */
2487		if (!NInoAttrList(ni)) {
2488			err = ntfs_attr_list_add(ni, m, NULL);
2489			if (err) {
2490				ntfs_error(vol->mp, "Failed to add attribute "
2491						"list attribute to mft_no "
2492						"0x%llx (error %d).",
2493						(unsigned long long)ni->mft_no,
2494						err);
2495				return err;
2496			}
2497			/*
2498			 * Adding the attribute list attribute may have
2499			 * generated enough space in the base mft record to
2500			 * fit the attribute so try again.
2501			 */
2502			ntfs_attr_search_ctx_reinit(ctx);
2503			err = ntfs_attr_lookup(type, name, name_len, 0, val,
2504					val_len, ctx);
2505			if (err == ENOENT) {
2506				/*
2507				 * The current implementation of
2508				 * ntfs_attr_lookup() will always return
2509				 * pointing into the base mft record when an
2510				 * attribute is not found.
2511				 */
2512				if (m != ctx->m)
2513					panic("%s(): m != ctx->m\n",
2514							__FUNCTION__);
2515				goto retry;
2516			}
2517			/*
2518			 * We cannot have found the attribute as we have
2519			 * exclusive access and know that it does not exist
2520			 * already.
2521			 */
2522			if (!err)
2523				panic("%s(): !err\n", __FUNCTION__);
2524			/*
2525			 * Something has gone wrong.  Note we have to bail out
2526			 * as a failing attribute lookup indicates corruption
2527			 * and/or disk failure and/or not enough memory all of
2528			 * which would prevent us from rolling back the
2529			 * attribute list attribute addition.
2530			 */
2531			ntfs_error(vol->mp, "Failed to add attribute type "
2532					"0x%x to mft_no 0x%llx because looking "
2533					"up the attribute failed (error %d).",
2534					(unsigned)le32_to_cpu(type),
2535					(unsigned long long)ni->mft_no, -err);
2536			return err;
2537		}
2538		/*
2539		 * We now need to allocate a new extent mft record, attach it
2540		 * to the base ntfs inode and set up the search context to
2541		 * point to it, then insert the new attribute into it.
2542		 */
2543		err = ntfs_mft_record_alloc(vol, NULL, NULL, ni, &eni, &m, &a);
2544		if (err) {
2545			ntfs_error(vol->mp, "Failed to add attribute type "
2546					"0x%x to mft_no 0x%llx because "
2547					"allocating a new extent mft record "
2548					"failed (error %d).",
2549					(unsigned)le32_to_cpu(type),
2550					(unsigned long long)ni->mft_no, err);
2551			/*
2552			 * If we added the attribute list attribute above we
2553			 * now remove it again.  This may require moving
2554			 * attributes back into the base mft record so is not a
2555			 * trivial amount of work and in the end it does not
2556			 * really matter if we leave an inode with an attribute
2557			 * list attribute that does not really need it.  So it
2558			 * will only be removed if there are no extent mft
2559			 * records at all, i.e. if adding the attribute list
2560			 * attribute did not cause any attribute records to be
2561			 * moved out to extent mft records.
2562			 */
2563			al_entry_added = FALSE;
2564			al_entry = NULL;
2565			goto remove_al;
2566		}
2567		ctx->m = m;
2568		ctx->a = a;
2569		ctx->ni = eni;
2570		/*
2571		 * Make space for the new attribute.  This cannot fail as we
2572		 * now have an empty mft record which by definition can hold
2573		 * a maximum size resident attribute record.
2574		 */
2575		err = ntfs_attr_record_make_space(m, a, val_ofs + val_len);
2576		if (err)
2577			panic("%s(): err (ntfs_attr_record_make_space())\n",
2578					__FUNCTION__);
2579	}
2580	/*
2581	 * Now setup the new attribute record.  The entire attribute has been
2582	 * zeroed and the length of the attribute record has been set up by
2583	 * ntfs_attr_record_make_space().
2584	 */
2585	a->type = type;
2586	a->name_length = name_len;
2587	a->name_offset = const_cpu_to_le16(offsetof(ATTR_RECORD, reservedR) +
2588			sizeof(a->reservedR));
2589	a->instance = m->next_attr_instance;
2590	/*
2591	 * Increment the next attribute instance number in the mft record as we
2592	 * consumed the old one.
2593	 */
2594	m->next_attr_instance = cpu_to_le16(
2595			(le16_to_cpu(m->next_attr_instance) + 1) & 0xffff);
2596	a->value_length = cpu_to_le32(val_len);
2597	a->value_offset = cpu_to_le16(val_ofs);
2598	if (type == AT_FILENAME)
2599		a->resident_flags = RESIDENT_ATTR_IS_INDEXED;
2600	/* Copy the attribute name into place. */
2601	if (name_len)
2602		memcpy((u8*)a + name_ofs, name,
2603				name_len << NTFSCHAR_SIZE_SHIFT);
2604	/* If a value is specified, copy it into place. */
2605	if (val) {
2606		memcpy((u8*)a + le16_to_cpu(a->value_offset), val, val_len);
2607		/*
2608		 * Ensure the mft record containing the new filename attribute
2609		 * gets written out.
2610		 */
2611		if (ctx->ni != ni)
2612			NInoSetMrecNeedsDirtying(ctx->ni);
2613	}
2614	/*
2615	 * If the inode does not use the attribute list attribute we are done.
2616	 *
2617	 * If the inode uses the attribute list attribute (including the case
2618	 * where we just created it), we need to add an attribute list
2619	 * attribute entry for the attribute.
2620	 */
2621	if (!NInoAttrList(ni))
2622		goto done;
2623	/* Add an attribute list attribute entry for the inserted attribute. */
2624	al_entry = ctx->al_entry;
2625	al_entry_used = offsetof(ATTR_LIST_ENTRY, name) +
2626			(name_len << NTFSCHAR_SIZE_SHIFT);
2627	al_entry_len = (al_entry_used + 7) & ~7;
2628	new_al_size = ni->attr_list_size + al_entry_len;
2629	/* Out of bounds checks. */
2630	if ((u8*)al_entry < ni->attr_list ||
2631			(u8*)al_entry > ni->attr_list + new_al_size ||
2632			(u8*)al_entry + al_entry_len >
2633			ni->attr_list + new_al_size) {
2634		/* Inode is corrupt. */
2635		ntfs_error(vol->mp, "Mft_no 0x%llx is corrupt.  Run chkdsk.",
2636				(unsigned long long)ni->mft_no);
2637		err = EIO;
2638		goto undo;
2639	}
2640	err = ntfs_attr_size_bounds_check(vol, AT_ATTRIBUTE_LIST, new_al_size);
2641	if (err) {
2642		if (err == ERANGE) {
2643			ntfs_error(vol->mp, "Cannot insert attribute into "
2644					"mft_no 0x%llx because the attribute "
2645					"list attribute would become too "
2646					"large.  You need to defragment your "
2647					"volume and then try again.",
2648					(unsigned long long)ni->mft_no);
2649			err = ENOSPC;
2650		} else {
2651			ntfs_error(vol->mp, "Attribute list attribute is "
2652					"unknown on the volume.  The volume "
2653					"is corrupt.  Run chkdsk.");
2654			NVolSetErrors(vol);
2655			err = EIO;
2656		}
2657		goto undo;
2658	}
2659	/*
2660	 * Reallocate the memory buffer if needed and create space for the new
2661	 * entry.
2662	 */
2663	new_al_alloc = (new_al_size + NTFS_ALLOC_BLOCK - 1) &
2664			~(NTFS_ALLOC_BLOCK - 1);
2665	if (new_al_alloc > ni->attr_list_alloc) {
2666		u8 *tmp, *al, *al_end;
2667		unsigned al_entry_ofs;
2668
2669		tmp = OSMalloc(new_al_alloc, ntfs_malloc_tag);
2670		if (!tmp) {
2671			ntfs_error(vol->mp, "Not enough memory to extend "
2672					"attribute list attribute of mft_no "
2673					"0x%llx.",
2674					(unsigned long long)ni->mft_no);
2675			err = ENOMEM;
2676			goto undo;
2677		}
2678		al = ni->attr_list;
2679		al_entry_ofs = (u8*)al_entry - al;
2680		al_end = al + ni->attr_list_size;
2681		memcpy(tmp, al, al_entry_ofs);
2682		if ((u8*)al_entry < al_end)
2683			memcpy(tmp + al_entry_ofs + al_entry_len,
2684					al + al_entry_ofs,
2685					ni->attr_list_size - al_entry_ofs);
2686		al_entry = ctx->al_entry = (ATTR_LIST_ENTRY*)(tmp +
2687				al_entry_ofs);
2688		OSFree(ni->attr_list, ni->attr_list_alloc, ntfs_malloc_tag);
2689		ni->attr_list_alloc = new_al_alloc;
2690		ni->attr_list = tmp;
2691	} else if ((u8*)al_entry < ni->attr_list + ni->attr_list_size)
2692		memmove((u8*)al_entry + al_entry_len, al_entry,
2693				ni->attr_list_size - ((u8*)al_entry -
2694				ni->attr_list));
2695	ni->attr_list_size = new_al_size;
2696	/* Set up the attribute list entry. */
2697	al_entry->type = type;
2698	al_entry->length = cpu_to_le16(al_entry_len);
2699	al_entry->name_length = name_len;
2700	al_entry->name_offset = offsetof(ATTR_LIST_ENTRY, name);
2701	al_entry->lowest_vcn = 0;
2702	al_entry->mft_reference = MK_LE_MREF(ctx->ni->mft_no, ctx->ni->seq_no);
2703	al_entry->instance = a->instance;
2704	/* Copy the attribute name into place. */
2705	if (name_len)
2706		memcpy((u8*)&al_entry->name, name,
2707				name_len << NTFSCHAR_SIZE_SHIFT);
2708	/* For tidyness, zero any unused space. */
2709	if (al_entry_len != al_entry_used) {
2710		if (al_entry_len < al_entry_used)
2711			panic("%s(): al_entry_len < al_entry_used\n",
2712					__FUNCTION__);
2713		memset((u8*)al_entry + al_entry_used, 0,
2714				al_entry_len - al_entry_used);
2715	}
2716	/*
2717	 * Extend the attribute list attribute and copy in the modified
2718	 * value from the cache.
2719	 */
2720	err = ntfs_attr_list_sync_extend(ni, base_m,
2721			(u8*)al_entry - ni->attr_list, ctx);
2722	if (err) {
2723		ntfs_error(vol->mp, "Failed to extend attribute list "
2724				"attribute of mft_no 0x%llx (error %d).",
2725				(unsigned long long)ni->mft_no, err);
2726		al_entry_added = TRUE;
2727		goto undo_al;
2728	}
2729done:
2730	ntfs_debug("Done.");
2731	return 0;
2732undo:
2733	al_entry_added = FALSE;
2734undo_al:
2735	/*
2736	 * Need to remove the attribute again or free the extent mft record if
2737	 * there are no attributes remaining in it.
2738	 */
2739	if (m == base_m || !ntfs_attr_record_is_only_one(m, a)) {
2740		ntfs_attr_record_delete_internal(m, a);
2741		/*
2742		 * If the attribute was not in the base mft record mark the
2743		 * extent mft record dirty so it gets written out later.  If
2744		 * the attribute was in the base mft record it will be marked
2745		 * dirty later.
2746		 *
2747		 * We also unmap the extent mft record and we set @ctx->ni to
2748		 * equal the base inode @ni so that the search context is
2749		 * initialized from scratch or simply freed if the caller
2750		 * reinitializes or releases the search context respectively.
2751		 */
2752		if (m != base_m) {
2753			NInoSetMrecNeedsDirtying(ctx->ni);
2754			ntfs_extent_mft_record_unmap(ctx->ni);
2755			ctx->ni = ni;
2756		}
2757	} else {
2758		int err2;
2759		BOOL al_needed;
2760
2761		err2 = ntfs_extent_mft_record_free(ni, ctx->ni, m);
2762		if (err2) {
2763			/*
2764			 * Ignore the error as we just end up with an unused
2765			 * mft record that is marked in use.
2766			 */
2767			ntfs_error(vol->mp, "Failed to free extent mft_no "
2768					"0x%llx (error %d).  Unmount and run "
2769					"chkdsk to recover the lost inode.",
2770					(unsigned long long)ctx->ni->mft_no,
2771					err2);
2772			NVolSetErrors(vol);
2773			/*
2774			 * Relese the extent mft record after dirtying it thus
2775			 * simulating the effect of freeing it.
2776			 */
2777			NInoSetMrecNeedsDirtying(ctx->ni);
2778			ntfs_extent_mft_record_unmap(ctx->ni);
2779		}
2780		/*
2781		 * The attribute search context still points to the no longer
2782		 * mapped extent inode thus we need to change it to point to
2783		 * the base inode instead so the context can be reinitialized
2784		 * or released safely.
2785		 */
2786		ctx->ni = ni;
2787remove_al:
2788		/*
2789		 * Check the attribute list attribute.  If there are no other
2790		 * attribute list attribute entries referencing extent mft
2791		 * records delete the attribute list attribute altogether.
2792		 *
2793		 * If this fails it does not matter as we simply retain the
2794		 * attribute list attribute so we ignore the error and go on to
2795		 * delete the attribute list attribute entry instead.
2796		 *
2797		 * If there are other attribute list attribute entries
2798		 * referencing extent mft records we still need the attribute
2799		 * list attribute thus we go on to delete the attribute list
2800		 * entry corresponding to the attribute record we just deleted
2801		 * by freeing its extent mft record.
2802		 */
2803		err2 = ntfs_attr_list_is_needed(ni,
2804				al_entry_added ? al_entry : NULL, &al_needed);
2805		if (err2)
2806			ntfs_warning(vol->mp, "Failed to determine if "
2807					"attribute list attribute of mft_no "
2808					"0x%llx if still needed (error %d).  "
2809					"Assuming it is still needed and "
2810					"continuing.",
2811					(unsigned long long)ni->mft_no, err2);
2812		else if (!al_needed) {
2813			/*
2814			 * No more extent mft records are in use.  Delete the
2815			 * attribute list attribute.
2816			 */
2817			ntfs_attr_search_ctx_reinit(ctx);
2818			err2 = ntfs_attr_list_delete(ni, ctx);
2819			if (!err2) {
2820				/*
2821				 * We deleted the attribute list attribute and
2822				 * this will have updated the base inode
2823				 * appropriately thus we have restored
2824				 * everything as it was before.
2825				 */
2826				return err;
2827			}
2828			ntfs_warning(vol->mp, "Failed to delete attribute "
2829					"list attribute of mft_no 0x%llx "
2830					"(error %d).  Continuing using "
2831					"alternative error recovery method.",
2832					(unsigned long long)ni->mft_no, err2);
2833		}
2834	}
2835	/*
2836	 * Both @ctx and @ni are now invalid and cannot be used any more which
2837	 * is fine as we have finished dealing with the attribute record.
2838	 *
2839	 * We now need to delete the corresponding attribute list attribute
2840	 * entry if we created it.
2841	 *
2842	 * Then we need to rewrite the attribute list attribute again because
2843	 * ntfs_attr_list_sync_extend() may have left it in an indeterminate
2844	 * state.
2845	 */
2846	if (al_entry_added) {
2847		int err2;
2848
2849		ntfs_attr_list_entry_delete(ni, al_entry);
2850		ntfs_attr_search_ctx_reinit(ctx);
2851		err2 = ntfs_attr_list_sync_shrink(ni, 0, ctx);
2852		if (err2) {
2853			ntfs_error(vol->mp, "Failed to restore attribute list "
2854					"attribute in base mft_no 0x%llx "
2855					"(error %d).  Leaving inconsistent "
2856					"metadata.  Unmount and run chkdsk.",
2857					(unsigned long long)ni->mft_no, err2);
2858			NVolSetErrors(vol);
2859		}
2860	}
2861	/* Make sure any changes are written out. */
2862	NInoSetMrecNeedsDirtying(ni);
2863	return err;
2864}
2865
2866/**
2867 * ntfs_resident_attr_value_resize - resize the value of a resident attribute
2868 * @m:		mft record containing attribute record
2869 * @a:		attribute record whose value to resize
2870 * @new_size:	new size in bytes to which to resize the attribute value of @a
2871 *
2872 * Resize the value of the attribute @a in the mft record @m to @new_size
2873 * bytes.  If the value is made bigger, the newly allocated space is cleared.
2874 *
2875 * Return 0 on success and errno on error.  The following error codes are
2876 * defined:
2877 *	ENOSPC	- Not enough space in the mft record @m to perform the resize.
2878 *
2879 * Note: On error, no modifications have been performed whatsoever.
2880 *
2881 * Warning: If you make a record smaller without having copied all the data you
2882 *	    are interested in the data may be overwritten.
2883 */
2884errno_t ntfs_resident_attr_value_resize(MFT_RECORD *m, ATTR_RECORD *a,
2885		const u32 new_size)
2886{
2887	const u32 old_size = le32_to_cpu(a->value_length);
2888
2889	/* Resize the resident part of the attribute record. */
2890	if (ntfs_attr_record_resize(m, a, le16_to_cpu(a->value_offset) +
2891			new_size))
2892		return ENOSPC;
2893	/*
2894	 * The resize succeeded!  If we made the attribute value bigger, clear
2895	 * the area between the old size and @new_size.
2896	 */
2897	if (new_size > old_size)
2898		bzero((u8*)a + le16_to_cpu(a->value_offset) + old_size,
2899				new_size - old_size);
2900	/* Finally update the length of the attribute value. */
2901	a->value_length = cpu_to_le32(new_size);
2902	return 0;
2903}
2904
2905/**
2906 * ntfs_attr_make_non_resident - convert a resident to a non-resident attribute
2907 * @ni:		ntfs inode describing the attribute to convert
2908 *
2909 * Convert the resident ntfs attribute described by the ntfs inode @ni to a
2910 * non-resident one.
2911 *
2912 * Return 0 on success and errno on error.  The following error return codes
2913 * are defined:
2914 *	EPERM	- The attribute is not allowed to be non-resident.
2915 *	ENOMEM	- Not enough memory.
2916 *	ENOSPC	- Not enough disk space.
2917 *	EINVAL	- Attribute not defined on the volume.
 *	EIO	- I/O error or other error.
2919 *
2920 * Note that if an error other than EPERM is returned it is possible that the
2921 * attribute has been made non-resident but for example the attribute list
2922 * attribute failed to be written out thus the base mft record is now corrupt
2923 * and all operations should be aborted by the caller.
2924 *
2925 * Locking: The caller must hold @ni->lock on the inode for writing.
2926 */
2927errno_t ntfs_attr_make_non_resident(ntfs_inode *ni)
2928{
2929	leMFT_REF mref;
2930	s64 new_size, data_size;
2931	ntfs_volume *vol = ni->vol;
2932	ntfs_inode *base_ni;
2933	MFT_RECORD *base_m, *m;
2934	ATTR_RECORD *a;
2935	upl_t upl;
2936	upl_page_info_array_t pl;
2937	u8 *kaddr, *al_end;
2938	unsigned mp_size, mp_ofs, name_ofs, arec_size, attr_size, bytes_needed;
2939	unsigned al_ofs = 0;
2940	errno_t err, err2;
2941	le32 type;
2942	u8 old_res_attr_flags;
2943	ntfs_attr_search_ctx ctx, actx;
2944	BOOL al_dirty = FALSE;
2945
2946	/* Check that the attribute is allowed to be non-resident. */
2947	err = ntfs_attr_can_be_non_resident(vol, ni->type);
2948	if (err) {
2949		if (err == EPERM)
2950			ntfs_debug("Attribute is not allowed to be "
2951					"non-resident.");
2952		else
2953			ntfs_debug("Attribute not defined on the NTFS "
2954					"volume!");
2955		return err;
2956	}
2957	/*
2958	 * FIXME: Compressed and encrypted attributes are not supported when
2959	 * writing and we should never have gotten here for them.
2960	 */
2961	if (NInoCompressed(ni))
2962		panic("%s(): NInoCompressed(ni)\n", __FUNCTION__);
2963	if (NInoEncrypted(ni))
2964		panic("%s(): NInoEncrypted(ni)\n", __FUNCTION__);
2965	/*
2966	 * The size needs to be aligned to a cluster boundary for allocation
2967	 * purposes.
2968	 */
2969	lck_spin_lock(&ni->size_lock);
2970	data_size = ni->data_size;
2971	lck_spin_unlock(&ni->size_lock);
2972	new_size = (data_size + vol->cluster_size_mask) &
2973			~vol->cluster_size_mask;
2974	lck_rw_lock_exclusive(&ni->rl.lock);
2975	if (ni->rl.elements)
2976		panic("%s(): ni->rl.elements\n", __FUNCTION__);
2977	upl = NULL;
2978	if (new_size > 0) {
2979		/* Start by allocating clusters to hold the attribute value. */
2980		err = ntfs_cluster_alloc(vol, 0, new_size >>
2981				vol->cluster_size_shift, -1, DATA_ZONE, TRUE,
2982				&ni->rl);
2983		if (err) {
2984			if (err != ENOSPC)
2985				ntfs_error(vol->mp, "Failed to allocate "
2986						"cluster%s, error code %d.",
2987						(new_size >>
2988						vol->cluster_size_shift) > 1 ?
2989						"s" : "", err);
2990			goto unl_err;
2991		}
2992		/*
2993		 * Will need the page later and since the page lock nests
2994		 * outside all ntfs locks, we need to get the page now.
2995		 */
2996		err = ntfs_page_grab(ni, 0, &upl, &pl, &kaddr, TRUE);
2997		if (err)
2998			goto page_err;
2999	}
3000	/* Determine the size of the mapping pairs array. */
3001	err = ntfs_get_size_for_mapping_pairs(vol,
3002			ni->rl.elements ? ni->rl.rl : NULL, 0, -1, &mp_size);
3003	if (err) {
3004		ntfs_error(vol->mp, "Failed to get size for mapping pairs "
3005				"array (error %d).", err);
3006		goto rl_err;
3007	}
3008	base_ni = ni;
3009	if (NInoAttr(ni))
3010		base_ni = ni->base_ni;
3011	err = ntfs_mft_record_map(base_ni, &base_m);
3012	if (err)
3013		goto rl_err;
3014	ntfs_attr_search_ctx_init(&ctx, base_ni, base_m);
3015	err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, 0, NULL, 0,
3016			&ctx);
3017	if (err) {
3018		if (err == ENOENT)
3019			err = EIO;
3020		goto unm_err;
3021	}
3022	m = ctx.m;
3023	a = ctx.a;
3024	if (NInoNonResident(ni))
3025		panic("%s(): NINonResident(ni)\n", __FUNCTION__);
3026	if (a->non_resident)
3027		panic("%s(): a->non_resident\n", __FUNCTION__);
3028	/* Calculate new offsets for the name and the mapping pairs array. */
3029	name_ofs = offsetof(ATTR_REC, compressed_size);
3030	if (NInoSparse(ni) || NInoCompressed(ni))
3031		name_ofs += sizeof(a->compressed_size);
3032	mp_ofs = (name_ofs + a->name_length * sizeof(ntfschar) + 7) & ~7;
3033	/*
3034	 * Determine the size of the resident part of the now non-resident
3035	 * attribute record.
3036	 */
3037	arec_size = (mp_ofs + mp_size + 7) & ~7;
3038	/*
3039	 * If the page is not uptodate bring it uptodate by copying from the
3040	 * attribute value.
3041	 */
3042	attr_size = le32_to_cpu(a->value_length);
3043	if (attr_size != data_size)
3044		panic("%s(): attr_size != data_size\n", __FUNCTION__);
3045	if (upl && !upl_valid_page(pl, 0)) {
3046		memcpy(kaddr, (u8*)a + le16_to_cpu(a->value_offset),
3047				attr_size);
3048		bzero(kaddr + attr_size, PAGE_SIZE - attr_size);
3049	}
3050	/* Backup the attribute flags. */
3051	old_res_attr_flags = a->resident_flags;
3052retry_resize:
3053	/* Resize the resident part of the attribute record. */
3054	err = ntfs_attr_record_resize(m, a, arec_size);
3055	if (!err) {
3056		al_ofs = 0;
3057		goto do_switch;
3058	}
3059	if (err != ENOSPC)
3060		panic("%s(): err != ENOSPC\n", __FUNCTION__);
3061	/*
3062	 * The attribute record size required cannot be larger than the amount
3063	 * of space in an mft record.
3064	 */
3065	if (arec_size > le32_to_cpu(m->bytes_allocated) -
3066			le16_to_cpu(m->attrs_offset))
3067		panic("%s(): arec_size > le32_to_cpu(m->bytes_allocated) - "
3068				"le16_to_cpu(m->attrs_offset)\n",
3069				__FUNCTION__);
3070	/*
3071	 * To make space in the mft record we would like to try to make other
3072	 * attributes non-resident if that would save space.
3073	 *
3074	 * FIXME: We cannot do this at present unless the attribute is the
3075	 * attribute being resized as there could be an ntfs inode matching
3076	 * this attribute in memory and it would become out of date with its
3077	 * metadata if we touch its attribute record.
3078	 *
3079	 * FIXME: We do not need to do this if this is the attribute being
3080	 * resized as we already tried to make the attribute non-resident and
3081	 * it did not work or we would never have gotten here in the first
3082	 * place.
3083	 *
3084	 * Thus we have to either move other attributes to extent mft records
3085	 * thus making more space in the base mft record or we have to move the
3086	 * attribute being resized to an extent mft record thus giving it more
3087	 * space.  In any case we need to have an attribute list attribute so
3088	 * start by adding it if it does not yet exist.
3089	 *
3090	 * If the addition succeeds but the remapping of the extent mft record
3091	 * fails (i.e. the !err && IS_ERR(ctx.m) case below) we bail out
3092	 * without trying to remove the attribute list attribute because to do
3093	 * so we would have to map the extent mft record in order to move the
3094	 * attribute(s) in it back into the base mft record and we know the
3095	 * mapping just failed so it is unlikely to succeed now.  In any case
3096	 * the metadata is consistent we just cannot make further progress.
3097	 */
3098	if (!NInoAttrList(base_ni)) {
3099		err = ntfs_attr_list_add(base_ni, base_m, &ctx);
3100		if (err || ctx.is_error) {
3101			if (!err)
3102				err = ctx.error;
3103			ntfs_error(vol->mp, "Failed to %s mft_no 0x%llx (error "
3104					"%d).", ctx.is_error ?
3105					"remap extent mft record of" :
3106					"add attribute list attribute to",
3107					(unsigned long long)base_ni->mft_no,
3108					err);
3109			goto unm_err;
3110		}
3111		/*
3112		 * The attribute location will have changed so update it from
3113		 * the search context.
3114		 */
3115		m = ctx.m;
3116		a = ctx.a;
3117		/*
3118		 * Check that the logic in ntfs_attr_list_add() has not changed
3119		 * without the code here being updated.  At present it will
3120		 * never make resident attributes non-resident.
3121		 */
3122		if (a->non_resident)
3123			panic("%s(): a->non_resident\n", __FUNCTION__);
3124		/*
3125		 * We now have an attribute list attribute.  This may have
3126		 * caused the attribute to be made non-resident to be moved out
3127		 * to an extent mft record in which case there would now be
3128		 * enough space to resize the attribute record.
3129		 *
3130		 * Alternatively some other large attribute may have been moved
3131		 * out to an extent mft record thus generating enough space in
3132		 * the base mft record for the attribute to be made
3133		 * non-resident.
3134		 *
3135		 * In either case we simply want to retry the resize.
3136		 */
3137		goto retry_resize;
3138	}
3139	/*
3140	 * We now know we have an attribute list attribute and that we still do
3141	 * not have enough space to make the attribute non-resident.
3142	 *
3143	 * As discussed above we need to start moving attributes out of the
3144	 * base mft record to make enough space.
3145	 *
3146	 * Note that if the attribute to be made non-resident had been moved
3147	 * out of the base mft record we would then have had enough space for
3148	 * the resize thus we would never have gotten here.  We detect this
3149	 * case and BUG() in case we change the logic in ntfs_attr_list_add()
3150	 * some day to remind us to update the code here to match.
3151	 */
3152	if (ctx.ni != base_ni)
3153		panic("%s(): ctx.ni != base_ni\n", __FUNCTION__);
3154	/*
3155	 * If this is the only attribute record in the mft record we cannot
3156	 * gain anything by moving it or anything else.  This really cannot
3157	 * happen as we ensure above that the attribute is in the base mft
3158	 * record.
3159	 */
3160	if (ntfs_attr_record_is_only_one(m, a))
3161		panic("%s(): ntfs_attr_record_is_only_one(m, a)\n",
3162				__FUNCTION__);
3163	/*
3164	 * If the attribute to be resized is the standard information, index
3165	 * root, or unnamed $DATA attribute try to move other attributes out
3166	 * into extent mft records.  If none of these then move the attribute
3167	 * to be resized out to an extent mft record.
3168	 */
3169	type = ni->type;
3170	if (type != AT_STANDARD_INFORMATION && type != AT_INDEX_ROOT &&
3171			(type != AT_DATA || ni->name_len)) {
3172		lck_rw_lock_shared(&base_ni->attr_list_rl.lock);
3173		err = ntfs_attr_record_move(&ctx);
3174		lck_rw_unlock_shared(&base_ni->attr_list_rl.lock);
3175		if (!err) {
3176			/* The attribute has moved so update our variables. */
3177			m = ctx.m;
3178			a = ctx.a;
3179			/* The resize will now succeed. */
3180			goto retry_resize;
3181		}
3182		ntfs_error(vol->mp, "Failed to move attribute type 0x%x out "
3183				"of base mft_no 0x%llx into an extent mft "
3184				"record (error %d).", le32_to_cpu(type),
3185				base_ni->mft_no, err);
3186		goto unm_err;
3187	}
3188	type = AT_UNUSED;
3189	/*
3190	 * The number of free bytes needed in the mft record so the resize can
3191	 * succeed.
3192	 */
3193	bytes_needed = arec_size - le32_to_cpu(a->length);
3194	/*
3195	 * The MFT reference of the mft record in which the attribute to be
3196	 * made non-resident is located.
3197	 */
3198	mref = MK_LE_MREF(base_ni->mft_no, base_ni->seq_no);
3199	al_ofs = base_ni->attr_list_size;
3200	al_end = base_ni->attr_list + al_ofs;
3201next_pass:
3202	ntfs_attr_search_ctx_init(&actx, base_ni, base_m);
3203	actx.is_iteration = 1;
3204	do {
3205		ntfschar *a_name;
3206		ATTR_LIST_ENTRY *al_entry;
3207
3208		/* Get the next attribute in the mft record. */
3209		err = ntfs_attr_find_in_mft_record(type, NULL, 0, NULL, 0,
3210				&actx);
3211		if (err) {
3212			if (err == ENOENT) {
3213				/*
3214				 * If we have more passes to go do the next
3215				 * pass which will try harder to move things
3216				 * out of the way.
3217				 */
3218				if (type == AT_UNUSED) {
3219					type = AT_DATA;
3220					goto next_pass;
3221				}
3222				/*
3223				 * TODO: Need to get these cases triggered and
3224				 * then need to run chkdsk to check for
3225				 * validity of moving these attributes out of
3226				 * the base mft record.
3227				 */
3228				if (type == AT_DATA) {
3229					type = AT_INDEX_ROOT;
3230					goto next_pass;
3231				}
3232				if (type == AT_INDEX_ROOT) {
3233					type = AT_STANDARD_INFORMATION;
3234					goto next_pass;
3235				}
3236				/*
3237				 * We can only get here when the attribute to
3238				 * be made non-resident is the standard
3239				 * information attribute and for some reason it
3240				 * does not exist in the mft record.  That can
3241				 * only happen with some sort of corruption or
3242				 * due to a bug.
3243				 */
3244				ntfs_error(vol->mp, "Standard information "
3245						"attribute is missing from "
3246						"mft_no 0x%llx.  Run chkdsk.",
3247						(unsigned long long)
3248						base_ni->mft_no);
3249				err = EIO;
3250				NVolSetErrors(vol);
3251				goto unm_err;
3252			}
3253			ntfs_error(vol->mp, "Failed to iterate over attribute "
3254					"records in base mft record 0x%llx "
3255					"(error %d).",
3256					(unsigned long long)base_ni->mft_no,
3257					err);
3258			goto unm_err;
3259		}
3260		a = actx.a;
3261		if (type == AT_UNUSED) {
3262			/*
3263			 * Skip the attribute list attribute itself as that is
3264			 * not represented inside itself and we cannot move it
3265			 * out anyway.
3266			 *
3267			 * Also, do not touch standard information, index root,
3268			 * and unnamed $DATA attributes.  They will be moved
3269			 * out to extent mft records in later passes if really
3270			 * necessary.
3271			 */
3272			if (a->type == AT_ATTRIBUTE_LIST ||
3273					a->type == AT_STANDARD_INFORMATION ||
3274					a->type == AT_INDEX_ROOT ||
3275					(a->type == AT_DATA &&
3276					!a->name_length))
3277				continue;
3278		}
3279		/*
3280		 * Move the attribute out to an extent mft record and update
3281		 * its attribute list entry.
3282		 *
3283		 * But first find the attribute list entry matching the
3284		 * attribute record so it can be updated.
3285		 */
3286		a_name = (ntfschar*)((u8*)a + le16_to_cpu(a->name_offset));
3287		al_entry = (ATTR_LIST_ENTRY*)base_ni->attr_list;
3288		do {
3289			/*
3290			 * The attribute must be present in the attribute list
3291			 * attribute or something is corrupt.
3292			 */
3293			if ((u8*)al_entry >= al_end || !al_entry->length) {
3294				ntfs_error(vol->mp, "Attribute type 0x%x not "
3295						"found in attribute list "
3296						"attribute of base mft record "
3297						"0x%llx.  Run chkdsk.",
3298						(unsigned)le32_to_cpu(a->type),
3299						(unsigned long long)
3300						base_ni->mft_no);
3301				NVolSetErrors(vol);
3302				err = EIO;
3303				goto unm_err;
3304			}
3305			if (al_entry->mft_reference == mref &&
3306					al_entry->instance == a->instance) {
3307				/*
3308				 * We found the entry, stop looking but first
3309				 * perform a quick sanity check that we really
3310				 * do have the correct attribute record.
3311				 */
3312				if (al_entry->type == a->type &&
3313						ntfs_are_names_equal(
3314						(ntfschar*)((u8*)al_entry +
3315						al_entry->name_offset),
3316						al_entry->name_length, a_name,
3317						a->name_length, TRUE,
3318						vol->upcase, vol->upcase_len))
3319					break;
3320				ntfs_error(vol->mp, "Found corrupt attribute "
3321						"list attribute when looking "
3322						"for attribute type 0x%x in "
3323						"attribute list attribute of "
3324						"base mft record 0x%llx.  Run "
3325						"chkdsk.",
3326						(unsigned)le32_to_cpu(a->type),
3327						(unsigned long long)
3328						base_ni->mft_no);
3329				NVolSetErrors(vol);
3330				err = EIO;
3331				goto unm_err;
3332			}
3333			/* Go to the next attribute list entry. */
3334			al_entry = (ATTR_LIST_ENTRY*)((u8*)al_entry +
3335					le16_to_cpu(al_entry->length));
3336		} while (1);
3337		/* Finally, move the attribute to an extent record. */
3338		err = ntfs_attr_record_move_for_attr_list_attribute(&actx,
3339				al_entry, &ctx, NULL);
3340		if (err) {
3341			ntfs_error(vol->mp, "Failed to move attribute type "
3342					"0x%x out of base mft record 0x%llx "
3343					"and into an extent mft record (error "
3344					"%d).  Run chkdsk.",
3345					(unsigned)le32_to_cpu(a->type),
3346					(unsigned long long)base_ni->mft_no,
3347					err);
3348			NVolSetErrors(vol);
3349			goto unm_err;
3350		}
3351		/*
3352		 * If the modified attribute list entry is before the current
3353		 * start of attribute list modification we need to sync this
3354		 * entry as well.  For simplicity we just set @al_ofs to the
3355		 * new value thus syncing everything starting at that offset.
3356		 */
3357		if ((u8*)al_entry - base_ni->attr_list < (long)al_ofs) {
3358			al_ofs = (u8*)al_entry - base_ni->attr_list;
3359			al_dirty = TRUE;
3360		}
3361		/*
3362		 * If we moved the attribute to be made non-resident we will
3363		 * now have enough space so retry the resize.
3364		 */
3365		if (ctx.ni != base_ni) {
3366			/*
3367			 * @ctx is not in the base mft record, map the extent
3368			 * inode it is in and if it is mapped at a different
3369			 * address than before update the pointers in @ctx.
3370			 */
3371retry_map:
3372			err = ntfs_mft_record_map(ctx.ni, &m);
3373			if (err) {
3374				/*
3375				 * Something bad has happened.  If out of
3376				 * memory retry till it succeeds.  Any other
3377				 * errors are fatal and we have to abort.
3378				 *
3379				 * We do not need to undo anything as the
3380				 * metadata is self-consistent except for the
3381				 * attribute list attribute which we need to
3382				 * write out.
3383				 */
3384				if (err == ENOMEM) {
3385					(void)thread_block(
3386							THREAD_CONTINUE_NULL);
3387					goto retry_map;
3388				}
3389				ctx.ni = base_ni;
3390				goto unm_err;
3391			}
3392			if (ctx.m != m) {
3393				ctx.a = (ATTR_RECORD*)((u8*)m +
3394						((u8*)ctx.a - (u8*)ctx.m));
3395				ctx.m = m;
3396			}
3397			a = ctx.a;
3398			goto retry_resize;
3399		}
3400		/* If we now have enough space retry the resize. */
3401		if (bytes_needed > le32_to_cpu(m->bytes_allocated) -
3402				le32_to_cpu(m->bytes_in_use)) {
3403			a = ctx.a;
3404			goto retry_resize;
3405		}
3406	} while (1);
3407do_switch:
3408	/*
3409	 * Convert the resident part of the attribute record to describe a
3410	 * non-resident attribute.
3411	 */
3412	a->non_resident = 1;
3413	/* Move the attribute name if it exists and update the offset. */
3414	if (a->name_length)
3415		memmove((u8*)a + name_ofs,
3416				(u8*)a + le16_to_cpu(a->name_offset),
3417				a->name_length * sizeof(ntfschar));
3418	a->name_offset = cpu_to_le16(name_ofs);
3419	/* Setup the fields specific to non-resident attributes. */
3420	a->lowest_vcn = 0;
3421	a->highest_vcn = cpu_to_sle64((new_size - 1) >>
3422			vol->cluster_size_shift);
3423	a->mapping_pairs_offset = cpu_to_le16(mp_ofs);
3424	bzero(&a->reservedN, sizeof(a->reservedN));
3425	a->allocated_size = cpu_to_sle64(new_size);
3426	a->data_size = a->initialized_size = cpu_to_sle64(attr_size);
3427	a->compression_unit = 0;
3428	if (NInoSparse(ni) || NInoCompressed(ni)) {
3429		if (NInoCompressed(ni) || vol->major_ver <= 1)
3430			a->compression_unit = NTFS_COMPRESSION_UNIT;
3431		a->compressed_size = a->allocated_size;
3432	}
3433	/*
3434	 * Generate the mapping pairs array into the attribute record.
3435	 *
3436	 * This cannot fail as we have already checked the size we need to
3437	 * build the mapping pairs array.
3438	 */
3439	err = ntfs_mapping_pairs_build(vol, (s8*)a + mp_ofs, arec_size - mp_ofs,
3440			ni->rl.elements ? ni->rl.rl : NULL, 0, -1, NULL);
3441	if (err)
3442		panic("%s(): err\n", __FUNCTION__);
3443	/* Setup the in-memory attribute structure to be non-resident. */
3444	lck_spin_lock(&ni->size_lock);
3445	ni->allocated_size = new_size;
3446	if (NInoSparse(ni) || NInoCompressed(ni)) {
3447		ni->compressed_size = ni->allocated_size;
3448		if (a->compression_unit) {
3449			ni->compression_block_size = 1U <<
3450					(a->compression_unit +
3451					vol->cluster_size_shift);
3452			ni->compression_block_size_shift =
3453					ffs(ni->compression_block_size) - 1;
3454			ni->compression_block_clusters = 1U <<
3455					a->compression_unit;
3456		} else {
3457			ni->compression_block_size = 0;
3458			ni->compression_block_size_shift = 0;
3459			ni->compression_block_clusters = 0;
3460		}
3461	}
3462	lck_spin_unlock(&ni->size_lock);
3463	/*
3464	 * This needs to be last since we are not allowed to fail once we flip
3465	 * this switch.
3466	 */
3467	NInoSetNonResident(ni);
3468	/* Mark the mft record dirty, so it gets written back. */
3469	NInoSetMrecNeedsDirtying(ctx.ni);
3470	if (ctx.ni != base_ni)
3471		ntfs_extent_mft_record_unmap(ctx.ni);
3472	if (al_dirty) {
3473		ntfs_attr_search_ctx_reinit(&actx);
3474		err = ntfs_attr_list_sync(base_ni, al_ofs, &actx);
3475		if (err) {
3476			ntfs_error(vol->mp, "Failed to write attribute list "
3477					"attribute of mft_no 0x%llx (error "
3478					"%d).  Leaving corrupt metadata.  Run "
3479					"chkdsk.",
3480					(unsigned long long)base_ni->mft_no,
3481					err);
3482			NVolSetErrors(vol);
3483		}
3484		/* Mark the base mft record dirty, so it gets written back. */
3485		NInoSetMrecNeedsDirtying(base_ni);
3486	}
3487	ntfs_mft_record_unmap(base_ni);
3488	lck_rw_unlock_exclusive(&ni->rl.lock);
3489	/*
3490	 * We have modified the allocated size.  If the ntfs inode is the base
3491	 * inode, cause the sizes to be written to all the directory index
3492	 * entries pointing to the base inode when the inode is written to
3493	 * disk.  Do not do this for directories as they have both sizes set to
3494	 * zero in their index entries.
3495	 */
3496	if (ni == base_ni && !S_ISDIR(ni->mode))
3497		NInoSetDirtySizes(ni);
3498	if (upl)
3499		ntfs_page_unmap(ni, upl, pl, TRUE);
3500	ntfs_debug("Done.");
3501	return 0;
3502unm_err:
3503	if (ctx.ni != base_ni) {
3504		NInoSetMrecNeedsDirtying(ctx.ni);
3505		ntfs_extent_mft_record_unmap(ctx.ni);
3506	}
3507	if (al_dirty) {
3508		ntfs_attr_search_ctx_reinit(&actx);
3509		err2 = ntfs_attr_list_sync(base_ni, al_ofs, &actx);
3510		if (err2) {
3511			ntfs_error(vol->mp, "Failed to write attribute list "
3512					"attribute in error code path (error "
3513					"%d).  Leaving corrupt metadata.  Run "
3514					"chkdsk.", err2);
3515			NVolSetErrors(vol);
3516		}
3517	}
3518	NInoSetMrecNeedsDirtying(base_ni);
3519	ntfs_mft_record_unmap(base_ni);
3520rl_err:
3521	if (upl) {
3522		/*
3523		 * If the page was valid release it back to the VM.  If it was
3524		 * not valid throw it away altogether.
3525		 * TODO: We could wrap this up in a ntfs_page_unmap_ext()
3526		 * function which takes an extra parameter to specify whether
3527		 * to keep the page or to dump it if it is invalid...
3528		 */
3529		if (upl_valid_page(pl, 0))
3530			ntfs_page_unmap(ni, upl, pl, FALSE);
3531		else
3532			ntfs_page_dump(ni, upl, pl);
3533	}
3534page_err:
3535	if (ni->rl.elements > 0) {
3536		err2 = ntfs_cluster_free_from_rl(vol, ni->rl.rl, 0, -1, NULL);
3537		if (err2) {
3538			ntfs_error(vol->mp, "Failed to undo cluster "
3539					"allocation (error %d).  Run chkdsk "
3540					"to recover the lost space.", err2);
3541			NVolSetErrors(vol);
3542		}
3543		err2 = ntfs_rl_truncate_nolock(vol, &ni->rl, 0);
3544		if (err2)
3545			panic("%s(): err2\n", __FUNCTION__);
3546	}
3547unl_err:
3548	lck_rw_unlock_exclusive(&ni->rl.lock);
3549	if (err == EINVAL)
3550		err = EIO;
3551	return err;
3552}
3553
3554/**
3555 * ntfs_attr_record_move_for_attr_list_attribute - move an attribute record
3556 * @al_ctx:		search context describing the attribute to move
3557 * @al_entry:		attribute list entry of the attribute to move
3558 * @ctx:		search context of attribute being resized or NULL
3559 * @remap_needed:	[OUT] pointer to remap_needed variable or NULL
3560 *
3561 * Move the attribute described by the attribute search context @al_ctx and
3562 * @al_entry from its mft record to a newly allocated extent mft record and
3563 * update @ctx to reflect this fact (if @ctx is not NULL, otherwise it is
3564 * ignored).
3565 *
3566 * If @ctx is present and is the attribute moved out then set *@remap_needed to
3567 * true.  If the caller is not interested in this then @remap_needed can be set
3568 * to NULL in which case it is ignored.
3569 *
 * Return 0 on success and errno on error.
3571 */
errno_t ntfs_attr_record_move_for_attr_list_attribute(
		ntfs_attr_search_ctx *al_ctx, ATTR_LIST_ENTRY *al_entry,
		ntfs_attr_search_ctx *ctx, BOOL *remap_needed)
{
	ntfs_inode *base_ni, *ni;
	MFT_RECORD *m;
	ATTR_RECORD *a;
	unsigned attr_len;
	errno_t err;

	/* @al_ctx describes an attribute in the base mft record. */
	base_ni = al_ctx->ni;
	ntfs_debug("Entering for mft_no 0x%llx, attribute type 0x%x.",
			(unsigned long long)base_ni->mft_no,
			(unsigned)le32_to_cpu(al_entry->type));
	/*
	 * Allocate a new extent mft record, attach it to the base ntfs inode
	 * and set up the search context to point to it.
	 *
	 * FIXME: We should go through all existing extent mft records which
	 * will all be attached to @base_ni->extent_nis and for each of them we
	 * should map the extent mft record, check for free space and if we
	 * find enough free space for the attribute being moved we should move
	 * the attribute there instead of allocating a new extent mft record.
	 */
	err = ntfs_mft_record_alloc(base_ni->vol, NULL, NULL, base_ni, &ni, &m,
			&a);
	if (err) {
		/* Nothing has been modified yet, safe to just return. */
		ntfs_error(base_ni->vol->mp, "Failed to move attribute to a "
				"new mft record because allocation of the new "
				"mft record failed (error %d).", err);
		return err;
	}
	/* Size in bytes of the attribute record being moved. */
	attr_len = le32_to_cpu(al_ctx->a->length);
	/* Make space for the attribute extent and copy it into place. */
	err = ntfs_attr_record_make_space(m, a, attr_len);
	/*
	 * This cannot fail as the new mft record must have enough space to
	 * hold the attribute record given it fitted inside the old mft record.
	 */
	if (err)
		panic("%s(): err\n", __FUNCTION__);
	memcpy(a, al_ctx->a, attr_len);
	/* Delete the attribute record from the base mft record. */
	ntfs_attr_record_delete_internal(al_ctx->m, al_ctx->a);
	/*
	 * We moved the attribute out of the mft record thus @al_ctx->a now
	 * points to the next attribute.  Since the caller will want to look at
	 * that next attribute we set @al_ctx->is_first so that the next call
	 * to ntfs_attr_find_in_mft_record() will return the currently pointed
	 * at attribute.
	 */
	al_ctx->is_first = 1;
	/*
	 * Change the moved attribute record to reflect the new sequence number
	 * and the current attribute list attribute entry to reflect the new
	 * mft record reference and sequence number.
	 */
	al_entry->mft_reference = MK_LE_MREF(ni->mft_no, ni->seq_no);
	/* @next_attr_instance is already little endian, no conversion needed. */
	a->instance = al_entry->instance = m->next_attr_instance;
	/*
	 * Increment the next attribute instance number in the mft record as we
	 * consumed the old one.
	 */
	m->next_attr_instance = cpu_to_le16(
			(le16_to_cpu(m->next_attr_instance) + 1) & 0xffff);
	/*
	 * Ensure the changes make it to disk later and unmap the mft record as
	 * we do not need it any more right now.
	 */
	NInoSetMrecNeedsDirtying(ni);
	ntfs_extent_mft_record_unmap(ni);
	/*
	 * Update @ctx if the attribute it describes is still in the base mft
	 * record and the attribute that was deleted was either in front of the
	 * attribute described by @ctx or it was the attribute described by
	 * @ctx.
	 *
	 * FIXME: When we fix the above FIXME and we thus start to place
	 * multiple attributes in each extent mft record we will need to update
	 * @ctx in a more complex fashion here.
	 */
	if (ctx && ctx->ni == base_ni) {
		/*
		 * The delete shifted every following attribute record down by
		 * @attr_len bytes, so rebase @ctx->a accordingly.
		 */
		if ((u8*)al_ctx->a < (u8*)ctx->a)
			ctx->a = (ATTR_RECORD*)((u8*)ctx->a - attr_len);
		else if (al_ctx->a == ctx->a) {
			/* @ctx described the moved attribute itself. */
			ctx->m = m;
			ctx->a = a;
			ctx->ni = ni;
			if (remap_needed)
				*remap_needed = TRUE;
		}
	}
	ntfs_debug("Done.");
	return 0;
}
3667
3668/**
3669 * ntfs_attr_record_move - move an attribute record to another mft record
3670 * @ctx:	attribute search context describing the attribute to move
3671 *
3672 * Move the attribute described by the attribute search context @ctx from its
3673 * mft record to a newly allocated extent mft record.  On successful return
3674 * @ctx is setup to point to the moved attribute.
3675 *
 * Return 0 on success and errno on error.  On error, the
3677 * attribute search context is invalid and must be either reinitialized or
3678 * released.
3679 *
3680 * NOTE: This function expects that an attribute list attribute is already
3681 * present.
3682 *
3683 * Locking: Caller must hold lock on attribute list attribute runlist, i.e.
3684 *	    @ctx->base_ni->attr_list_rl.lock.
3685 */
errno_t ntfs_attr_record_move(ntfs_attr_search_ctx *ctx)
{
	ntfs_inode *base_ni, *ni;
	MFT_RECORD *m;
	ATTR_RECORD *a;
	u8 *a_copy;
	unsigned attr_len;
	errno_t err, err2;
	ntfs_attr_search_ctx al_ctx;
	static const char es[] = "  Leaving inconsistent metadata.  Unmount "
			"and run chkdsk.";

	base_ni = ctx->base_ni;
	/* An attribute list attribute must already exist (see header doc). */
	if (!base_ni || !NInoAttrList(base_ni))
		panic("%s(): !base_ni || !NInoAttrList(base_ni)\n",
				__FUNCTION__);
	ni = ctx->ni;
	m = ctx->m;
	a = ctx->a;
	ntfs_debug("Entering for base mft_no 0x%llx, extent mft_no 0x%llx, "
			"attribute type 0x%x.",
			(unsigned long long)base_ni->mft_no,
			(unsigned long long)ni->mft_no,
			(unsigned)le32_to_cpu(a->type));
	attr_len = le32_to_cpu(a->length);
	/* Allocate a temporary buffer to hold the attribute to be moved. */
	a_copy = OSMalloc(attr_len, ntfs_malloc_tag);
	if (!a_copy) {
		ntfs_error(ni->vol->mp, "Not enough memory to allocate "
				"temporary attribute buffer.");
		return ENOMEM;
	}
	/*
	 * Copy the attribute to the temporary buffer and delete it from its
	 * original mft record.
	 */
	memcpy(a_copy, a, attr_len);
	ntfs_attr_record_delete_internal(m, a);
	/*
	 * This function will never be called if the attribute is the only
	 * attribute in the mft record as this would not gain anything thus
	 * report a bug in this case.
	 */
	if (((ATTR_RECORD*)((u8*)m + le16_to_cpu(m->attrs_offset)))->type ==
			AT_END)
		panic("%s(): Is only attribute in mft record!\n", __FUNCTION__);
	/* Ensure the changes make it to disk later. */
	NInoSetMrecNeedsDirtying(ni);
	/*
	 * We have finished with this mft record thus if it is an extent mft
	 * record we release it.  We do this by hand as we want to keep the
	 * current attribute list attribute entry.
	 */
	if (ni != base_ni)
		ntfs_extent_mft_record_unmap(ni);
	/*
	 * Find the attribute list attribute in the base mft record.  Doing
	 * this now hugely simplifies error handling.
	 */
	ntfs_attr_search_ctx_init(&al_ctx, base_ni, ctx->base_m);
	err = ntfs_attr_find_in_mft_record(AT_ATTRIBUTE_LIST, AT_UNNAMED, 0,
			NULL, 0, &al_ctx);
	if (err) {
		ntfs_error(base_ni->vol->mp, "Failed to move attribute to a "
				"new mft record because looking up the "
				"attribute list attribute in the base inode "
				"failed (error %d).", err);
		goto undo_delete;
	}
	/*
	 * Allocate a new extent mft record, attach it to the base ntfs inode
	 * and set up the search context to point to it.
	 */
	err = ntfs_mft_record_alloc(base_ni->vol, NULL, NULL, base_ni, &ni, &m,
			&a);
	if (err) {
		ntfs_error(base_ni->vol->mp, "Failed to move attribute to a "
				"new mft record because allocation of the new "
				"mft record failed (error %d).", err);
		goto undo_delete;
	}
	/* Point the caller's search context at the new extent mft record. */
	ctx->ni = ni;
	ctx->m = m;
	ctx->a = a;
	/* Make space for the attribute extent and copy it into place. */
	err = ntfs_attr_record_make_space(m, a, attr_len);
	/*
	 * This cannot fail as the new mft record must have enough space to
	 * hold the attribute record given it fitted inside the old mft record.
	 */
	if (err)
		panic("%s(): err (ntfs_attr_record_make_space())\n",
				__FUNCTION__);
	memcpy(a, a_copy, attr_len);
	/* We do not need the temporary buffer any more. */
	OSFree(a_copy, attr_len, ntfs_malloc_tag);
	/*
	 * Change the moved attribute record to reflect the new sequence number
	 * and the current attribute list attribute entry to reflect the new
	 * mft record reference and sequence number.
	 */
	ctx->al_entry->mft_reference = MK_LE_MREF(ni->mft_no, ni->seq_no);
	/* @next_attr_instance is already little endian. */
	a->instance = ctx->al_entry->instance = m->next_attr_instance;
	/*
	 * Increment the next attribute instance number in the mft record as we
	 * consumed the old one.
	 */
	m->next_attr_instance = cpu_to_le16(
			(le16_to_cpu(m->next_attr_instance) + 1) & 0xffff);
	/* Ensure the changes make it to disk later. */
	NInoSetMrecNeedsDirtying(ni);
	/*
	 * Finally, sync the modified attribute list attribute from its in-
	 * memory buffer to the on-disk metadata.
	 */
	a = al_ctx.a;
	if (a->non_resident) {
		unsigned ofs;

		/*
		 * Non-resident attribute list: write back only the modified
		 * entry via its runlist.
		 */
		ofs = (u8*)ctx->al_entry - base_ni->attr_list;
		err = ntfs_rl_write(base_ni->vol, base_ni->attr_list,
				base_ni->attr_list_size,
				&base_ni->attr_list_rl, ofs,
				le16_to_cpu(ctx->al_entry->length));
		if (err) {
			/*
			 * NOTE(review): @a_copy has already been freed above
			 * so returning here does not leak.  Unlike other
			 * corrupt-metadata paths in this file this one does
			 * not call NVolSetErrors() — confirm whether it
			 * should.
			 */
			ntfs_error(base_ni->vol->mp, "Failed to update "
					"on-disk attribute list attribute of "
					"mft_no 0x%llx (error %d).%s",
					(unsigned long long)base_ni->mft_no,
					err, es);
			return err;
		}
	} else {
		ATTR_LIST_ENTRY *al_entry;

		/*
		 * Resident attribute list: patch the entry inside the base
		 * mft record's copy of the attribute value directly.
		 */
		al_entry = (ATTR_LIST_ENTRY*)((u8*)a +
				le16_to_cpu(a->value_offset) +
				((u8*)ctx->al_entry - base_ni->attr_list));
		al_entry->mft_reference = ctx->al_entry->mft_reference;
		al_entry->instance = ctx->al_entry->instance;
		/* Ensure the changes make it to disk later. */
		NInoSetMrecNeedsDirtying(base_ni);
	}
	ntfs_debug("Done.");
	return 0;
undo_delete:
	/*
	 * Map the old mft record again (if we unmapped it) and re-insert the
	 * deleted attribute record in its old place.
	 */
	ni = ctx->ni;
	if (ni != base_ni) {
		err2 = ntfs_mft_record_map(ni, &m);
		if (err2) {
			/*
			 * Make it safe to release the attribute search
			 * context.
			 */
			ctx->ni = base_ni;
			ntfs_error(base_ni->vol->mp, "Failed to restore "
					"attribute in mft_no 0x%llx after "
					"allocation failure (error %d).%s",
					(unsigned long long)base_ni->mft_no,
					err2, es);
			NVolSetErrors(base_ni->vol);
			goto err;
		}
		/*
		 * If the extent mft record was mapped into a different
		 * address, adjust the mft record and attribute record pointers
		 * in the search context.
		 */
		if (m != ctx->m) {
			ctx->a = (ATTR_RECORD*)((u8*)m + ((u8*)ctx->a -
					(u8*)ctx->m));
			ctx->m = m;
		}
	}
	/*
	 * Creating space for the attribute in its old mft record cannot fail
	 * because we only just deleted the attribute from the mft record thus
	 * there must be enough space in it.
	 */
	err2 = ntfs_attr_record_make_space(ctx->m, ctx->a, attr_len);
	if (err2)
		panic("%s(): err2\n", __FUNCTION__);
	memcpy(ctx->a, a_copy, attr_len);
	/* Ensure the changes make it to disk later. */
	NInoSetMrecNeedsDirtying(ni);
err:
	OSFree(a_copy, attr_len, ntfs_malloc_tag);
	return err;
}
3879
3880/**
3881 * ntfs_attr_set_initialized_size - extend the initialized size of an attribute
3882 * @ni:			ntfs inode whose sizes to extend
3883 * @new_init_size:	the new initialized size to set @ni to or -1
3884 *
3885 * If @new_init_size is >= 0, set the initialized size in the ntfs inode @ni
3886 * to @new_init_size.  Otherwise ignore @new_init_size and do not change the
3887 * initialized size in @ni.
3888 *
3889 * If the new initialized size is bigger than the data size of the ntfs inode,
3890 * update the data size to equal the initialized size.  In this case also set
3891 * the size in the ubc.
3892 *
3893 * Then, set the data and initialized sizes in the attribute record of the
3894 * attribute specified by the ntfs inode @ni to the values in the ntfs inode
3895 * @ni.
3896 *
3897 * Thus, if @new_init_size is >= 0, both @ni and its underlying attribute have
3898 * their initialized size set to @new_init_size and if @new_init_size is < 0,
3899 * the underlying attribute initialized size is set to the initialized size of
3900 * the ntfs inode @ni.
3901 *
3902 * Note the caller is responsible for any zeroing that needs to happen between
3903 * the old initialized size and @new_init_size.
3904 *
3905 * Note when this function is called for resident attributes it requires that
3906 * the initialized size equals the data size as anything else does not make
3907 * sense for resident attributes.  Further, @new_init_size must be >= 0, i.e. a
3908 * specific value must be provided as the call would otherwise be pointless as
3909 * there is no such thing as an initialized size for resident attributes.
3910 *
3911 * Return 0 on success and errno on error.
3912 *
3913 * Locking: The caller must hold @ni->lock on the inode for writing.
3914 */
errno_t ntfs_attr_set_initialized_size(ntfs_inode *ni, s64 new_init_size)
{
	ntfs_inode *base_ni;
	MFT_RECORD *m;
	ntfs_attr_search_ctx *ctx;
	ATTR_RECORD *a;
	errno_t err;
	/* Set when we grow the data size, so we know to resize the ubc too. */
	BOOL data_size_updated = FALSE;

#ifdef DEBUG
	/* Take the size lock only to get a consistent snapshot for logging. */
	lck_spin_lock(&ni->size_lock);
	ntfs_debug("Entering for mft_no 0x%llx, attribute type 0x%x, old data "
			"size 0x%llx, old initialized size 0x%llx, new "
			"initialized size 0x%llx.",
			(unsigned long long)ni->mft_no,
			(unsigned)le32_to_cpu(ni->type),
			(unsigned long long)ni->data_size,
			(unsigned long long)ni->initialized_size,
			(unsigned long long)new_init_size);
	lck_spin_unlock(&ni->size_lock);
#endif /* DEBUG */
	base_ni = ni;
	if (NInoAttr(ni))
		base_ni = ni->base_ni;
	/* Map, pin, and lock the mft record. */
	err = ntfs_mft_record_map(base_ni, &m);
	if (err)
		goto err;
	ctx = ntfs_attr_search_ctx_get(base_ni, m);
	if (!ctx) {
		err = ENOMEM;
		goto unm_err;
	}
	err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, 0, NULL, 0,
			ctx);
	if (err) {
		/* The attribute must exist; its absence means corruption. */
		if (err == ENOENT)
			err = EIO;
		goto put_err;
	}
	a = ctx->a;
	lck_spin_lock(&ni->size_lock);
	if (new_init_size >= 0) {
		/* The initialized size may only grow (see header doc). */
		if (new_init_size < ni->initialized_size)
			panic("%s(): new_init_size < ni->initialized_size\n",
					__FUNCTION__);
		/*
		 * If the new initialized size exceeds the data size extend the
		 * data size to cover the new initialized size.
		 */
		if (new_init_size > ni->data_size) {
			ni->data_size = new_init_size;
			if (a->non_resident)
				a->data_size = cpu_to_sle64(new_init_size);
			else {
				/*
				 * Resident case: the value must fit in the
				 * attribute record and a 32-bit length.
				 */
				if (NInoNonResident(ni))
					panic("%s(): NInoNonResident(ni)\n",
							__FUNCTION__);
				if (new_init_size >> 32)
					panic("%s(): new_init_size >> 32\n",
							__FUNCTION__);
				if (new_init_size > le32_to_cpu(a->length) -
						le16_to_cpu(a->value_offset))
					panic("%s(): new_init_size > "
							"le32_to_cpu("
							"a->length) - "
							"le16_to_cpu("
							"a->value_offset)\n",
							__FUNCTION__);
				a->value_length = cpu_to_le32(new_init_size);
			}
			data_size_updated = TRUE;
			/*
			 * Directories keep zero sizes in their index entries,
			 * so only mark sizes dirty for non-directories.
			 */
			if (ni == base_ni && !S_ISDIR(ni->mode))
				NInoSetDirtySizes(ni);
		}
		ni->initialized_size = new_init_size;
	} else {
		/*
		 * @new_init_size < 0: push the inode's current initialized
		 * size into the attribute record (non-resident only, see
		 * header doc).
		 */
		if (!a->non_resident)
			panic("%s(): !a->non_resident\n", __FUNCTION__);
		if (ni->initialized_size > ni->data_size)
			panic("%s(): ni->initialized_size > ni->data_size\n",
					__FUNCTION__);
		new_init_size = ni->initialized_size;
	}
	if (a->non_resident) {
		if (!NInoNonResident(ni))
			panic("%s(): !NInoNonResident(ni)\n", __FUNCTION__);
		a->initialized_size = cpu_to_sle64(new_init_size);
	}
	lck_spin_unlock(&ni->size_lock);
	/*
	 * If this is a directory B+tree index allocation attribute also update
	 * the sizes in the base inode.
	 */
	if (ni->name == I30 && ni->type == AT_INDEX_ALLOCATION) {
		lck_spin_lock(&base_ni->size_lock);
		if (data_size_updated)
			base_ni->data_size = new_init_size;
		base_ni->initialized_size = new_init_size;
		lck_spin_unlock(&base_ni->size_lock);
	}
	/* Mark the mft record dirty to ensure it gets written out. */
	NInoSetMrecNeedsDirtying(ctx->ni);
put_err:
	ntfs_attr_search_ctx_put(ctx);
unm_err:
	ntfs_mft_record_unmap(base_ni);
	if (data_size_updated) {
		/*
		 * Keep the ubc size in sync with the grown data size.
		 * NOTE(review): presumably ubc_setsize() returns non-zero on
		 * success, hence the panic on a zero return — confirm against
		 * the xnu ubc API.
		 */
		if (!ubc_setsize(ni->vn, new_init_size))
			panic("%s(): ubc_setsize() failed.\n", __FUNCTION__);
	}
	if (!err)
		ntfs_debug("Done.");
	else {
		/*
		 * The label sits inside the else so the success path above
		 * falls through without logging an error.
		 */
err:
		ntfs_error(ni->vol->mp, "Failed (error %d).", err);
	}
	return err;
}
4034
4035/**
4036 * ntfs_attr_extend_initialized - extend the initialized size of an attribute
4037 * @ni:			ntfs inode of the attribute to extend
4038 * @new_init_size:	requested new initialized size in bytes
4039 *
4040 * Extend the initialized size of an attribute described by the ntfs inode @ni
4041 * to @new_init_size bytes.  This involves zeroing any non-sparse space between
4042 * the old initialized size and @new_init_size both in the VM page cache and on
4043 * disk (if relevant complete pages are already uptodate in the VM page cache
4044 * then these are simply marked dirty).
4045 *
4046 * As a side-effect, the data size as well as the ubc size may be incremented
4047 * as, in the resident attribute case, it is tied to the initialized size and,
4048 * in the non-resident attribute case, it may not fall below the initialized
4049 * size.
4050 *
4051 * Note that if the attribute is resident, we do not need to touch the VM page
4052 * cache at all.  This is because if the VM page is not uptodate we bring it
4053 * uptodate later, when doing the write to the mft record since we then already
4054 * have the page mapped.  And if the page is uptodate, the non-initialized
4055 * region will already have been zeroed when the page was brought uptodate and
4056 * the region may in fact already have been overwritten with new data via
4057 * mmap() based writes, so we cannot just zero it.  And since POSIX specifies
4058 * that the behaviour of resizing a file whilst it is mmap()ped is unspecified,
4059 * we choose not to do zeroing and thus we do not need to touch the VM page at
4060 * all.
4061 *
4062 * Return 0 on success and errno on error.  In the case that an error is
4063 * encountered it is possible that the initialized size and/or the data size
4064 * will already have been incremented some way towards @new_init_size but it is
4065 * guaranteed that if this is the case, the necessary zeroing will also have
4066 * happened and that all metadata is self-consistent.
4067 *
4068 * Locking: - Caller must hold @ni->lock on the inode for writing.
4069 *	    - The runlist @ni must be unlocked as it is taken for writing.
4070 */
4071errno_t ntfs_attr_extend_initialized(ntfs_inode *ni, const s64 new_init_size)
4072{
4073	VCN vcn, end_vcn;
4074	s64 size, old_init_size, ofs;
4075	ntfs_volume *vol;
4076	ntfs_inode *base_ni;
4077	MFT_RECORD *m;
4078	ATTR_RECORD *a;
4079	ntfs_attr_search_ctx *ctx;
4080	u8 *kattr;
4081	ntfs_rl_element *rl = NULL;
4082	errno_t err;
4083	unsigned attr_len;
4084	BOOL locked, write_locked, is_sparse, mark_sizes_dirty;
4085
4086	lck_spin_lock(&ni->size_lock);
4087	if (new_init_size > ni->allocated_size)
4088		panic("%s(): new_init_size > ni->allocated_size\n",
4089				__FUNCTION__);
4090	size = ni->data_size;
4091	old_init_size = ni->initialized_size;
4092	lck_spin_unlock(&ni->size_lock);
4093	if (new_init_size <= old_init_size)
4094		panic("%s(): new_init_size <= old_init_size\n",
4095				__FUNCTION__);
4096	mark_sizes_dirty = write_locked = FALSE;
4097	vol = ni->vol;
4098	ntfs_debug("Entering for mft_no 0x%llx, old initialized size 0x%llx, "
4099			"new initialized size 0x%llx, old data size 0x%llx.",
4100			(unsigned long long)ni->mft_no,
4101			(unsigned long long)old_init_size,
4102			(unsigned long long)new_init_size,
4103			(unsigned long long)size);
4104	base_ni = ni;
4105	if (NInoAttr(ni))
4106		base_ni = ni->base_ni;
4107	/* Use goto to reduce indentation and we need the label below anyway. */
4108	if (NInoNonResident(ni))
4109		goto do_non_resident_extend;
4110	if (old_init_size != size)
4111		panic("%s(): old_init_size != size\n", __FUNCTION__);
4112	/* Map, pin, and lock the mft record. */
4113	err = ntfs_mft_record_map(base_ni, &m);
4114	if (err)
4115		goto err;
4116	ctx = ntfs_attr_search_ctx_get(base_ni, m);
4117	if (!ctx) {
4118		err = ENOMEM;
4119		goto unm_err;
4120	}
4121	err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, 0, NULL, 0,
4122			ctx);
4123	if (err) {
4124		if (err == ENOENT)
4125			err = EIO;
4126		goto put_err;
4127	}
4128	a = ctx->a;
4129	if (a->non_resident)
4130		panic("%s(): a->non_resident\n", __FUNCTION__);
4131	/* The total length of the attribute value. */
4132	attr_len = le32_to_cpu(a->value_length);
4133	if (size != attr_len)
4134		panic("%s(): size != attr_len\n", __FUNCTION__);
4135	/*
4136	 * Do the zeroing in the mft record and update the attribute size in
4137	 * the mft record.
4138	 */
4139	kattr = (u8*)a + le16_to_cpu(a->value_offset);
4140	bzero(kattr + attr_len, new_init_size - attr_len);
4141	a->value_length = cpu_to_le32((u32)new_init_size);
4142	/* Update the sizes in the ntfs inode as well as the ubc size. */
4143	lck_spin_lock(&ni->size_lock);
4144	ni->initialized_size = ni->data_size = size = new_init_size;
4145	lck_spin_unlock(&ni->size_lock);
4146	/* Mark the mft record dirty to ensure it gets written out. */
4147	NInoSetMrecNeedsDirtying(ctx->ni);
4148	ntfs_attr_search_ctx_put(ctx);
4149	ntfs_mft_record_unmap(base_ni);
4150	ubc_setsize(ni->vn, new_init_size);
4151	mark_sizes_dirty = TRUE;
4152	goto done;
4153do_non_resident_extend:
4154	/*
4155	 * If the new initialized size @new_init_size exceeds the current data
4156	 * size we need to extend the file size to the new initialized size.
4157	 */
4158	if (new_init_size > size) {
4159		/* Map, pin, and lock the mft record. */
4160		err = ntfs_mft_record_map(base_ni, &m);
4161		if (err)
4162			goto err;
4163		ctx = ntfs_attr_search_ctx_get(base_ni, m);
4164		if (!ctx) {
4165			err = ENOMEM;
4166			goto unm_err;
4167		}
4168		err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, 0,
4169				NULL, 0, ctx);
4170		if (err) {
4171			if (err == ENOENT)
4172				err = EIO;
4173			goto put_err;
4174		}
4175		a = ctx->a;
4176		if (!a->non_resident)
4177			panic("%s(): !a->non_resident\n", __FUNCTION__);
4178		if (size != sle64_to_cpu(a->data_size))
4179			panic("%s(): size != sle64_to_cpu(a->data_size)\n",
4180					__FUNCTION__);
4181		size = new_init_size;
4182		lck_spin_lock(&ni->size_lock);
4183		ni->data_size = new_init_size;
4184		lck_spin_unlock(&ni->size_lock);
4185		a->data_size = cpu_to_sle64(new_init_size);
4186		/* Mark the mft record dirty to ensure it gets written out. */
4187		NInoSetMrecNeedsDirtying(ctx->ni);
4188		ntfs_attr_search_ctx_put(ctx);
4189		ntfs_mft_record_unmap(base_ni);
4190		mark_sizes_dirty = TRUE;
4191		ubc_setsize(ni->vn, new_init_size);
4192	}
4193	/*
4194	 * If the attribute is not sparse we can simply map each page between
4195	 * the old initialized size and the new initialized size which takes
4196	 * care of any needed zeroing and then unmap the page again marking it
4197	 * dirty so the VM later causes it to be written out.
4198	 *
4199	 * If the file is sparse on the other hand things are more complicated
4200	 * because we want to skip any sparse regions because mapping a sparse
4201	 * page and then unmapping it again and marking it dirty would cause
4202	 * the hole to be filled when the page is written out.
4203	 *
4204	 * Thus for sparse files we walk the runlist before we start doing
4205	 * anything and check whether there are any sparse regions between the
4206	 * old initialized size and the new initialized size.  If there are no
4207	 * sparse regions we can simply proceed as if this attribute was not
4208	 * sparse.
4209	 *
4210	 * If there are sparse regions then we ensure that all runlist
4211	 * fragments between the old initialized size and new initialized size
4212	 * are mapped and then we hold the runlist lock shared and walk the
4213	 * runlist and only for non-sparse regions do we do the page mapping,
4214	 * unmapping and dirtying.
4215	 */
4216	ofs = old_init_size & ~PAGE_MASK_64;
4217	write_locked = locked = FALSE;
4218	is_sparse = (NInoSparse(ni));
4219	if (is_sparse) {
4220		BOOL have_holes = FALSE;
4221
4222		locked = TRUE;
4223		lck_rw_lock_shared(&ni->rl.lock);
4224		vcn = ofs >> vol->cluster_size_shift;
4225		end_vcn = (new_init_size + vol->cluster_size_mask) >>
4226				vol->cluster_size_shift;
4227retry_remap:
4228		rl = ni->rl.rl;
4229		if (!ni->rl.elements || vcn < rl->vcn || !rl->length) {
4230map_vcn:
4231			if (!write_locked) {
4232				write_locked = TRUE;
4233				if (!lck_rw_lock_shared_to_exclusive(
4234						&ni->rl.lock)) {
4235					lck_rw_lock_exclusive(&ni->rl.lock);
4236					goto retry_remap;
4237				}
4238			}
4239			/* Need to map the runlist fragment containing @vcn. */
4240			err = ntfs_map_runlist_nolock(ni, vcn, NULL);
4241			if (err) {
4242				ntfs_error(vol->mp, "Failed to map runlist "
4243						"fragment (error %d).", err);
4244				if (err == EINVAL)
4245					err = EIO;
4246				goto unl_err;
4247			}
4248			rl = ni->rl.rl;
4249			if (!ni->rl.elements || vcn < rl->vcn || !rl->length)
4250				panic("%s(): !ni->rl.elements || "
4251						"vcn < rl[0].vcn || "
4252						"!rl->length\n", __FUNCTION__);
4253		}
4254		/* Seek to the runlist element containing @vcn. */
4255		while (rl->length && vcn >= rl[1].vcn)
4256			rl++;
4257		do {
4258			/*
4259			 * If this run is not mapped map it now and start again
4260			 * as the runlist will have been updated.
4261			 */
4262			if (rl->lcn == LCN_RL_NOT_MAPPED) {
4263				vcn = rl->vcn;
4264				goto map_vcn;
4265			}
4266			/* If this run is not valid abort with an error. */
4267			if (!rl->length || rl->lcn < LCN_HOLE)
4268				goto rl_err;
4269			if (rl->lcn == LCN_HOLE) {
4270				have_holes = TRUE;
4271				/*
4272				 * If the current initialized size is inside
4273				 * the current run we can move the initialized
4274				 * size forward to the end of this run taking
4275				 * care not to go beyond the new initialized
4276				 * size.
4277				 *
4278				 * Note we also have to take care not to move
4279				 * the initialized size backwards thus we only
4280				 * have to update the initialized size if the
4281				 * current offset is above the old initialized
4282				 * size.
4283				 */
4284				if (ofs >> vol->cluster_size_shift >= rl->vcn) {
4285					ofs = rl[1].vcn <<
4286							vol->cluster_size_shift;
4287					if (ofs > old_init_size) {
4288						if (ofs > new_init_size)
4289							ofs = new_init_size;
4290						lck_spin_lock(&ni->size_lock);
4291						ni->initialized_size = ofs;
4292						lck_spin_unlock(&ni->size_lock);
4293						if (ofs == new_init_size)
4294							goto update_done;
4295					}
4296				}
4297			}
4298			/* Proceed to the next run. */
4299			rl++;
4300		} while (rl->vcn < end_vcn);
4301		/*
4302		 * If we encountered sparse regions in the runlist then we need
4303		 * to keep the runlist lock shared.
4304		 *
4305		 * If there were no sparse regions we do not need the runlist
4306		 * lock at all any more so we release it and we pretend this
4307		 * attribute is not sparse.
4308		 */
4309		if (have_holes) {
4310			if (write_locked) {
4311				lck_rw_lock_exclusive_to_shared(&ni->rl.lock);
4312				write_locked = FALSE;
4313			}
4314			/*
4315			 * We may have moved @ofs forward in which case it will
4316			 * be cluster aligned instead of page aligned and the
4317			 * two are not equal when the cluster size is less than
4318			 * the page size so we need to align at @ofs to the
4319			 * page size again.
4320			 */
4321			ofs &= ~PAGE_MASK_64;
4322			rl = ni->rl.rl;
4323		} else {
4324			if (write_locked)
4325				lck_rw_unlock_exclusive(&ni->rl.lock);
4326			else
4327				lck_rw_unlock_shared(&ni->rl.lock);
4328			locked = FALSE;
4329			is_sparse = FALSE;
4330		}
4331	}
4332	do {
4333		/*
4334		 * If the file is sparse, check if the current page is
4335		 * completely sparse and if so skip it.
4336		 *
4337		 * Otherwise take care of zeroing the uninitialized region.
4338		 */
4339		if (is_sparse) {
4340			/* We need to update @vcn to the current offset @ofs. */
4341			vcn = ofs >> vol->cluster_size_shift;
4342			/* Determine the first VCN outside the current page. */
4343			end_vcn = (ofs + PAGE_SIZE + vol->cluster_size_mask) >>
4344					vol->cluster_size_shift;
4345			/* Seek to the runlist element containing @vcn. */
4346			while (rl->length && vcn >= rl[1].vcn)
4347				rl++;
4348			/* If this run is not valid abort with an error. */
4349			if (!rl->length || rl->lcn < LCN_HOLE)
4350				goto rl_err;
4351			/*
4352			 * @rl is the runlist element containing @ofs, the
4353			 * current initialized size, and the current @vcn.
4354			 *
4355			 * Check whether the current page is completely sparse.
4356			 * This is complicated slightly by the fact that a page
4357			 * can span multiple clusters when the cluster size is
4358			 * less than the page size.
4359			 *
4360			 * As an optimization when a sparse run spans more than
4361			 * one page we forward both @ofs and the initialized
4362			 * size to the end of the run (ensuring it is page
4363			 * aligned).
4364			 */
4365			do {
4366				if (rl->lcn >= 0) {
4367					/* This page is not entirely sparse. */
4368					goto on_disk_page;
4369				}
4370				/* Proceed to the next run. */
4371				rl++;
4372				vcn = rl->vcn;
4373			} while (vcn < end_vcn && rl->length);
4374			/*
4375			 * The page is entirely sparse.
4376			 *
4377			 * Check how many pages are entirely sparse and move
4378			 * the initialized size up to the end of the sparse
4379			 * region ensuring we maintain page alignment.
4380			 */
4381			while (rl->lcn == LCN_HOLE && rl->length)
4382				rl++;
4383			ofs = (rl->vcn << vol->cluster_size_shift) &
4384					~PAGE_MASK_64;
4385			/*
4386			 * Update the initialized size in the ntfs inode.  This
4387			 * is enough to make ntfs_vnop_pageout() work.  We
4388			 * could postpone this until we actually are going to
4389			 * unmap a page or we have reached the end of the
4390			 * region to be initialized but we do it now to
4391			 * minimize our impact on processes that are performing
4392			 * concurrent mmap() based writes to this attribute.
4393			 *
4394			 * FIXME: This is not actually true as the caller is
4395			 * holding the ntfs inode lock for writing thus no
4396			 * pageouts on this inode can occur at all.  We
4397			 * probably need to fix this so we cannot bring the
4398			 * system out of memory.
4399			 */
4400			if (ofs > new_init_size)
4401				ofs = new_init_size;
4402			lck_spin_lock(&ni->size_lock);
4403			ni->initialized_size = ofs;
4404			lck_spin_unlock(&ni->size_lock);
4405		} else /* if (!is_sparse) */ {
4406			upl_t upl;
4407			upl_page_info_array_t pl;
4408
4409on_disk_page:
4410			/*
4411			 * Read the page.  If the page is not present,
4412			 * ntfs_page_map() will zero the uninitialized/sparse
4413			 * regions for us.
4414			 *
4415			 * TODO: An optimization would be to do things by hand
4416			 * taking advantage of dealing with multiple pages at
4417			 * once instead of working one page at a time.
4418			 *
4419			 * FIXME: We are potentially creating a lot of dirty
4420			 * pages here and since the caller is holding the ntfs
4421			 * inode lock for writing no pageouts on this inode can
4422			 * occur at all.  We probably need to fix this so we
4423			 * cannot bring the system out of memory.
4424			 */
4425// TODO: This should never happen.  Just adding it so we can detect if we were
4426// going to deadlock.  If it triggers need to fix it in the code so it does
4427// not.  Or perhaps just remove the warning and use this as the solution.
4428			if (locked && write_locked) {
4429				write_locked = FALSE;
4430				lck_rw_lock_exclusive_to_shared(&ni->rl.lock);
4431				ntfs_warning(vol->mp, "Switching runlist lock "
4432						"to shared to avoid "
4433						"deadlock.");
4434			}
4435			err = ntfs_page_map(ni, ofs, &upl, &pl, &kattr, TRUE);
4436			if (err)
4437				goto unl_err;
4438			/*
4439			 * Update the initialized size in the ntfs inode.  This
4440			 * is enough to make ntfs_vnop_pageout() work.
4441			 */
4442			ofs += PAGE_SIZE;
4443			if (ofs > new_init_size)
4444				ofs = new_init_size;
4445			lck_spin_lock(&ni->size_lock);
4446			ni->initialized_size = ofs;
4447			lck_spin_unlock(&ni->size_lock);
4448			/* Set the page dirty so it gets written out. */
4449			ntfs_page_unmap(ni, upl, pl, TRUE);
4450		}
4451	} while (ofs < new_init_size);
4452	lck_spin_lock(&ni->size_lock);
4453	if (ni->initialized_size != new_init_size)
4454		panic("%s(): ni->initialized_size != new_init_size\n",
4455				__FUNCTION__);
4456	lck_spin_unlock(&ni->size_lock);
4457update_done:
4458	/* If we are holding the runlist lock, release it now. */
4459	if (locked) {
4460		if (write_locked)
4461			lck_rw_unlock_exclusive(&ni->rl.lock);
4462		else
4463			lck_rw_unlock_shared(&ni->rl.lock);
4464		locked = FALSE;
4465	}
4466	/* Bring up to date the initialized_size in the attribute record. */
4467	err = ntfs_attr_set_initialized_size(ni, -1);
4468	if (err)
4469		goto unl_err;
4470done:
4471	/*
4472	 * If we have modified the size of the base inode, cause the sizes to
4473	 * be written to all the directory index entries pointing to the base
4474	 * inode when the inode is written to disk.
4475	 */
4476	if (mark_sizes_dirty && ni == base_ni && !S_ISDIR(ni->mode))
4477		NInoSetDirtySizes(ni);
4478	ntfs_debug("Done, new initialized size 0x%llx, new data size 0x%llx.",
4479			(unsigned long long)new_init_size,
4480			(unsigned long long)size);
4481	return 0;
4482rl_err:
4483	ntfs_error(vol->mp, "Runlist is corrupt.  Unmount and run chkdsk.");
4484	NVolSetErrors(vol);
4485	err = EIO;
4486unl_err:
4487	if (locked) {
4488		if (write_locked)
4489			lck_rw_unlock_exclusive(&ni->rl.lock);
4490		else
4491			lck_rw_unlock_shared(&ni->rl.lock);
4492	}
4493	lck_spin_lock(&ni->size_lock);
4494	ni->initialized_size = old_init_size;
4495	lck_spin_unlock(&ni->size_lock);
4496	goto err;
4497put_err:
4498	ntfs_attr_search_ctx_put(ctx);
4499unm_err:
4500	ntfs_mft_record_unmap(base_ni);
4501err:
4502	ntfs_debug("Failed (error %d).", err);
4503	return err;
4504}
4505
4506/**
4507 * ntfs_attr_sparse_set - switch an attribute to be sparse
4508 * @base_ni:	base ntfs inode to which the attribute belongs
4509 * @ni:		ntfs inode of attribute which to cause to be sparse
4510 * @ctx:	attribute search context describing the attribute to work on
4511 *
4512 * Switch the non-sparse, base attribute described by @ni and @ctx belonging to
4513 * the base ntfs inode @base_ni to be sparse.
4514 *
4515 * Return 0 on success and errno on error.
4516 *
 * Note that the attribute record may be relocated (e.g. to an extent mft
 * record) in order to gain the space needed to add the compressed size field
 * to it.  Thus any cached values of @ctx->ni, @ctx->m, and @ctx->a are
 * invalid after this function returns.
4520 */
4521static errno_t ntfs_attr_sparse_set(ntfs_inode *base_ni, ntfs_inode *ni,
4522		ntfs_attr_search_ctx *ctx)
4523{
4524#if 0
4525	VCN highest_vcn, stop_vcn;
4526	ntfs_volume *vol;
4527	MFT_RECORD *base_m, *m;
4528	ATTR_RECORD *a;
4529	ntfs_rl_element *rl;
4530	ntfs_inode *eni;
4531	ATTR_LIST_ENTRY *al_entry;
4532	unsigned name_size, mp_ofs, mp_size, al_entry_len, new_al_size;
4533	unsigned new_al_alloc;
4534	errno_t err;
4535	BOOL rewrite;
4536#endif
4537
4538	ntfs_debug("Entering for mft_no 0x%llx, type 0x%x, name_len 0x%x.",
4539			(unsigned long long)base_ni->mft_no,
4540			(unsigned)le32_to_cpu(ni->type), ni->name_len);
4541	return ENOTSUP;
4542#if 0
4543	vol = base_ni->vol;
4544	base_m = base_ni->m;
4545	m = ctx->m;
4546	a = ctx->a;
4547	rewrite = FALSE;
4548	/*
4549	 * We should only be called for non-sparse, non-resident, $DATA
4550	 * attributes.
4551	 */
4552	if (a->type != AT_DATA || !NInoNonResident(ni) || !a->non_resident ||
4553			NInoSparse(ni) || a->flags & ATTR_IS_SPARSE)
4554		panic("%s(): a->type != AT_DATA || !NInoNonResident(ni) || "
4555				"!a->non_resident || NInoSparse(ni) || "
4556				"a->flags & ATTR_IS_SPARSE\n", __FUNCTION__);
4557	/*
4558	 * If the attribute is not compressed either, we need to add the
4559	 * compressed size to the attribute record and to switch all relevant
4560	 * fields to match.
4561	 */
4562	if (NInoCompressed(ni))
4563		goto is_compressed;
4564	if (a->flags & ATTR_IS_COMPRESSED)
4565		panic("%s(): a->flags & ATTR_IS_COMPRESSED)\n", __FUNCTION__);
4566retry_attr_rec_resize:
4567	err = ntfs_attr_record_resize(m, a, le32_to_cpu(a->length) +
4568			sizeof(a->compressed_size));
4569	if (!err) {
4570		/*
4571		 * Move everything at the offset of the compressed size to make
4572		 * space for the compressed size.
4573		 */
4574		memmove((u8*)a + offsetof(ATTR_RECORD, compressed_size) +
4575				sizeof(a->compressed_size), (u8*)a +
4576				offsetof(ATTR_RECORD, compressed_size),
4577				le32_to_cpu(a->length) - offsetof(ATTR_RECORD,
4578				compressed_size));
4579		/*
4580		 * Update the name offset to match the moved data.  If there is
4581		 * no name then set the name offset to the correct position
4582		 * instead of adding to a potentially incorrect value.
4583		 */
4584		if (a->name_length)
4585			a->name_offset = cpu_to_le16(
4586					le16_to_cpu(a->name_offset) +
4587					sizeof(a->compressed_size));
4588		else
4589			a->name_offset = const_cpu_to_le16(
4590					offsetof(ATTR_RECORD,
4591					compressed_size) +
4592					sizeof(a->compressed_size));
4593		/* Update the mapping pairs offset to its new location. */
4594		mp_ofs = le16_to_cpu(a->mapping_pairs_offset) +
4595				sizeof(a->compressed_size);
4596		goto set_compressed_size;
4597	}
4598	/*
4599	 * There is not enough space in the mft record.
4600	 *
4601	 * We need to add an attribute list attribute if it is not already
4602	 * present.
4603	 */
4604	if (!NInoAttrList(base_ni)) {
4605		err = ntfs_attr_list_add(base_ni, base_m, ctx);
4606		if (err || ctx->is_error) {
4607			if (!err)
4608				err = ctx->error;
4609			ntfs_error(vol->mp, "Failed to %s mft_no 0x%llx "
4610					"(error %d).", ctx->is_error ?
4611					"remap extent mft record of" :
4612					"add attribute list attribute to",
4613					(unsigned long long)base_ni->mft_no,
4614					err);
4615			return err;
4616		}
4617		/*
4618		 * The attribute location will have changed so update it from
4619		 * the search context.
4620		 */
4621		m = ctx->m;
4622		a = ctx->a;
4623		/*
4624		 * Retry the original attribute record resize as we may now
4625		 * have enough space to add the compressed size to the
4626		 * attribute record.
4627		 *
4628		 * This can for example happen when the attribute was moved out
4629		 * to an extent mft record which has much more free space than
4630		 * the base mft record had.
4631		 */
4632		goto retry_attr_rec_resize;
4633	}
4634	/*
4635	 * If this is not the only attribute record in the mft record then move
4636	 * it out to a new extent mft record which is guaranteed to generate
4637	 * enough space to add the compressed size to the attribute record.
4638	 */
4639	if (!ntfs_attr_record_is_only_one(m, a)) {
4640		lck_rw_lock_shared(&base_ni->attr_list_rl.lock);
4641		err = ntfs_attr_record_move(ctx);
4642		lck_rw_unlock_shared(&base_ni->attr_list_rl.lock);
4643		if (err) {
4644			ntfs_error(vol->mp, "Failed to move attribute extent "
4645					"from mft record 0x%llx to an extent "
4646					"mft record (error %d).",
4647					(unsigned long long)ctx->ni->mft_no,
4648					err);
4649			/*
4650			 * We could try to remove the attribute list attribute
4651			 * if we added it above but this will require
4652			 * attributes to be moved back into the base mft record
4653			 * from extent mft records so is a lot of work and
4654			 * given we are in an error code path and given that it
4655			 * is ok to just leave the inode with an attribute list
4656			 * attribute we do not bother and just bail out.
4657			 */
4658			return err;
4659		}
4660		/*
4661		 * The attribute location will have changed so update it from
4662		 * the search context.
4663		 */
4664		m = ctx->m;
4665		a = ctx->a;
4666		/*
4667		 * Retry the original attribute record resize as we will now
4668		 * have enough space to add the compressed size to the
4669		 * attribute record.
4670		 */
4671		goto retry_attr_rec_resize;
4672	}
4673	/*
4674	 * This is the only attribute in the mft record thus there is nothing
4675	 * to gain by moving it to another extent mft record.  So to generate
4676	 * space, we allocate a new extent mft record, create a new extent
4677	 * attribute record in it and use it to catch the overflow mapping
4678	 * pairs array data generated by the fact that we have added the
4679	 * compressed size to the base extent.
4680	 *
4681	 * TODO: We could instead iterate over all existing extent attribute
4682	 * records and rewrite the entire mapping pairs array but this could
4683	 * potentially be a lot of overhead.  On the other hand it would be an
4684	 * infrequent event thus the overhead may be worth it in the long term
4685	 * as it will generate better packed metadata.  For now we choose the
4686	 * simpler approach of just doing the splitting into a new extent
4687	 * attribute record.
4688	 *
4689	 * As we are going to rewrite the mapping pairs array we need to make
4690	 * sure we have decompressed the mapping pairs from the base attribute
4691	 * extent and have them cached in the runlist.
4692	 */
4693	if (!ni->rl.elements || ni->rl.rl->lcn == LCN_RL_NOT_MAPPED) {
4694		err = ntfs_mapping_pairs_decompress(vol, a, &ni->rl);
4695		if (err) {
4696			ntfs_error(vol->mp, "Mapping of the base runlist "
4697					"fragment failed (error %d).", err);
4698			if (err != ENOMEM)
4699				err = EIO;
4700			return err;
4701		}
4702	}
4703	rewrite = TRUE;
4704	/*
4705	 * Now add the compressed size so we can unmap the mft record of the
4706	 * base attribute extent if it is an extent mft record.
4707	 *
4708	 * First, move the name if present to its new location and update the
4709	 * name offset to match the new location.
4710	 */
4711	name_size = a->name_length * sizeof(ntfschar);
4712	if (name_size)
4713		memmove((u8*)a + offsetof(ATTR_RECORD, compressed_size) +
4714				sizeof(a->compressed_size), (u8*)a +
4715				le16_to_cpu(a->name_offset), name_size);
4716	a->name_offset = const_cpu_to_le16(offsetof(ATTR_RECORD,
4717			compressed_size) + sizeof(a->compressed_size));
4718	/* Update the mapping pairs offset to its new location. */
4719	mp_ofs = (offsetof(ATTR_RECORD, compressed_size) +
4720			sizeof(a->compressed_size) + name_size + 7) & ~7;
4721set_compressed_size:
4722	a->mapping_pairs_offset = cpu_to_le16(mp_ofs);
4723	/*
4724	 * Set the compression unit to 0 or 4 depending on the NTFS volume
4725	 * version.  FIXME: We know that NT4 uses 4 whilst XPSP2 uses 0 and we
4726	 * do not know what 2k uses so we assume 2k is the same as XPSP2.
4727	 */
4728	if (vol->major_ver > 1) {
4729		a->compression_unit = 0;
4730		ni->compression_block_size = 0;
4731		ni->compression_block_clusters =
4732				ni->compression_block_size_shift = 0;
4733	} else {
4734		a->compression_unit = NTFS_COMPRESSION_UNIT;
4735		ni->compression_block_size = 1U << (NTFS_COMPRESSION_UNIT +
4736				vol->cluster_size_shift);
4737		ni->compression_block_size_shift =
4738				ffs(ni->compression_block_size) - 1;
4739		ni->compression_block_clusters = 1U << NTFS_COMPRESSION_UNIT;
4740	}
4741	lck_spin_lock(&ni->size_lock);
4742	ni->compressed_size = ni->allocated_size;
4743	a->compressed_size = a->allocated_size;
4744	lck_spin_unlock(&ni->size_lock);
4745is_compressed:
4746	/* Mark both the attribute and the ntfs inode as sparse. */
4747	a->flags |= ATTR_IS_SPARSE;
4748	NInoSetSparse(ni);
4749	/*
4750	 * If this is the unnamed $DATA attribute, need to set the sparse flag
4751	 * in the standard information attribute and in the directory entries,
4752	 * too.
4753	 */
4754	if (ni == base_ni) {
4755		ni->file_attributes |= FILE_ATTR_SPARSE_FILE;
4756		NInoSetDirtyFileAttributes(ni);
4757	}
4758	/* If we do not need to rewrite the mapping pairs array we are done. */
4759	if (!rewrite)
4760		goto done;
4761	/*
4762	 * Determine the size of the mapping pairs array needed to fit all the
4763	 * runlist elements that were stored in the base attribute extent
4764	 * before we added the compressed size to the attribute record.
4765	 */
4766	highest_vcn = sle64_to_cpu(a->highest_vcn);
4767	err = ntfs_get_size_for_mapping_pairs(vol, ni->rl.elements ?
4768			ni->rl.rl : NULL, 0, highest_vcn, &mp_size);
4769	if (err) {
4770		ntfs_error(vol->mp, "Failed to get size for mapping pairs "
4771				"array (error %d).", err);
4772		goto undo1;
4773	}
4774	/* Write the mapping pairs array. */
4775	err = ntfs_mapping_pairs_build(vol, (s8*)a + mp_ofs,
4776			le32_to_cpu(a->length) - mp_ofs, ni->rl.elements ?
4777			ni->rl.rl : NULL, 0, highest_vcn, &stop_vcn);
4778	if (err && err != ENOSPC) {
4779		ntfs_error(vol->mp, "Failed to rebuild mapping pairs array "
4780				"(error %d).", err);
4781		goto undo1;
4782	}
4783	/* If by some miracle it all fitted we are done. */
4784	if (!err)
4785		goto done;
4786	/* Update the highest vcn to the new value. */
4787	a->highest_vcn = cpu_to_sle64(stop_vcn - 1);
4788	/*
4789	 * If the base attribute extent is in an extent mft record mark it
4790	 * dirty so it gets written back and unmap the extent mft record so we
4791	 * can allocate the new extent mft record.
4792	 */
4793	if (ctx->ni != base_ni) {
4794		NInoSetMrecNeedsDirtying(ctx->ni);
4795		ntfs_extent_mft_record_unmap(ctx->ni);
4796		/* Make the search context safe. */
4797		ctx->ni = base_ni;
4798	}
4799	/*
4800	 * Get the runlist element containing the lowest vcn for the new
4801	 * attribute record, i.e. @stop_vcn.
4802	 *
4803	 * This cannot fail as we know the runlist is ok and the runlist
4804	 * fragment containing @stop_vcn is mapped.
4805	 */
4806	rl = NULL;
4807	if (ni->rl.elements) {
4808		rl = ntfs_rl_find_vcn_nolock(ni->rl.rl, stop_vcn);
4809		if (!rl)
4810			panic("%s(): Memory corruption detected.\n",
4811					__FUNCTION__);
4812	}
4813	/*
4814	 * Determine the size of the mapping pairs array needed to fit all the
4815	 * remaining runlist elements that were stored in the base attribute
4816	 * extent before we added the compressed size to the attribute record
4817	 * but did now not fit.
4818	 */
4819	err = ntfs_get_size_for_mapping_pairs(vol, rl, stop_vcn, highest_vcn,
4820			&mp_size);
4821	if (err) {
4822		ntfs_error(vol->mp, "Failed to get size for mapping pairs "
4823				"array (error %d).", err);
4824		goto undo2;
4825	}
4826	/*
4827	 * We now need to allocate a new extent mft record, attach it to the
4828	 * base ntfs inode and set up the search context to point to it, then
4829	 * insert the new attribute record into it.
4830	 */
4831	err = ntfs_mft_record_alloc(vol, NULL, NULL, ni, &eni, &m, &a);
4832	if (err) {
4833		ntfs_error(vol->mp, "Failed to allocate a new extent mft "
4834				"record (error %d).", err);
4835		goto undo2;
4836	}
4837	ctx->ni = eni;
4838	ctx->m = m;
4839	ctx->a = a;
4840	/*
4841	 * Calculate the offset into the new attribute at which the mapping
4842	 * pairs array begins.  The mapping pairs array is placed after the
4843	 * name aligned to an 8-byte boundary which in turn is placed
4844	 * immediately after the non-resident attribute record itself.
4845	 *
4846	 * Note that extent attribute records do not have the compressed size
4847	 * field in their attribute records.
4848	 */
4849	mp_ofs = (offsetof(ATTR_RECORD, compressed_size) + name_size + 7) & ~7;
4850	/*
4851	 * Make space for the new attribute extent.  This cannot fail as we now
4852	 * have an empty mft record which by definition can hold a non-resident
4853	 * attribute record with just a small mapping pairs array.
4854	 */
4855	err = ntfs_attr_record_make_space(m, a, mp_ofs + mp_size);
4856	if (err)
4857		panic("%s(): err (ntfs_attr_record_make_space())\n",
4858				__FUNCTION__);
4859	/*
4860	 * Now setup the new attribute record.  The entire attribute has been
4861	 * zeroed and the length of the attribute record has been set.
4862	 *
4863	 * Before we proceed with setting up the attribute, add an attribute
4864	 * list attribute entry for the created attribute extent.
4865	 */
4866	al_entry = ctx->al_entry = (ATTR_LIST_ENTRY*)((u8*)ctx->al_entry +
4867			le16_to_cpu(ctx->al_entry->length));
4868	al_entry_len = (offsetof(ATTR_LIST_ENTRY, name) + name_size + 7) & ~7;
4869	new_al_size = base_ni->attr_list_size + al_entry_len;
4870	/* Out of bounds checks. */
4871	if ((u8*)al_entry < base_ni->attr_list || (u8*)al_entry >
4872			base_ni->attr_list + new_al_size || (u8*)al_entry +
4873			al_entry_len > base_ni->attr_list + new_al_size) {
4874		/* Inode is corrupt. */
4875		ntfs_error(vol->mp, "Inode 0x%llx is corrupt.  Run chkdsk.",
4876				(unsigned long long)base_ni->mft_no);
4877		err = EIO;
4878		goto undo3;
4879	}
4880	err = ntfs_attr_size_bounds_check(vol, AT_ATTRIBUTE_LIST, new_al_size);
4881	if (err) {
4882		if (err == ERANGE) {
4883			ntfs_error(vol->mp, "Attribute list attribute would "
4884					"become to large.  You need to "
4885					"defragment your volume and then try "
4886					"again.");
4887			err = ENOSPC;
4888		} else {
4889			ntfs_error(vol->mp, "Attribute list attribute is "
4890					"unknown on the volume.  The volume "
4891					"is corrupt.  Run chkdsk.");
4892			NVolSetErrors(vol);
4893			err = EIO;
4894		}
4895		goto undo3;
4896	}
4897	/*
4898	 * Reallocate the memory buffer if needed and create space for the new
4899	 * entry.
4900	 */
4901	new_al_alloc = (new_al_size + NTFS_ALLOC_BLOCK - 1) &
4902			~(NTFS_ALLOC_BLOCK - 1);
4903	if (new_al_alloc > base_ni->attr_list_alloc) {
4904		u8 *tmp, *al, *al_end;
4905		unsigned al_entry_ofs;
4906
4907		tmp = OSMalloc(new_al_alloc, ntfs_malloc_tag);
4908		if (!tmp) {
4909			ntfs_error(vol->mp, "Not enough memory to extend the "
4910					"attribute list attribute.");
4911			err = ENOMEM;
4912			goto undo3;
4913		}
4914		al = base_ni->attr_list;
4915		al_entry_ofs = (u8*)al_entry - al;
4916		al_end = al + base_ni->attr_list_size;
4917		memcpy(tmp, al, al_entry_ofs);
4918		if ((u8*)al_entry < al_end)
4919			memcpy(tmp + al_entry_ofs + al_entry_len, al +
4920					al_entry_ofs, base_ni->attr_list_size -
4921					al_entry_ofs);
4922		al_entry = ctx->al_entry = (ATTR_LIST_ENTRY*)(tmp +
4923				al_entry_ofs);
4924		OSFree(base_ni->attr_list, base_ni->attr_list_alloc,
4925				ntfs_malloc_tag);
4926		base_ni->attr_list_alloc = new_al_alloc;
4927		base_ni->attr_list = tmp;
4928	} else if ((u8*)al_entry < base_ni->attr_list +
4929			base_ni->attr_list_size)
4930		memmove((u8*)al_entry + al_entry_len, al_entry,
4931				base_ni->attr_list_size - ((u8*)al_entry -
4932				base_ni->attr_list));
4933	base_ni->attr_list_size = new_al_size;
4934	/* Set up the attribute extent and the attribute list entry. */
4935	al_entry->type = a->type = ni->type;
4936	al_entry->length = cpu_to_le16(al_entry_len);
4937	a->non_resident = 1;
4938	al_entry->name_length = a->name_length = ni->name_len;
4939	a->name_offset = const_cpu_to_le16(offsetof(ATTR_RECORD,
4940			compressed_size));
4941	al_entry->name_offset = offsetof(ATTR_LIST_ENTRY, name);
4942	al_entry->instance = a->instance = m->next_attr_instance;
4943	/*
4944	 * Increment the next attribute instance number in the mft record as we
4945	 * consumed the old one.
4946	 */
4947	m->next_attr_instance = cpu_to_le16(
4948			(le16_to_cpu(m->next_attr_instance) + 1) & 0xffff);
4949	al_entry->lowest_vcn = a->lowest_vcn = cpu_to_sle64(stop_vcn);
4950	a->highest_vcn = cpu_to_sle64(highest_vcn);
4951	al_entry->mft_reference = MK_LE_MREF(eni->mft_no, eni->seq_no);
4952	a->mapping_pairs_offset = cpu_to_le16(mp_ofs);
4953	/* Copy the attribute name into place. */
4954	if (name_size) {
4955		memcpy((u8*)a + offsetof(ATTR_RECORD, compressed_size),
4956				ni->name, name_size);
4957		memcpy(&al_entry->name, ni->name, name_size);
4958	}
4959	/* For tidyness, zero out the unused space. */
4960	if (al_entry_len > offsetof(ATTR_LIST_ENTRY, name) + name_size)
4961		memset((u8*)al_entry + offsetof(ATTR_LIST_ENTRY, name) +
4962				name_size, 0, al_entry_len -
4963				(offsetof(ATTR_LIST_ENTRY, name) + name_size));
4964	/*
4965	 * Extend the attribute list attribute and copy in the modified value
4966	 * from the cache.
4967	 */
4968	err = ntfs_attr_list_sync_extend(base_ni, base_m,
4969			(u8*)al_entry - base_ni->attr_list, ctx);
4970	if (err || ctx->is_error) {
4971		/*
4972		 * If @ctx->is_error indicates error this is fatal as we cannot
4973		 * build the mapping pairs array into it as it is not mapped.
4974		 *
4975		 * However, we may still be able to recover from this situation
4976		 * by freeing the extent mft record and thus deleting the
4977		 * attribute record.  This only works when this is the only
4978		 * attribute record in the mft record and when we just created
4979		 * this extent attribute record.  We can easily determine if
4980		 * this is the only attribute in the mft record by scanning
4981		 * through the cached attribute list attribute.
4982		 */
4983		if (!err)
4984			err = ctx->error;
4985		ntfs_error(vol->mp, "Failed to %s mft_no 0x%llx (error %d).",
4986				ctx->is_error ?  "remap extent mft record of" :
4987				"extend and sync attribute list attribute to",
4988				(unsigned long long)base_ni->mft_no, err);
4989		goto undo4;
4990	}
4991	/*
4992	 * Finally, proceed to building the mapping pairs array into the
4993	 * attribute record.
4994	 */
4995	err = ntfs_mapping_pairs_build(vol, (s8*)a + mp_ofs,
4996			le32_to_cpu(a->length) - mp_ofs, rl, stop_vcn,
4997			highest_vcn, &stop_vcn);
4998	if (err && err != ENOSPC) {
4999		ntfs_error(vol->mp, "Failed to rebuild mapping pairs array "
5000				"(error %d).", err);
5001		goto undo5;
5002	}
5003	/*
5004	 * We must have fully rebuilt the mapping pairs array as we made sure
5005	 * there is enough space.
5006	 */
5007	if (err || stop_vcn != highest_vcn + 1)
5008		panic("%s(): err || stop_vcn != highest_vcn + 1\n",
5009				__FUNCTION__);
5010	/*
5011	 * If the attribute extent is in an extent mft record mark it dirty so
5012	 * it gets written back and unmap the extent mft record so we can map
5013	 * the mft record containing the base extent again.
5014	 */
5015	if (eni != base_ni) {
5016		NInoSetMrecNeedsDirtying(eni);
5017		ntfs_extent_mft_record_unmap(eni);
5018		/* Make the search context safe. */
5019		ctx->ni = base_ni;
5020	}
5021	/*
5022	 * Look up the base attribute extent again so we restore the search
5023	 * context as the caller expects it to be.
5024	 */
5025	ntfs_attr_search_ctx_reinit(ctx);
5026	err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, 0, NULL, 0,
5027			ctx);
5028	if (err) {
5029		ntfs_error(vol->mp, "Re-lookup of first attribute extent "
5030				"failed (error %d).", err);
5031		if (err == ENOENT)
5032			err = EIO;
5033		goto undo6;
5034	}
5035done:
5036	ntfs_debug("Done.");
5037	return 0;
5038// TODO: HERE:
5039undo6:
5040undo5:
5041undo4:
5042undo3:
5043undo2:
5044undo1:
5045	panic("%s(): TODO!\n", __FUNCTION__);
5046	return err;
5047#endif
5048}
5049
5050/**
5051 * ntfs_attr_sparse_clear - switch an attribute to not be sparse any more
5052 * @base_ni:	base ntfs inode to which the attribute belongs
5053 * @ni:		ntfs inode of attribute which to cause not to be sparse
5054 * @ctx:	attribute search context describing the attribute to work on
5055 *
5056 * Switch the sparse attribute described by @ni and @ctx belonging to the base
5057 * ntfs inode @base_ni to not be sparse any more.
5058 *
5059 * This function cannot fail.
5060 */
5061static void ntfs_attr_sparse_clear(ntfs_inode *base_ni, ntfs_inode *ni,
5062		ntfs_attr_search_ctx *ctx)
5063{
5064	ATTR_RECORD *a;
5065
5066	a = ctx->a;
5067	/*
5068	 * We should only be called for sparse, non-resident, $DATA attributes.
5069	 */
5070	if (a->type != AT_DATA || !NInoNonResident(ni) || !a->non_resident ||
5071			!NInoSparse(ni) || !(a->flags & ATTR_IS_SPARSE))
5072		panic("%s(): a->type != AT_DATA || !NInoNonResident(ni) || "
5073				"!a->non_resident || !NInoSparse(ni) || "
5074				"!(a->flags & ATTR_IS_SPARSE)\n", __FUNCTION__);
5075	/*
5076	 * If the attribute is not compressed we need to remove the compressed
5077	 * size from the attribute record and to switch all relevant fields to
5078	 * match.
5079	 */
5080	if (!NInoCompressed(ni)) {
5081		errno_t err;
5082
5083		if (a->flags & ATTR_IS_COMPRESSED)
5084			panic("%s(): a->flags & ATTR_IS_COMPRESSED)\n",
5085					__FUNCTION__);
5086		/*
5087		 * Move everything after the compressed size forward to the
5088		 * offset of the compressed size thus deleting the compressed
5089		 * size.
5090		 */
5091		memmove((u8*)a + offsetof(ATTR_RECORD, compressed_size),
5092				(u8*)a + offsetof(ATTR_RECORD,
5093				compressed_size) + sizeof(a->compressed_size),
5094				le32_to_cpu(a->length) - (offsetof(ATTR_RECORD,
5095				compressed_size) + sizeof(a->compressed_size)));
5096		/*
5097		 * Update the name offset and the mapping pairs offset to match
5098		 * the moved data.  If there is no name then set the name
5099		 * offset to the correct position instead of subtracting from a
5100		 * potentially incorrect value.
5101		 */
5102		if (!a->name_length)
5103			a->name_offset = const_cpu_to_le16(offsetof(ATTR_RECORD,
5104					compressed_size));
5105		else
5106			a->name_offset = cpu_to_le16(
5107					le16_to_cpu(a->name_offset) -
5108					sizeof(a->compressed_size));
5109		a->mapping_pairs_offset = cpu_to_le16(
5110				le16_to_cpu(a->mapping_pairs_offset) -
5111				sizeof(a->compressed_size));
5112		/* Set the compression unit to 0. */
5113		a->compression_unit = 0;
5114		lck_spin_lock(&ni->size_lock);
5115		ni->compressed_size = 0;
5116		lck_spin_unlock(&ni->size_lock);
5117		/* Clear the other related fields. */
5118		ni->compression_block_size = 0;
5119		ni->compression_block_clusters =
5120				ni->compression_block_size_shift = 0;
5121		/*
5122		 * Finally shrink the attribute record to reflect the removal
5123		 * of the compressed size.  Note, this cannot fail since we are
5124		 * making the attribute smaller thus by definition there is
5125		 * enough space to do so.
5126		 */
5127		err = ntfs_attr_record_resize(ctx->m, a,
5128				le32_to_cpu(a->length) -
5129				sizeof(a->compressed_size));
5130		if (err)
5131			panic("%s(): err\n", __FUNCTION__);
5132	}
5133	/* Mark both the attribute and the ntfs inode as non-sparse. */
5134	a->flags &= ~ATTR_IS_SPARSE;
5135	NInoClearSparse(ni);
5136	/*
5137	 * If this is the unnamed $DATA attribute, need to clear the sparse
5138	 * flag in the standard information attribute and in the directory
5139	 * entries, too.
5140	 */
5141	if (ni == base_ni) {
5142		ni->file_attributes &= ~FILE_ATTR_SPARSE_FILE;
5143		NInoSetDirtyFileAttributes(ni);
5144	}
5145}
5146
5147/**
5148 * ntfs_attr_instantiate_holes - instantiate the holes in an attribute region
5149 * @ni:		ntfs inode of the attribute whose holes to instantiate
5150 * @start:	start offset in bytes at which to begin instantiating holes
5151 * @end:	end offset in bytes at which to stop instantiating holes
5152 * @new_end:	return the offset at which we stopped instantiating holes
 * @atomic:	if true must complete the entire extension or abort
5154 *
5155 * Scan the runlist (mapping any unmapped fragments as needed) starting at byte
5156 * offset @start into the attribute described by the ntfs inode @ni and
5157 * finishing at byte offset @end and instantiate any sparse regions located
5158 * between @start and @end with real clusters.
5159 *
5160 * Any clusters that are inside the initialized size are zeroed.
5161 *
5162 * If @atomic is true the whole instantiation must be complete so abort on
5163 * errors.  If @atomic is false partial instantiations are acceptable (but we
5164 * still return an error if the instantiation is partial).  In any case we set
5165 * *@new_end to the end of the instantiated range.  Thus the caller has to
5166 * always check *@new_end.  If *@new_end is equal to @end then the whole
5167 * instantiation was complete.  If *@new_end is less than @end the
5168 * instantiation was partial.
5169 *
5170 * Note if @new_end is NULL, then @atomic is set to true as there is no way to
5171 * communicate to the caller that the hole instantiation was partial.
5172 *
5173 * Return 0 on success and errno on error.
5174 *
5175 * Locking: - Caller must hold @ni->lock on the inode for writing.
5176 *	    - The runlist @ni must be unlocked as it is taken for writing.
5177 */
errno_t ntfs_attr_instantiate_holes(ntfs_inode *ni, s64 start, s64 end,
		s64 *new_end, BOOL atomic)
{
	/*
	 * The full implementation below is compiled out (see the second
	 * "#if 0" block); only the sanity checks and the ENOTSUP return are
	 * currently live.
	 */
#if 0
	VCN vcn, end_vcn;
	s64 allocated_size, initialized_size, compressed_size, len;
	ntfs_inode *base_ni;
	ntfs_volume *vol = ni->vol;
	ntfs_rl_element *rl;
	MFT_RECORD *base_m, *m;
	ntfs_attr_search_ctx *ctx;
	ATTR_RECORD *a;
	errno_t err, err2;
	BOOL write_locked;
	ntfs_runlist runlist;
#else
	ntfs_volume *vol = ni->vol;
	errno_t err;
#endif

	err = 0;
	/* We should never be called for non-sparse attributes. */
	if (!NInoSparse(ni))
		panic("%s(): !NInoSparse(ni)\n", __FUNCTION__);
	/* We should never be called for resident attributes. */
	if (!NInoNonResident(ni))
		panic("%s(): !NInoNonResident(ni)\n", __FUNCTION__);
	/* We should only be called for $DATA attributes. */
	if (ni->type != AT_DATA)
		panic("%s(): ni->type != AT_DATA\n", __FUNCTION__);
	/* Sanity check @start and @end. */
	if (start >= end)
		panic("%s(): start >= end\n", __FUNCTION__);
	if (start & vol->cluster_size_mask || end & vol->cluster_size_mask)
		panic("%s(): start & vol->cluster_size_mask || "
				"end & vol->cluster_size_mask\n", __FUNCTION__);
	/* Hole instantiation is not implemented yet; callers get ENOTSUP. */
	err = ENOTSUP;
	return err;
#if 0
	/*
	 * NOTE(review): This disabled block is work in progress and would not
	 * compile as-is: @max_size, @mp_rebuilt and @is_first are referenced
	 * but never declared, @highest_vcn is declared in the loop but never
	 * assigned before it is used, and several "TODO: HERE" markers remain.
	 * Confirm all of these before ever enabling this code.
	 */
	base_ni = ni;
	if (NInoAttr(ni))
		base_ni = ni->base_ni;
	/* Without @new_end there is no way to report a partial instantiation. */
	if (!new_end)
		atomic = TRUE;
	lck_rw_lock_shared(&ni->rl.lock);
	write_locked = FALSE;
	/*
	 * We have to round down @start to the nearest page boundary and we
	 * have to round up @end to the nearest page boundary for the cases
	 * where the cluster size is smaller than the page size.  It makes no
	 * sense to instantiate only part of a page as a later pageout of the
	 * dirty page would cause any sparse clusters inside the page to be
	 * instantiated so we might as well do it now whilst we are
	 * instantiating things.
	 */
	vcn = (start & ~PAGE_MASK_64) >> vol->cluster_size_shift;
	end_vcn = ((end + PAGE_MASK) & ~PAGE_MASK_64) >>
			vol->cluster_size_shift;
	/* Cache the sizes for the attribute so we take the size lock once. */
	lck_spin_lock(&ni->size_lock);
	allocated_size = ni->allocated_size;
	initialized_size = ni->initialized_size;
	compressed_size = ni->compressed_size;
	lck_spin_unlock(&ni->size_lock);
	/*
	 * We have to make sure that we stay within the existing allocated
	 * size when instantiating holes as it would corrupt the attribute if
	 * we were to extend the runlist beyond the allocated size.  And our
	 * rounding up of @end above could have caused us to go above the
	 * allocated size so fix this up now.
	 */
	if (end_vcn > allocated_size >> vol->cluster_size_shift)
		end_vcn = allocated_size >> vol->cluster_size_shift;
retry_remap:
	rl = ni->rl.rl;
	if (!ni->rl.elements || vcn < rl->vcn || !rl->length) {
map_vcn:
		/* Mapping modifies the runlist, so upgrade to the write lock. */
		if (!write_locked) {
			write_locked = TRUE;
			if (!lck_rw_lock_shared_to_exclusive(&ni->rl.lock)) {
				/* Upgrade failed, lock dropped; must retry. */
				lck_rw_lock_exclusive(&ni->rl.lock);
				goto retry_remap;
			}
		}
		/* Need to map the runlist fragment containing @vcn. */
		err = ntfs_map_runlist_nolock(ni, vcn, NULL);
		if (err) {
			ntfs_error(vol->mp, "Failed to map runlist fragment "
					"(error %d).", err);
			if (err == EINVAL)
				err = EIO;
			goto err;
		}
		rl = ni->rl.rl;
		if (!ni->rl.elements || vcn < rl->vcn || !rl->length)
			panic("%s(): !ni->rl.elements || vcn < rl[0].vcn || "
					"!rl->length\n", __FUNCTION__);
	}
	do {
		VCN lowest_vcn, highest_vcn, stop_vcn;
		ntfs_rl_element *rl2;
		unsigned mp_size, mp_ofs;

		/* Seek to the runlist element containing @vcn. */
		while (rl->length && vcn >= rl[1].vcn)
			rl++;
		/*
		 * Seek to the first sparse run or to the end of the region we
		 * are interested in.
		 */
		while (rl->length && rl->lcn >= 0 && vcn < end_vcn) {
			rl++;
			vcn = rl->vcn;
		}
		/*
		 * If there are no sparse runs (left) in the region of interest
		 * we are done.
		 */
		if (vcn >= end_vcn) {
			vcn = end_vcn;
			break;
		}
		/*
		 * If this run is not mapped map it now and start again as the
		 * runlist will have been updated.
		 */
		if (rl->lcn == LCN_RL_NOT_MAPPED)
			goto map_vcn;
		/* If this run is not valid abort with an error. */
		if (!rl->length || rl->lcn < LCN_HOLE) {
			ntfs_error(vol->mp, "Runlist is corrupt.  Unmount and "
					"run chkdsk.");
			NVolSetErrors(vol);
			err = EIO;
			goto err;
		}
		/*
		 * This run is sparse thus we need to instantiate it for which
		 * we need to hold the runlist lock for writing.
		 */
		if (!write_locked) {
			write_locked = TRUE;
			if (!lck_rw_lock_shared_to_exclusive(&ni->rl.lock)) {
				lck_rw_lock_exclusive(&ni->rl.lock);
				goto retry_remap;
			}
		}
		/*
		 * Make sure that we do not instantiate past @end_vcn as would
		 * otherwise happen when the hole goes past @end_vcn.
		 */
		len = rl[1].vcn - vcn;
		if (rl[1].vcn > end_vcn)
			len = end_vcn - vcn;
// TODO: HERE:
		/*
		 * If the entire run lies outside the initialized size we do
		 * not need to do anything other than instantiating the hole
		 * with real clusters.
		 *
		 * If part of the run (or the whole run) lies inside the
		 * initialized size we need to zero the clusters in memory and
		 * mark the pages dirty so they get written out later in
		 * addition to instantiating the hole with real clusters.
		 *
		 * The need for zeroing causes two potential problems.  The
		 * first problem is that if the run being instantiated is very
		 * large we could run out of memory due to us holding both the
		 * inode lock and the runlist lock for writing so all the dirty
		 * pages we create/release back to the VM cannot be paged out
		 * until we release the locks and the second problem is that if
		 * the cluster size is less than the page size we can encounter
		 * partially sparse pages and if they are not already cached by
		 * the VM we have to page them in.  But to do so we have to not
		 * hold the runlist lock for writing.  We have two ways out of
		 * this situation.  Either we have to drop and re-acquire the
		 * runlist lock around paging in such pages (with restarting
		 * everything each time because we had dropped the lock) or we
		 * have to read the non-sparse clusters in by hand using an
		 * enhanced ntfs_rl_read() or even by calling buf_meta_bread()
		 * directly.
		 *
		 * FIXME: We ignore the first problem for now until the code is
		 * working and we can test it.  The solution is probably to
		 * break the work into chunks of a fixed size and the allocate
		 * only enough clusters to complete the current chunk then
		 * merge that with the runlist, dirty all corresponding pages,
		 * then drop the locks to allow the pages to be written if
		 * needed and then take the locks again and start again with
		 * the next chunk.  This does have one nasty side effect and
		 * that is that whilst the locks are dropped a concurrent
		 * process could do nasty things to the inode including
		 * truncate our carefully allocated pages by shrinking the file
		 * so a lot of sanity checking after re-taking the locks will
		 * be needed.  Alternatively perhaps we need to hold the inode
		 * lock shared throughout this function so dropping the
		 * runlist lock would be sufficient.  We do not actually need
		 * the inode lock for writing in this function as we do not
		 * modify any of the inode sizes and the runlist lock will
		 * protect us sufficiently from everything.
		 *
		 * FIXME: We also ignore the second problem for now and abort
		 * if it bites us, again until the code is working and we can
		 * test it.
		 */
		/*
		 * Seek back to the last real LCN so we can try and extend the
		 * hole at that LCN so the instantiated clusters are at least
		 * in close proximity to the other data in the attribute.
		 */
		rl2 = rl;
		while (rl2->lcn < 0 && rl2 > ni->rl.rl)
			rl2--;
		runlist.rl = NULL;
		runlist.alloc = runlist.elements = 0;
		err = ntfs_cluster_alloc(vol, vcn, len,
				(rl2->lcn >= 0) ? rl2->lcn + rl2->length : -1,
				DATA_ZONE, FALSE, &runlist);
		if (err) {
			if (err != ENOSPC)
				ntfs_error(vol->mp, "Failed to allocate "
						"clusters (error %d).", err);
			goto err;
		}
// TODO: HERE:
		/*
		 * If the instantiated hole starts before the initialized size
		 * we need to zero it.
		 *
		 * FIXME: For now we do it in the most stupid way possible and
		 * that is to synchronously write zeroes to disk via the device
		 * hosting the volume.  That way we get around our issues and
		 * problems with the UBC and small/large cluster sizes.  This
		 * way if there is dirty data in the UBC it will still get
		 * written on top of the zeroing we are now doing.  Ordering is
		 * guaranteed as no-one knows about the allocated clusters yet
		 * as we have not merged the runlists yet.
		 *
		 * FIXME: TODO: It may be worth restricting ntfs_rl_set() to
		 * only operate up to the initialized size as it could
		 * otherwise do a lot of unneeded extra work.
		 */
		if (vcn << vol->cluster_size_shift < initialized_size) {
			ntfs_debug("Zeroing instantiated hole inside the "
					"initialized size.");
			if (!runlist.elements || !runlist.alloc)
				panic("%s(): !runlist.elements || "
						"!runlist.alloc\n",
						__FUNCTION__);
			err = ntfs_rl_set(vol, runlist.rl, 0);
			if (err) {
				ntfs_error(vol->mp, "Failed to zero newly "
						"allocated space (error %d).",
						err);
				goto undo_alloc;
			}
		}
		err = ntfs_rl_merge(&ni->rl, &runlist);
		if (err) {
			ntfs_error(vol->mp, "Failed to merge runlists (error "
					"%d).", err);
			goto undo_alloc;
		}
		/*
		 * The runlist may have been reallocated so @rl needs to be
		 * reset back to the beginning.
		 */
		rl = ni->rl.rl;
		/*
		 * Need to update the mapping pairs array of the attribute.  We
		 * cannot postpone this till the end (which would be much more
		 * efficient) because we could run out of space on the volume
		 * when trying to update the mapping pairs array and then we
		 * would not be able to roll back to the previous state because
		 * we would not know which bits of the runlist are new and
		 * which are old.  Doing it now means that if we get an error
		 * we still know the starting and ending VCNs of the run we
		 * instantiated so we can punch the clusters out again thus
		 * restoring the original hole.
		 */
		err = ntfs_mft_record_map(base_ni, &base_m);
		if (err) {
			ntfs_error(vol->mp, "Failed to map mft_no 0x%llx "
					"(error %d).",
					(unsigned long long)base_ni->mft_no,
					err);
			goto undo_merge;
		}
		ctx = ntfs_attr_search_ctx_get(base_ni, base_m);
		if (!ctx) {
			ntfs_error(vol->mp, "Failed to allocate attribute "
					"search context.");
			err = ENOMEM;
			goto unm_err;
		}
		/*
		 * Get the base attribute record so we can update the
		 * compressed size or so we can switch the attribute to not be
		 * sparse any more if we just filled the last hole.
		 */
		err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, 0,
				NULL, 0, ctx);
		if (err) {
			ntfs_error(vol->mp, "Failed to lookup base attribute "
					"extent in mft_no 0x%llx (error %d).",
					(unsigned long long)base_ni->mft_no,
					err);
			goto put_err;
		}
		m = ctx->m;
		a = ctx->a;
		/*
		 * We added @len clusters thus the compressed size grows by
		 * that many clusters whilst the allocated size does not change
		 * as we have not extended the attribute.
		 */
		compressed_size += len << vol->cluster_size_shift;
		/*
		 * Determine whether the attribute is still sparse by comparing
		 * the new compressed size to the allocated size.  If the two
		 * have now become the same the attribute is no longer sparse.
		 */
		if (compressed_size >= allocated_size) {
			if (compressed_size != allocated_size)
				panic("%s(): compressed_size != "
						"allocated_size\n",
						__FUNCTION__);
			/* Switch the attribute to not be sparse any more. */
			ntfs_attr_sparse_clear(base_ni, ni, ctx);
		}
		/*
		 * If the attribute is (still) sparse or compressed, need to
		 * update the compressed size.
		 */
		if (NInoSparse(ni) || NInoCompressed(ni)) {
			lck_spin_lock(&ni->size_lock);
			ni->compressed_size = compressed_size;
			a->compressed_size = cpu_to_sle64(compressed_size);
			lck_spin_unlock(&ni->size_lock);
		}
		/*
		 * If this is the unnamed $DATA attribute also need to update
		 * the sizes in the directory entries pointing to this inode.
		 */
		if (ni == base_ni)
			NInoSetDirtySizes(ni);
		/*
		 * If the VCN we started allocating at is not in the base
		 * attribute record get the attribute record containing it so
		 * we can update the mapping pairs array.
		 */
		if (vcn > sle64_to_cpu(a->highest_vcn)) {
			/* Ensure the modified mft record is written out. */
			NInoSetMrecNeedsDirtying(ctx->ni);
			err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
					vcn, NULL, 0, ctx);
			if (err) {
				ntfs_error(vol->mp, "Failed to lookup "
						"attribute extent in mft_no "
						"0x%llx (error %d).",
						(unsigned long long)
						base_ni->mft_no, err);
				a = NULL;
				goto undo_sparse;
			}
			a = ctx->a;
		}
		/*
		 * Get the size for the new mapping pairs array for this
		 * attribute extent.
		 */
		lowest_vcn = sle64_to_cpu(a->lowest_vcn);
		/*
		 * Get the runlist element containing the lowest vcn.
		 *
		 * This cannot fail as we know the runlist is ok and the
		 * runlist fragment containing the lowest vcn is mapped.
		 */
		rl2 = ntfs_rl_find_vcn_nolock(rl, lowest_vcn);
		if (!rl2)
			panic("%s(): Memory corruption detected.\n",
					__FUNCTION__);
		/*
		 * NOTE(review): @highest_vcn has not been assigned at this
		 * point — it must be set (presumably to the end of the
		 * instantiated run) before this code is ever enabled.
		 */
		err = ntfs_get_size_for_mapping_pairs(vol, rl2, lowest_vcn,
				highest_vcn, &mp_size);
		if (err) {
			ntfs_error(vol->mp, "Failed to get size for mapping "
					"pairs array (error %d).", err);
			goto undo_sparse;
		}
		mp_ofs = le16_to_cpu(a->mapping_pairs_offset);
retry_attr_rec_resize:
		/*
		 * Extend the attribute record to fit the bigger mapping pairs
		 * array.
		 */
		err = ntfs_attr_record_resize(m, a, mp_size + mp_ofs);
		if (!err)
			goto build_mpa;
		if (err != ENOSPC)
			panic("%s(): err != ENOSPC\n", __FUNCTION__);
		/*
		 * There is not enough space in the mft record.
		 *
		 * We need to add an attribute list attribute if it is not
		 * already present.
		 */
		if (!NInoAttrList(base_ni)) {
			err = ntfs_attr_list_add(base_ni, base_m, ctx);
			if (err || ctx->is_error) {
				if (!err)
					err = ctx->error;
				ntfs_error(vol->mp, "Failed to %s mft_no "
						"0x%llx (error %d).",
						ctx->is_error ?
						"remap extent mft record of" :
						"add attribute list attribute "
						"to", (unsigned long long)
						base_ni->mft_no, err);
				goto undo1;
			}
			/*
			 * The attribute location will have changed so update
			 * it from the search context.
			 */
			m = ctx->m;
			a = ctx->a;
			/*
			 * Retry the original attribute record resize as we may
			 * now have enough space to create the needed mapping
			 * pairs array in the moved attribute record.
			 *
			 * This can for example happen when the attribute was
			 * moved out to an extent mft record which has much
			 * more free space than the base mft record had.
			 */
			goto retry_attr_rec_resize;
		}
		/*
		 * If this is not the only attribute record in the mft record
		 * then move it out to a new extent mft record which will allow
		 * the attribute record to grow larger thus reducing the total
		 * number of extent attribute records needed to a minimum.
		 */
		if (!ntfs_attr_record_is_only_one(m, a)) {
			lck_rw_lock_shared(&base_ni->attr_list_rl.lock);
			err = ntfs_attr_record_move(ctx);
			lck_rw_unlock_shared(&base_ni->attr_list_rl.lock);
			if (err) {
				ntfs_error(vol->mp, "Failed to move attribute "
						"extent from mft record "
						"0x%llx to an extent mft "
						"record (error %d).",
						(unsigned long long)
						ctx->ni->mft_no, err);
				/*
				 * We could try to remove the attribute list
				 * attribute if we added it above but this
				 * would probably require attributes to be
				 * moved back into the base mft record from
				 * extent mft records so is a lot of work and
				 * given we are in an error code path and given
				 * that it is ok to just leave the inode with
				 * an attribute list attribute we do not bother
				 * and just bail out.
				 */
				goto undo1;
			}
			/*
			 * The attribute location will have changed so update
			 * it from the search context.
			 */
			m = ctx->m;
			a = ctx->a;
			/*
			 * Retry the original attribute record resize as we may
			 * now have enough space to create the mapping pairs
			 * array in the moved attribute record.
			 */
			goto retry_attr_rec_resize;
		}
		/* NOTE(review): @max_size is never declared — incomplete code. */
		max_size = (le32_to_cpu(m->bytes_allocated) -
				le32_to_cpu(m->bytes_in_use)) & ~7;
		max_size += le32_to_cpu(a->length) - mp_ofs;
		err = ntfs_attr_record_resize(m, a, max_size + mp_ofs);
		/*
		 * We worked out the exact size we can extend to so the resize
		 * cannot fail.
		 */
		if (err)
			panic("%s(): err (ntfs_attr_record_resize())\n",
					__FUNCTION__);
build_mpa:
// TODO: HERE...
		/* NOTE(review): @mp_rebuilt is never declared — incomplete code. */
		mp_rebuilt = TRUE;
		/*
		 * Generate the mapping pairs array directly into the attribute
		 * record.
		 *
		 * This cannot fail as we have already checked the size we need
		 * to build the mapping pairs array.
		 */
		err = ntfs_mapping_pairs_build(vol, (s8*)a + mp_ofs,
				le32_to_cpu(a->length) - mp_ofs, rl2,
				lowest_vcn, highest_vcn, &stop_vcn);
		if (err && err != ENOSPC) {
			ntfs_error(vol->mp, "Cannot fill hole of mft_no "
					"0x%llx, attribute type 0x%x, because "
					"building the mapping pairs array "
					"failed (error %d).",
					(unsigned long long)ni->mft_no,
					(unsigned)le32_to_cpu(ni->type), err);
			err = EIO;
			/*
			 * Need to set @a->highest_vcn to enable correct error
			 * recovery.
			 */
// TODO: HERE...
			/* NOTE(review): @is_first is never declared — incomplete code. */
			if (!is_first)
				a->highest_vcn = cpu_to_sle64(sle64_to_cpu(
						a->lowest_vcn) - 1);
			goto undo;
		}
		/* Update the highest_vcn. */
		a->highest_vcn = cpu_to_sle64(stop_vcn - 1);
		/* Ensure the modified mft record is written out. */
		NInoSetMrecNeedsDirtying(ctx->ni);
		/*
		 * If the mapping pairs build succeeded, i.e. the current
		 * attribute extent contains the whole runlist fragment, we are
		 * done and can proceed to the next run.
		 */
		if (!err)
			goto next_run;
		/*
		 * Partial mapping pairs update.  This means we need to create
		 * one or more new attribute extents to hold the remainder of
		 * the mapping pairs.
		 *
		 * Get the size of the remaining mapping pairs array.
		 */
		rl2 = ntfs_rl_find_vcn_nolock(rl2, stop_vcn);
		if (!rl2)
			panic("%s(): !rl2 (stop_vcn)\n", __FUNCTION__);
		if (!rl2->length)
			panic("%s(): !rl2->length (stop_vcn)\n", __FUNCTION__);
		if (rl2->lcn < LCN_HOLE)
			panic("%s(): rl2->lcn < LCN_HOLE (stop_vcn)\n",
					__FUNCTION__);
		err = ntfs_get_size_for_mapping_pairs(vol, rl2, stop_vcn,
				highest_vcn, &mp_size);
		if (err) {
			ntfs_error(vol->mp, "Cannot complete filling of hole "
					"of mft_no 0x%llx, attribute type "
					"0x%x, because determining the size "
					"for the mapping pairs failed (error "
					"%d).", (unsigned long long)ni->mft_no,
					(unsigned)le32_to_cpu(ni->type), err);
			err = EIO;
// TODO: HERE...
			goto undo;
		}
		/* We only release extent mft records. */
		if (ctx->ni != base_ni)
			ntfs_extent_mft_record_unmap(ctx->ni);
// TODO: I AM HERE...  Need to allocate an extent mft record, add an extent
// attribute record to it filling it with remaining mapping pairs array fragment
// and creating an attribute list attribute entry for it.  Then if still not
// reached highest_vcn, need to repeat the process again.
next_run:
		ntfs_attr_search_ctx_put(ctx);
		ntfs_mft_record_unmap(base_ni);
		/*
		 * If the attribute is no longer sparse there are no more holes
		 * to instantiate thus we are done with the whole region of
		 * interest.
		 */
		if (!NInoSparse(ni)) {
			vcn = end_vcn;
			break;
		}
		/*
		 * We allocated @len clusters starting at @vcn.  Thus the next
		 * VCN we need to look at is at @vcn + @len.
		 */
		vcn += len;
	} while (vcn < end_vcn);
	if (vcn > end_vcn)
		panic("%s(): vcn > end_vcn\n", __FUNCTION__);
	ntfs_debug("Done, new_end 0x%llx.",
			(unsigned long long)vcn << vol->cluster_size_shift);
err:
	/* Report how far we got whether we succeeded or not. */
	if (new_end)
		*new_end = vcn << vol->cluster_size_shift;
	if (write_locked)
		lck_rw_unlock_exclusive(&ni->rl.lock);
	else
		lck_rw_unlock_shared(&ni->rl.lock);
	return err;
undo_alloc:
	err2 = ntfs_cluster_free_from_rl(vol, runlist.rl, 0, -1, NULL);
	if (err2) {
		ntfs_error(vol->mp, "Failed to release allocated cluster(s) "
				"in error code path (error %d).  Run chkdsk "
				"to recover the lost space.", err2);
		NVolSetErrors(vol);
	}
	OSFree(runlist.rl, runlist.alloc, ntfs_malloc_tag);
	goto err;
undo_sparse:
	/*
	 * If looking up an attribute extent failed or we are not in the base
	 * attribute record need to look up the base attribute record.
	 */
	if (!a || a->lowest_vcn) {
		ntfs_attr_search_ctx_reinit(ctx);
		err2 = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, 0,
				NULL, 0, ctx);
		if (err2) {
			ntfs_error(vol->mp, "Failed to re-lookup base "
					"attribute record in error code path "
					"(error %d).  Leaving inconsistent "
					"metadata.  Unmount and run chkdsk.",
					err2);
			NVolSetErrors(vol);
			goto put_err;
		}
		a = ctx->a;
	}
	/*
	 * If we caused the attribute to no longer be sparse we need to make it
	 * sparse again.
	 */
	if (!NInoSparse(ni)) {
		err2 = ntfs_attr_sparse_set(base_ni, ni, ctx);
		if (err2) {
			ntfs_error(vol->mp, "Failed to re-set the attribute "
					"to be sparse in error code path "
					"(error %d).  Leaving inconsistent "
					"metadata.  Unmount and run chkdsk.",
					err2);
			NVolSetErrors(vol);
			goto put_err;
		}
		/*
		 * The attribute may have been moved to make space for the
		 * compressed size so @a is now invalid.
		 */
		a = ctx->a;
	}
	/* Restore the compressed size to the old value. */
	compressed_size -= len << vol->cluster_size_shift;
	lck_spin_lock(&ni->size_lock);
	ni->compressed_size = compressed_size;
	a->compressed_size = cpu_to_sle64(compressed_size);
	lck_spin_unlock(&ni->size_lock);
	/* Ensure the modified mft record is written out. */
	NInoSetMrecNeedsDirtying(ctx->ni);
	if (ni == base_ni)
		NInoSetDirtySizes(ni);
put_err:
	ntfs_attr_search_ctx_put(ctx);
unm_err:
	ntfs_mft_record_unmap(base_ni);
undo_merge:
	/* Free the clusters we allocated. */
	err2 = ntfs_cluster_free_from_rl(vol, rl, vcn, len, NULL);
	if (err2) {
		ntfs_error(vol->mp, "Failed to release allocated cluster(s) "
				"in error code path (error %d).  Unmount and "
				"run chkdsk to recover the lost space.", err2);
		NVolSetErrors(vol);
	}
	/* Punch the original hole back into the runlist. */
	err2 = ntfs_rl_punch_nolock(vol, &ni->rl, vcn, len);
	if (err2) {
		/*
		 * NOTE(review): the message below repeats "in error code
		 * path" twice — fix the string when this code is enabled.
		 */
		ntfs_error(vol->mp, "Failed to restore hole in error code "
				"path in error code path (error %d).  Leaving "
				"inconsistent metadata.  Unmount and run "
				"chkdsk.", err2);
		NVolSetErrors(vol);
	}
	goto err;
undo1:
	panic("%s(): TODO\n", __FUNCTION__);
	return err;
#endif
}
5865
5866/**
5867 * ntfs_attr_extend_allocation - extend the allocated space of an attribute
5868 * @ni:			ntfs inode of the attribute whose allocation to extend
5869 * @new_alloc_size:	new size in bytes to which to extend the allocation to
5870 * @new_data_size:	new size in bytes to which to extend the data to
5871 * @data_start:		beginning of region which is required to be non-sparse
5872 * @ictx:		index context
5873 * @dst_alloc_size:	if not NULL, this pointer is set to the allocated size
5874 * @atomic:		if true must complete the entire extension or abort
5875 *
5876 * Extend the allocated space of an attribute described by the ntfs inode @ni
5877 * to @new_alloc_size bytes.  If @data_start is -1, the whole extension may be
5878 * implemented as a hole in the file (as long as both the volume and the ntfs
5879 * inode @ni have sparse support enabled).  If @data_start is >= 0, then the
5880 * region between the old allocated size and @data_start - 1 may be made sparse
5881 * but the regions between @data_start and @new_alloc_size must be backed by
5882 * actual clusters.
5883 *
5884 * If @new_data_size is -1, it is ignored.  If it is >= 0, then the data size
5885 * of the attribute is extended to @new_data_size and the UBC size of the VFS
5886 * vnode is updated to match.
5887 * WARNING: It is a bug for @new_data_size to be smaller than the old data size
5888 * as well as for @new_data_size to be greater than @new_alloc_size.
5889 *
5890 * If @ictx is not NULL, the extension is for an index allocation or bitmap
5891 * attribute extension.  In this case, if there is not enough space in the mft
5892 * record for the extended index allocation/bitmap attribute, the index root is
5893 * moved to an index block if it is not empty to create more space in the mft
5894 * record.  NOTE: At present @ictx is only set when the attribute being resized
5895 * is non-resident.
5896 *
5897 * If @atomic is true only return success if the entire extension is complete.
5898 * If only a partial extension is possible abort with an appropriate error.  If
5899 * @atomic is false partial extensions are acceptable in certain circumstances
5900 * (see below).
5901 *
5902 * For resident attributes extending the allocation involves resizing the
5903 * attribute record and if necessary moving it and/or other attributes into
5904 * extent mft records and/or converting the attribute to a non-resident
5905 * attribute which in turn involves extending the allocation of a non-resident
5906 * attribute as described below.
5907 *
5908 * For non-resident attributes this involves allocating clusters in the data
5909 * zone on the volume (except for regions that are being made sparse) and
5910 * extending the run list to describe the allocated clusters as well as
5911 * updating the mapping pairs array of the attribute.  This in turn involves
5912 * resizing the attribute record and if necessary moving it and/or other
5913 * attributes into extent mft records and/or splitting the attribute record
5914 * into multiple extent attribute records.
5915 *
5916 * Also, the attribute list attribute is updated if present and in some of the
5917 * above cases (the ones where extent mft records/attributes come into play),
5918 * an attribute list attribute is created if not already present.
5919 *
5920 * Return 0 on success and errno on error.
5921 *
5922 * In the case that an error is encountered but a partial extension at least up
5923 * to @data_start (if present) is possible, the allocation is partially
5924 * extended and success is returned.  If @data_start is -1 then partial
5925 * allocations are not performed.
5926 *
5927 * If @dst_alloc_size is not NULL, then *@dst_alloc_size is set to the new
5928 * allocated size when the ntfs_attr_extend_allocation() returns success.  If
5929 * an error is returned *@dst_alloc_size is undefined.  This is useful so that
5930 * the caller has a simple way of checking whether or not the allocation was
5931 * partial.
5932 *
5933 * Thus if @data_start is not -1 the caller should supply @dst_alloc_size and
5934 * then compare *@dst_alloc_size to @new_alloc_size to determine if the
5935 * allocation was partial.  And if @data_start is -1 there is no point in
5936 * supplying @dst_alloc_size as *@dst_alloc_size will always be equal to
5937 * @new_alloc_size.
5938 *
5939 * Locking: - Caller must hold @ni->lock on the inode for writing.
5940 *	    - The runlist @ni must be unlocked as it is taken for writing.
5941 */
5942errno_t ntfs_attr_extend_allocation(ntfs_inode *ni, s64 new_alloc_size,
5943		const s64 new_data_size, const s64 data_start,
5944		ntfs_index_context *ictx, s64 *dst_alloc_size,
5945		const BOOL atomic)
5946{
5947	VCN vcn, lowest_vcn, stop_vcn;
5948	s64 start, ll, old_alloc_size, alloc_size, alloc_start, alloc_end;
5949	s64 nr_allocated, nr_freed;
5950	ntfs_volume *vol = ni->vol;
5951	ntfs_inode *base_ni;
5952	MFT_RECORD *base_m, *m;
5953	ATTR_RECORD *a;
5954	ntfs_attr_search_ctx *actx;
5955	ntfs_rl_element *rl;
5956	unsigned attr_len, arec_size, name_size, mp_size, mp_ofs, max_size;
5957	unsigned al_entry_len, new_al_alloc;
5958	errno_t err, err2;
5959	BOOL is_sparse, is_first, mp_rebuilt, al_entry_added;
5960	ntfs_runlist runlist;
5961
5962	start = data_start;
5963#ifdef DEBUG
5964	lck_spin_lock(&ni->size_lock);
5965	old_alloc_size = ni->allocated_size;
5966	lck_spin_unlock(&ni->size_lock);
5967	ntfs_debug("Entering for mft_no 0x%llx, attribute type 0x%x, "
5968			"old_allocated_size 0x%llx, "
5969			"new_allocated_size 0x%llx, new_data_size 0x%llx, "
5970			"data_start 0x%llx.", (unsigned long long)ni->mft_no,
5971			(unsigned)le32_to_cpu(ni->type),
5972			(unsigned long long)old_alloc_size,
5973			(unsigned long long)new_alloc_size,
5974			(unsigned long long)new_data_size,
5975			(unsigned long long)start);
5976#endif
5977	/* This cannot be called for the attribute list attribute. */
5978	if (ni->type == AT_ATTRIBUTE_LIST)
5979		panic("%s(): ni->type == AT_ATTRIBUTE_LIST\n", __FUNCTION__);
5980	name_size = ni->name_len * sizeof(ntfschar);
5981	base_ni = ni;
5982	if (NInoAttr(ni))
5983		base_ni = ni->base_ni;
5984	is_first = TRUE;
5985retry_extend:
5986	/*
5987	 * For non-resident attributes, @start and @new_size need to be aligned
5988	 * to cluster boundaries for allocation purposes.
5989	 */
5990	if (NInoNonResident(ni)) {
5991		if (start > 0)
5992			start &= ~(s64)vol->cluster_size_mask;
5993		new_alloc_size = (new_alloc_size + vol->cluster_size - 1) &
5994				~(s64)vol->cluster_size_mask;
5995	}
5996	if (new_data_size >= 0 && new_data_size > new_alloc_size)
5997		panic("%s(): new_data_size >= 0 && new_data_size > "
5998				"new_alloc_size\n", __FUNCTION__);
5999	/* Check if new size is allowed in $AttrDef. */
6000	err = ntfs_attr_size_bounds_check(vol, ni->type, new_alloc_size);
6001	if (err) {
6002		/* Only emit errors when the write will fail completely. */
6003		lck_spin_lock(&ni->size_lock);
6004		old_alloc_size = ni->allocated_size;
6005		lck_spin_unlock(&ni->size_lock);
6006		if (start < 0 || start >= old_alloc_size) {
6007			if (err == ERANGE) {
6008				ntfs_error(vol->mp, "Cannot extend allocation "
6009						"of mft_no 0x%llx, attribute "
6010						"type 0x%x, because the new "
6011						"allocation would exceed the "
6012						"maximum allowed size for "
6013						"this attribute type.",
6014						(unsigned long long)ni->mft_no,
6015						(unsigned)
6016						le32_to_cpu(ni->type));
6017			} else {
6018				ntfs_error(vol->mp, "Cannot extend allocation "
6019						"of mft_no 0x%llx, attribute "
6020						"type 0x%x, because this "
6021						"attribute type is not "
6022						"defined on the NTFS volume.  "
6023						"Possible corruption!  You "
6024						"should run chkdsk!",
6025						(unsigned long long)ni->mft_no,
6026						(unsigned)
6027						le32_to_cpu(ni->type));
6028			}
6029		}
6030		/* Translate error code to be POSIX conformant for write(2). */
6031		if (err == ERANGE)
6032			err = EFBIG;
6033		else
6034			err = EIO;
6035		return err;
6036	}
6037	/*
6038	 * We will be modifying both the runlist (if non-resident) and the mft
6039	 * record so lock them both down.
6040	 */
6041	lck_rw_lock_exclusive(&ni->rl.lock);
6042	err = ntfs_mft_record_map(base_ni, &base_m);
6043	if (err) {
6044		base_m = NULL;
6045		actx = NULL;
6046		goto err_out;
6047	}
6048	actx = ntfs_attr_search_ctx_get(base_ni, base_m);
6049	if (!actx) {
6050		err = ENOMEM;
6051		goto err_out;
6052	}
6053	lck_spin_lock(&ni->size_lock);
6054	alloc_size = ni->allocated_size;
6055	lck_spin_unlock(&ni->size_lock);
6056	/*
6057	 * If non-resident, seek to the last extent.  If resident, there is
6058	 * only one extent, so seek to that.
6059	 */
6060	vcn = (NInoNonResident(ni) && alloc_size > 0) ?
6061			(alloc_size - 1) >> vol->cluster_size_shift : 0;
6062	/*
6063	 * Abort if someone did the work whilst we waited for the locks.  If we
6064	 * just converted the attribute from resident to non-resident it is
6065	 * likely that exactly this has happened already.  We cannot quite
6066	 * abort if we need to update the data size.
6067	 */
6068	if (new_alloc_size <= alloc_size) {
6069		ntfs_debug("Allocated size already exceeds requested size.");
6070		new_alloc_size = alloc_size;
6071		if (new_data_size < 0)
6072			goto done;
6073		/*
6074		 * We want the first attribute extent so that we can update the
6075		 * data size.
6076		 */
6077		vcn = 0;
6078	}
6079	err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, vcn, NULL, 0,
6080			actx);
6081	if (err) {
6082		if (err == ENOENT)
6083			err = EIO;
6084		goto err_out;
6085	}
6086	m = actx->m;
6087	a = actx->a;
6088	/* Use goto to reduce indentation. */
6089	if (a->non_resident)
6090		goto do_non_resident_extend;
6091	if (NInoNonResident(ni))
6092		panic("%s(): NInoNonResident(ni)\n", __FUNCTION__);
6093	/*
6094	 * As things are now this function should never be called with an index
6095	 * context for the resize of a resident attribute.
6096	 */
6097	if (ictx)
6098		panic("%s(): ictx\n", __FUNCTION__);
6099	/* The total length of the attribute value. */
6100	attr_len = le32_to_cpu(a->value_length);
6101	/*
6102	 * Extend the attribute record to be able to store the new attribute
6103	 * size.  ntfs_attr_record_resize() will not do anything if the size is
6104	 * not changing.
6105	 */
6106	arec_size = (le16_to_cpu(a->value_offset) + new_alloc_size + 7) & ~7;
6107	if (arec_size < le32_to_cpu(m->bytes_allocated) -
6108			le32_to_cpu(m->bytes_in_use) &&
6109			!ntfs_attr_record_resize(m, a, arec_size)) {
6110		/* The resize succeeded! */
6111		if (new_data_size > attr_len) {
6112			if (!ubc_setsize(ni->vn, new_data_size)) {
6113				ntfs_error(vol->mp, "Failed to set size in "
6114						"UBC.");
6115				/*
6116				 * This cannot fail as it is a shrinking
6117				 * resize.
6118				 */
6119				lck_spin_lock(&ni->size_lock);
6120				err = ntfs_attr_record_resize(m, a,
6121						le16_to_cpu(a->value_offset) +
6122						ni->allocated_size);
6123				lck_spin_unlock(&ni->size_lock);
6124				if (err)
6125					panic("%s(): Failed to shrink "
6126							"resident attribute "
6127							"record (error %d)\n",
6128							__FUNCTION__, err);
6129				err = EIO;
6130				goto err_out;
6131			}
6132			/* Zero the extended attribute value. */
6133			bzero((u8*)a + le16_to_cpu(a->value_offset) + attr_len,
6134					(u32)new_data_size - attr_len);
6135			lck_spin_lock(&ni->size_lock);
6136			ni->initialized_size = ni->data_size = new_data_size;
6137			a->value_length = cpu_to_le32((u32)new_data_size);
6138		} else
6139			lck_spin_lock(&ni->size_lock);
6140		ni->allocated_size = le32_to_cpu(a->length) -
6141				le16_to_cpu(a->value_offset);
6142		lck_spin_unlock(&ni->size_lock);
6143		if (new_data_size > attr_len)
6144			a->value_length = cpu_to_le32((u32)new_data_size);
6145		goto dirty_done;
6146	}
6147	/*
6148	 * We have to drop all the locks so we can call
6149	 * ntfs_attr_make_non_resident().
6150	 */
6151	ntfs_attr_search_ctx_put(actx);
6152	ntfs_mft_record_unmap(base_ni);
6153	lck_rw_unlock_exclusive(&ni->rl.lock);
6154	/*
6155	 * Not enough space in the mft record, try to make the attribute
6156	 * non-resident and if successful restart the extension process.
6157	 */
6158	err = ntfs_attr_make_non_resident(ni);
6159	if (!err)
6160		goto retry_extend;
6161	/*
6162	 * Could not make non-resident.  If this is due to this not being
6163	 * permitted for this attribute type try to make other attributes
6164	 * non-resident and/or move this or other attributes out of the mft
6165	 * record this attribute is in.  Otherwise fail.
6166	 */
6167	if (err != EPERM) {
6168		if (err != ENOSPC) {
6169			/*
6170			 * Only emit errors when the write will fail
6171			 * completely.
6172			 */
6173			lck_spin_lock(&ni->size_lock);
6174			old_alloc_size = ni->allocated_size;
6175			lck_spin_unlock(&ni->size_lock);
6176			if (start < 0 || start >= old_alloc_size)
6177				ntfs_error(vol->mp, "Cannot extend allocation "
6178						"of mft_no 0x%llx, attribute "
6179						"type 0x%x, because the "
6180						"conversion from resident to "
6181						"non-resident attribute "
6182						"failed (error %d).",
6183						(unsigned long long)ni->mft_no,
6184						(unsigned)le32_to_cpu(ni->type),
6185						err);
6186			if (err != ENOMEM) {
6187				NVolSetErrors(vol);
6188				err = EIO;
6189			}
6190		}
6191		goto conv_err_out;
6192	}
6193	/*
6194	 * To make space in the mft record we would like to try to make other
6195	 * attributes non-resident if that would save space.
6196	 *
6197	 * FIXME: We cannot do this at present unless the attribute is the
6198	 * attribute being resized as there could be an ntfs inode matching
6199	 * this attribute in memory and it would become out of date with its
6200	 * metadata if we touch its attribute record.
6201	 *
6202	 * FIXME: We do not need to do this if this is the attribute being
6203	 * resized as we already tried to make the attribute non-resident and
6204	 * it did not work or we would never have gotten here in the first
6205	 * place.
6206	 *
6207	 * Thus we have to either move other attributes to extent mft records
6208	 * thus making more space in the base mft record or we have to move the
6209	 * attribute being resized to an extent mft record thus giving it more
6210	 * space.  In any case we need to have an attribute list attribute so
6211	 * start by adding it if it does not yet exist.
6212	 *
6213	 * Before we start, we can check whether it is possible to fit the
6214	 * attribute to be resized inside an mft record.  If not then there is
6215	 * no point in proceeding.
6216	 *
6217	 * This should never really happen as the attribute size should never
6218	 * be allowed to grow so much and such requests should never be made by
6219	 * the driver and if they are they should be caught by the call to
6220	 * ntfs_attr_size_bounds_check().
6221	 */
6222	if (arec_size > vol->mft_record_size - sizeof(MFT_RECORD)) {
6223		/* Only emit errors when the write will fail completely. */
6224		lck_spin_lock(&ni->size_lock);
6225		old_alloc_size = ni->allocated_size;
6226		lck_spin_unlock(&ni->size_lock);
6227		if (start < 0 || start >= old_alloc_size)
6228			ntfs_error(vol->mp, "Cannot extend allocation of "
6229					"mft_no 0x%llx, attribute type 0x%x, "
6230					"because the attribute may not be "
6231					"non-resident and the requested size "
6232					"exceeds the maximum possible "
6233					"resident attribute record size.",
6234					(unsigned long long)ni->mft_no,
6235					(unsigned)le32_to_cpu(ni->type));
6236		/* Use POSIX conformant write(2) error code. */
6237		err = EFBIG;
6238		goto conv_err_out;
6239	}
6240	/*
6241	 * The resident attribute can fit in an mft record.  Now have to decide
6242	 * whether to make other attributes non-resident/move other attributes
6243	 * out of the mft record or whether to move the attribute record to be
6244	 * resized out to a new mft record.
6245	 *
6246	 * TODO: We never call ntfs_attr_extend_allocation() for attributes
6247	 * that cannot be non-resident thus we never get here thus we simply
6248	 * panic() here to remind us that we need to implement this code if we
6249	 * ever start calling this function for attributes that must remain
6250	 * resident.
6251	 */
6252	panic("%s(): Attribute may not be non-resident.\n", __FUNCTION__);
6253do_non_resident_extend:
6254	if (!NInoNonResident(ni))
6255		panic("%s(): !NInoNonResident(ni)\n", __FUNCTION__);
6256	if (new_alloc_size == alloc_size) {
6257		if (vcn)
6258			panic("%s(): vcn\n", __FUNCTION__);
6259		goto alloc_done;
6260	}
6261	/*
6262	 * We are going to allocate starting at the old allocated size and are
6263	 * going to allocate up to the new allocated size.
6264	 */
6265	alloc_start = alloc_size;
6266	rl = NULL;
6267	if (ni->rl.elements) {
6268		/* Seek to the end of the runlist. */
6269		rl = &ni->rl.rl[ni->rl.elements - 1];
6270	}
6271	/*
6272	 * Cache the lowest VCN for later.  Need to do it here to silence
6273	 * compiler warning about possible use of uninitialized variable.
6274	 */
6275	lowest_vcn = sle64_to_cpu(a->lowest_vcn);
6276	/* If this attribute extent is not mapped, map it now. */
6277	if (alloc_size > 0 && (!ni->rl.elements ||
6278			rl->lcn == LCN_RL_NOT_MAPPED ||
6279			(rl->lcn == LCN_ENOENT && rl > ni->rl.rl &&
6280			(rl-1)->lcn == LCN_RL_NOT_MAPPED))) {
6281		err = ntfs_mapping_pairs_decompress(vol, a, &ni->rl);
6282		if (err || !ni->rl.elements) {
6283			if (!err)
6284				err = EIO;
6285			if (start < 0 || start >= alloc_size)
6286				ntfs_error(vol->mp, "Cannot extend allocation "
6287						"of mft_no 0x%llx, attribute "
6288						"type 0x%x, because the "
6289						"mapping of a runlist "
6290						"fragment failed (error %d).",
6291						(unsigned long long)ni->mft_no,
6292						(unsigned)le32_to_cpu(ni->type),
6293						err);
6294			if (err != ENOMEM)
6295				err = EIO;
6296			goto err_out;
6297		}
6298		/* Seek to the end of the runlist. */
6299		rl = &ni->rl.rl[ni->rl.elements - 1];
6300	}
6301	/*
6302	 * We now know the runlist of the last extent is mapped and @rl is at
6303	 * the end of the runlist.  We want to begin extending the runlist.
6304	 *
6305	 * If the data starts after the end of the old allocation or no data
6306	 * start is specified (@start < 0), this is a $DATA attribute and
6307	 * sparse attributes are enabled on the volume and for this inode, then
6308	 * create a sparse region between the old allocated size and the start
6309	 * of the data or the new allocated size if no data start is specified.
6310	 * Otherwise proceed with filling the whole space between the old
6311	 * allocated size and the new allocated size with clusters.
6312	 */
6313	if ((start >= 0 && start <= alloc_size) || ni->type != AT_DATA ||
6314			!NVolSparseEnabled(vol) || NInoSparseDisabled(ni)) {
6315		is_sparse = FALSE;
6316		goto skip_sparse;
6317	}
6318	/*
6319	 * If @start is less than zero we create the sparse region from the old
6320	 * allocated size to the new allocated size.  Otherwise we end the
6321	 * sparse region at @start and fill with real clusters between @start
6322	 * and the new allocated size.
6323	 */
6324	alloc_end = start;
6325	if (start < 0)
6326		alloc_end = new_alloc_size;
6327	ntfs_debug("Adding hole starting at byte offset 0x%llx and finishing "
6328			"at byte offset 0x%llx.",
6329			(unsigned long long)alloc_start,
6330			(unsigned long long)alloc_end);
6331	/*
6332	 * Allocate more memory if needed.  We ensure there is space at least
6333	 * for two new elements as this is what needs to happen when this is
6334	 * the very first allocation, i.e. the file has zero clusters allocated
6335	 * at the moment.
6336	 */
6337	if ((ni->rl.elements + 2) * sizeof(*rl) > ni->rl.alloc) {
6338		ntfs_rl_element *rl2;
6339
6340		rl2 = OSMalloc(ni->rl.alloc + NTFS_ALLOC_BLOCK,
6341				ntfs_malloc_tag);
6342		if (!rl2) {
6343			err = ENOMEM;
6344			goto err_out;
6345		}
6346		if (ni->rl.elements) {
6347			memcpy(rl2, ni->rl.rl, ni->rl.elements * sizeof(*rl2));
6348			/* Seek to the end of the runlist. */
6349			rl = &rl2[ni->rl.elements - 1];
6350		}
6351		if (ni->rl.alloc)
6352			OSFree(ni->rl.rl, ni->rl.alloc, ntfs_malloc_tag);
6353		ni->rl.rl = rl2;
6354		ni->rl.alloc += NTFS_ALLOC_BLOCK;
6355	}
6356	if (ni->rl.elements) {
6357		/* Sanity check that this is the end element. */
6358		if (rl->length || rl->lcn >= LCN_HOLE)
6359			panic("%s(): rl->length || rl->lcn >= LCN_HOLE)\n",
6360					__FUNCTION__);
6361	} else /* if (!ni->rl.elements) */ {
6362		/*
6363		 * The runlist is empty thus we are now creating both the
6364		 * sparse element and the end element.  Thus need to set
6365		 * everything up so we end up with two new elements rather than
6366		 * one.
6367		 *
6368		 * Note we do not need to set up @rl->lcn and @rl->length as
6369		 * they are both unconditionally overwritten below.
6370		 */
6371		if (alloc_size > 0)
6372			panic("%s(): alloc_size > 0\n", __FUNCTION__);
6373		rl = ni->rl.rl;
6374		rl->vcn = 0;
6375		ni->rl.elements = 1;
6376	}
6377	/*
6378	 * If a last real element exists and it is sparse, need to extend it
6379	 * instead of adding a new hole.
6380	 *
6381	 * Replace the terminator element with a sparse element and add a new
6382	 * terminator.  We know this is the end of the attribute thus we can
6383	 * use LCN_ENOENT even if the old terminator was LCN_RL_NOT_MAPPED.
6384	 */
6385	if (rl->vcn != alloc_start >> vol->cluster_size_shift)
6386		panic("%s(): rl->vcn != alloc_start >> "
6387				"vol->cluster_size_shift\n", __FUNCTION__);
6388	if (ni->rl.elements > 1 && (rl - 1)->lcn == LCN_HOLE)
6389		rl--;
6390	else {
6391		rl->lcn = LCN_HOLE;
6392		rl[1].length = 0;
6393		ni->rl.elements++;
6394	}
6395	rl[1].vcn = alloc_end >> vol->cluster_size_shift;
6396	if (rl[1].vcn <= rl->vcn)
6397		panic("%s(): rl[1].vcn <= rl->vcn\n", __FUNCTION__);
6398	rl->length = rl[1].vcn - rl->vcn;
6399	rl[1].lcn = LCN_ENOENT;
6400	is_sparse = TRUE;
6401	/*
6402	 * If the entire extension is sparse skip the allocation of real
6403	 * clusters and proceed to updating the mapping pairs array.
6404	 */
6405	if (start < 0) {
6406		nr_allocated = 0;
6407		goto skip_real_alloc;
6408	}
6409	/*
6410	 * We allocated part of the extension as a hole, now we are going to
6411	 * allocate the remainder of the extension with real clusters.
6412	 */
6413	alloc_start = start;
6414skip_sparse:
6415	/*
6416	 * We want to begin allocating clusters starting at the last allocated
6417	 * cluster to reduce fragmentation.  If there are no valid LCNs in the
6418	 * attribute we let the cluster allocator choose the starting cluster.
6419	 *
6420	 * If the last LCN is a hole or similar seek back to last real LCN.
6421	 */
6422	if (ni->rl.elements) {
6423		while (rl->lcn < 0 && rl > ni->rl.rl)
6424			rl--;
6425	}
6426	// FIXME: Need to implement partial allocations so at least part of the
6427	// write can be performed when @start >= 0 (and hence @data_start >= 0).
6428	// This is needed for POSIX write(2) conformance.  But do not allow
6429	// partial allocations for non-DATA attributes as partial metadata is
6430	// no use.  The @start >= 0 check may be sufficient to exclude non-data
6431	// attributes...
6432	// FIXME: When we implement partial allocations we need to only allow
6433	// them to happen when @atomic is false.
6434	runlist.rl = NULL;
6435	runlist.alloc = runlist.elements = 0;
6436	nr_allocated = (new_alloc_size - alloc_start) >>
6437			vol->cluster_size_shift;
6438	err = ntfs_cluster_alloc(vol, alloc_start >> vol->cluster_size_shift,
6439			nr_allocated, (ni->rl.elements && (rl->lcn >= 0)) ?
6440			rl->lcn + rl->length : -1, DATA_ZONE, TRUE, &runlist);
6441	if (err) {
6442		if (start < 0 || start >= alloc_size)
6443			ntfs_error(vol->mp, "Cannot extend allocation of "
6444					"mft_no 0x%llx, attribute type 0x%x, "
6445					"because the allocation of clusters "
6446					"failed (error %d).",
6447					(unsigned long long)ni->mft_no,
6448					(unsigned)le32_to_cpu(ni->type), err);
6449		if (err != ENOMEM && err != ENOSPC)
6450			err = EIO;
6451		nr_allocated = 0;
6452		goto trunc_err_out;
6453	}
6454	err = ntfs_rl_merge(&ni->rl, &runlist);
6455	if (err) {
6456		if (start < 0 || start >= alloc_size)
6457			ntfs_error(vol->mp, "Cannot extend allocation of "
6458					"mft_no 0x%llx, attribute type 0x%x, "
6459					"because the runlist merge failed "
6460					"(error %d).",
6461					(unsigned long long)ni->mft_no,
6462					(unsigned)le32_to_cpu(ni->type), err);
6463		if (err != ENOMEM)
6464			err = EIO;
6465		err2 = ntfs_cluster_free_from_rl(vol, runlist.rl, 0, -1,
6466				NULL);
6467		if (err2) {
6468			ntfs_error(vol->mp, "Failed to release allocated "
6469					"cluster(s) in error code path (error "
6470					"%d).  Run chkdsk to recover the lost "
6471					"space.", err2);
6472			NVolSetErrors(vol);
6473		}
6474		OSFree(runlist.rl, runlist.alloc, ntfs_malloc_tag);
6475		nr_allocated = 0;
6476		goto trunc_err_out;
6477	}
6478	ntfs_debug("Allocated 0x%llx clusters.",
6479			(unsigned long long)(new_alloc_size - alloc_start) >>
6480			vol->cluster_size_shift);
6481skip_real_alloc:
6482	/* Find the runlist element with which the attribute extent starts. */
6483	rl = ntfs_rl_find_vcn_nolock(ni->rl.rl, lowest_vcn);
6484	if (!rl)
6485		panic("%s(): !rl\n", __FUNCTION__);
6486	if (!rl->length)
6487		panic("%s(): !rl->length\n", __FUNCTION__);
6488	if (rl->lcn < LCN_HOLE)
6489		panic("%s(): rl->lcn < LCN_HOLE\n", __FUNCTION__);
6490	mp_rebuilt = FALSE;
6491	attr_len = le32_to_cpu(a->length);
6492	/* Get the size for the new mapping pairs array for this extent. */
6493	err = ntfs_get_size_for_mapping_pairs(vol, rl, lowest_vcn, -1,
6494			&mp_size);
6495	if (err) {
6496		if (start < 0 || start >= alloc_size)
6497			ntfs_error(vol->mp, "Cannot extend allocation of "
6498					"mft_no 0x%llx, attribute type 0x%x, "
6499					"because determining the size for the "
6500					"mapping pairs failed (error %d).",
6501					(unsigned long long)ni->mft_no,
6502					(unsigned)le32_to_cpu(ni->type), err);
6503		err = EIO;
6504		goto undo_alloc;
6505	}
6506	mp_ofs = le16_to_cpu(a->mapping_pairs_offset);
6507retry_attr_rec_resize:
6508	/* Extend the attribute record to fit the bigger mapping pairs array. */
6509	err = ntfs_attr_record_resize(m, a, mp_size + mp_ofs);
6510	if (!err)
6511		goto build_mpa;
6512	if (err != ENOSPC)
6513		panic("%s(): err != ENOSPC\n", __FUNCTION__);
6514	/*
6515	 * Not enough space in the mft record.  If this is an index related
6516	 * extension, check if the index root attribute is in the same mft
6517	 * record as the attribute being extended and if it is and it is not
6518	 * empty move its entries into an index allocation block.  Note we do
6519	 * not check whether that actually creates enough space because how
6520	 * much space is needed exactly is very hard to determine in advance
6521	 * (due to potential need for associated attribute list attribute
6522	 * extensions) and also because even if it does not create enough space
6523	 * it will still help and save work later on when working for example
6524	 * on the attribute list attribute.
6525	 */
6526	if (ictx) {
6527		long delta;
6528		INDEX_ROOT *ir;
6529		INDEX_HEADER *ih;
6530		INDEX_ENTRY *ie, *first_ie;
6531		ntfs_index_context *root_ictx;
6532		ntfs_attr_search_ctx root_actx;
6533
6534		if (ni->type != AT_INDEX_ALLOCATION && ni->type != AT_BITMAP)
6535			panic("%s(): ni->type != AT_INDEX_ALLOCATION && "
6536					"ni->type != AT_BITMAP\n",
6537					__FUNCTION__);
6538		ntfs_attr_search_ctx_init(&root_actx, actx->ni, m);
6539		err = ntfs_attr_find_in_mft_record(AT_INDEX_ROOT, ni->name,
6540				ni->name_len, NULL, 0, &root_actx);
6541		if (err) {
6542			if (err != ENOENT) {
6543				ntfs_error(vol->mp, "Failed to find index "
6544						"root attribute in mft_no "
6545						"0x%llx (error %d).  Inode is "
6546						"corrupt.  Run chkdsk.",
6547						(unsigned long long)ni->mft_no,
6548						err);
6549				NVolSetErrors(vol);
6550			}
6551			/*
6552			 * The index root is in a different mft record so we
6553			 * cannot gain anything by moving out its entries.  Set
6554			 * @ictx to NULL so we do not waste our time trying
6555			 * again.
6556			 */
6557			ictx = NULL;
6558			goto ictx_done;
6559		}
6560		/*
6561		 * We found the index root in the same mft record as the
6562		 * attribute (extent) to be extended.  Check whether it is
6563		 * empty or not.
6564		 */
6565		ir = (INDEX_ROOT*)((u8*)root_actx.a +
6566				le16_to_cpu(root_actx.a->value_offset));
6567		ih = &ir->index;
6568		first_ie = ie = (INDEX_ENTRY*)((u8*)ih +
6569				le32_to_cpu(ih->entries_offset));
6570		while (!(ie->flags & INDEX_ENTRY_END))
6571			ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length));
6572		/*
6573		 * If there are no entries other than the end entry we cannot
6574		 * gain anything by moving out the entries from the index root.
6575		 * Set @ictx to NULL so we do not waste our time trying again.
6576		 */
6577		if (ie == first_ie) {
6578			ictx = NULL;
6579			goto ictx_done;
6580		}
6581		/*
6582		 * We cannot have gotten this far if the current index context
6583		 * is locked and/or it is the index root.
6584		 *
6585		 * Also, we need to undo what we have done so far as the
6586		 * metadata is currently in an inconsistent state and things
6587		 * will get really confused when moving the entries from the
6588		 * index root to the index allocation block and the same
6589		 * attribute we are extending at the moment is extended.
6590		 * Another reason is that the mft record will be dropped by the
6591		 * move thus we would expose invalid metadata to concurrent
6592		 * threads which is a Bad Thing(TM).
6593		 *
6594		 * For the same reasons we also need to drop the runlist lock
6595		 * we are holding.
6596		 */
6597		if (ictx->is_locked)
6598			panic("%s(): ictx->is_locked\n", __FUNCTION__);
6599		if (ictx->is_root)
6600			panic("%s(): ictx->is_root\n", __FUNCTION__);
6601		ll = alloc_size >> vol->cluster_size_shift;
6602		err = ntfs_cluster_free(ni, ll, -1, actx, NULL);
6603		if (err) {
6604			ntfs_error(vol->mp, "Failed to release allocated "
6605					"cluster(s) (error %d).  Run chkdsk "
6606					"to recover the lost cluster(s).", err);
6607			NVolSetErrors(vol);
6608		}
6609		m = actx->m;
6610		a = actx->a;
6611		/*
6612		 * If the runlist truncation fails and/or the search context is
6613		 * no longer valid, we cannot resize the attribute record or
6614		 * build the mapping pairs array thus we mark the volume dirty
6615		 * and tell the user to run chkdsk.
6616		 */
6617		err = ntfs_rl_truncate_nolock(vol, &ni->rl, ll);
6618		if (err || actx->is_error) {
6619			if (actx->is_error)
6620				err = actx->error;
6621			ntfs_error(vol->mp, "Failed to %s (error %d).  Run "
6622					"chkdsk.", actx->is_error ? "restore "
6623					"attribute search context" :
6624					"truncate attribute runlist", err);
6625			NVolSetErrors(vol);
6626			goto err_out;
6627		}
6628		lck_rw_unlock_exclusive(&ni->rl.lock);
6629		/* Find the index root by walking up the tree path. */
6630		root_ictx = ictx;
6631		while (!root_ictx->is_root) {
6632			root_ictx = root_ictx->up;
6633			/*
6634			 * If we go all the way round to the beginning without
6635			 * finding the root something has gone badly wrong.
6636			 */
6637			if (root_ictx == ictx)
6638				panic("%s(): root_ictx == ictx\n",
6639						__FUNCTION__);
6640		}
6641		/*
6642		 * We need a proper deallocatable attribute search context thus
6643		 * switch the one pointing to the attribute to be resized to
6644		 * point to the index root.  FIXME: We are not updating
6645		 * @actx->al_entry as this is not going to be touched at all.
6646		 * Having said that set it to NULL just in case.
6647		 */
6648		actx->a = root_actx.a;
6649		actx->al_entry = NULL;
6650		/*
6651		 * Lock the index root node.  We already have the index root
6652		 * attribute thus only need to do the revalidation part of
6653		 * re-locking.
6654		 */
6655		root_ictx->is_locked = 1;
6656		root_ictx->actx = actx;
6657		root_ictx->bytes_free = le32_to_cpu(m->bytes_allocated) -
6658				le32_to_cpu(m->bytes_in_use);
6659		root_ictx->ir = ir;
6660		delta = (u8*)ih - (u8*)root_ictx->index;
6661		if (delta) {
6662			INDEX_ENTRY **entries;
6663			unsigned u;
6664
6665			root_ictx->index = ih;
6666			root_ictx->entry = (INDEX_ENTRY*)(
6667					(u8*)root_ictx->entry + delta);
6668			entries = root_ictx->entries;
6669			for (u = 0; u < root_ictx->nr_entries; u++)
6670				entries[u] = (INDEX_ENTRY*)((u8*)
6671						entries[u] + delta);
6672		}
6673		/*
6674		 * Move the index root entries to an index allocation block.
6675		 *
6676		 * Note we do not need to worry about this causing infinite
6677		 * recursion in the case that we were called from
6678		 * ntfs_index_block_alloc() which was called from
6679		 * ntfs_index_move_root_to_allocation_block() because the
6680		 * latter will have emptied the index root before calling
6681		 * ntfs_index_block_alloc() thus we will bail out above when
6682		 * checking whether the index root is empty the second time
6683		 * round and the recursion will stop there.  This is a very
6684		 * seldom occurrence thus there is no point in special casing it
6685		 * in the code in a more efficient but more complicated way.
6686		 *
6687		 * A complication is that ntfs_attr_resize() may have been
6688		 * called from ntfs_index_block_alloc() and in this case when
6689		 * we call ntfs_index_move_root_to_allocation_block() it will
6690		 * call ntfs_index_block_alloc() again which will cause a
6691		 * deadlock (or with lock debugging enabled panic()) because
6692		 * ntfs_index_block_alloc() takes the bitmap inode lock for
6693		 * writing.  To avoid this ntfs_index_block_alloc() sets
6694		 * @ictx->bmp_is_locked and we need to set
6695		 * @root_ictx->bmp_is_locked to the same value so that when
6696		 * ntfs_index_move_root_to_allocation_block() calls
6697		 * ntfs_index_block_alloc() the latter will know not to take
6698		 * the bitmap inode lock again.
6699		 */
6700		root_ictx->bmp_is_locked = ictx->bmp_is_locked;
6701		err = ntfs_index_move_root_to_allocation_block(root_ictx);
6702		if (root_ictx != ictx)
6703			root_ictx->bmp_is_locked = 0;
6704		if (err) {
6705			ntfs_error(vol->mp, "Failed to move index root to "
6706					"index allocation block (error %d).",
6707					err);
6708			if (root_ictx->is_locked)
6709				ntfs_index_ctx_unlock(root_ictx);
6710			/*
6711			 * This is a disaster as it means the index context is
6712			 * no longer valid thus we have to bail out all the way.
6713			 */
6714			return err;
6715		}
6716		/* Unlock the newly created index block. */
6717		if (root_ictx->is_root)
6718			panic("%s(): root_ictx->is_root\n", __FUNCTION__);
6719		if (!root_ictx->is_locked)
6720			panic("%s(): !root_ictx->is_locked\n", __FUNCTION__);
6721		ntfs_index_ctx_unlock(root_ictx);
6722		/*
6723		 * We are done.  The index root is now empty thus the mft
6724		 * record should now have enough space.  Because we undid
6725		 * everything and dropped the runlist lock as well as the mft
6726		 * record when moving the index root entries into the index
6727		 * allocation block we need to restart the attribute allocation
6728		 * extension again.
6729		 *
6730		 * But first we set @ictx to NULL so we do not get here again
6731		 * in the case that there still is not enough free space.  This
6732		 * is not a disaster as we can just carry on doing other
6733		 * rearrangements to free up enough space in the mft record.
6734		 */
6735		ictx = NULL;
6736		goto retry_extend;
6737	}
6738ictx_done:
6739	/*
6740	 * There is not enough space in the mft record.
6741	 *
6742	 * We need to add an attribute list attribute if it is not already
6743	 * present.
6744	 */
6745	if (!NInoAttrList(base_ni)) {
6746		err = ntfs_attr_list_add(base_ni, base_m, actx);
6747		if (err || actx->is_error) {
6748			if (!err)
6749				err = actx->error;
6750			ntfs_error(vol->mp, "Failed to %s mft_no 0x%llx (error "
6751					"%d).", actx->is_error ?
6752					"remap extent mft record of" :
6753					"add attribute list attribute to",
6754					(unsigned long long)base_ni->mft_no,
6755					err);
6756			goto undo;
6757		}
6758		/*
6759		 * The attribute location will have changed so update it from
6760		 * the search context.
6761		 */
6762		m = actx->m;
6763		a = actx->a;
6764		/*
6765		 * Retry the original attribute record resize as we may now
6766		 * have enough space to create the complete remaining mapping
6767		 * pairs array in the moved attribute record.
6768		 *
6769		 * This can for example happen when the attribute was moved out
6770		 * to an extent mft record which has much more free space than
6771		 * the base mft record had.
6772		 */
6773		goto retry_attr_rec_resize;
6774	}
6775	/*
6776	 * If the attribute record is in an extent mft record we know the
6777	 * attribute can be outside the base mft record (as it already is) thus
6778	 * we can simply resize the attribute to the maximum size possible and
6779	 * then proceed to fill it with mapping pairs data until it is full,
6780	 * then start a new extent in a new mft record, etc, until all runlist
6781	 * elements have been saved in mapping pairs arrays.
6782	 */
6783	if (m != base_m) {
6784		ATTR_LIST_ENTRY *al_entry;
6785		unsigned new_al_size;
6786
6787		/*
6788		 * If the attribute record is not the only one in the extent
6789		 * mft record then move it to a new extent mft record as that
6790		 * will allow the attribute record to grow larger thus reducing
6791		 * the total number of extent attribute records needed to a
6792		 * minimum.
6793		 */
6794		if (!ntfs_attr_record_is_only_one(m, a)) {
6795move_attr:
6796			lck_rw_lock_shared(&base_ni->attr_list_rl.lock);
6797			err = ntfs_attr_record_move(actx);
6798			lck_rw_unlock_shared(&base_ni->attr_list_rl.lock);
6799			if (err) {
6800				if (start < 0 || start >= alloc_size)
6801					ntfs_error(vol->mp, "Failed to move "
6802							"attribute extent "
6803							"from mft record "
6804							"0x%llx to an extent "
6805							"mft record (error "
6806							"%d).",
6807							(unsigned long long)
6808							actx->ni->mft_no, err);
6809				goto undo;
6810			}
6811			/*
6812			 * The attribute location will have changed so update
6813			 * it from the search context.
6814			 */
6815			m = actx->m;
6816			a = actx->a;
6817			/*
6818			 * Retry the original attribute record resize as we may
6819			 * now have enough space to create the complete
6820			 * remaining mapping pairs array in the moved attribute
6821			 * record.
6822			 */
6823			goto retry_attr_rec_resize;
6824		}
6825		max_size = (le32_to_cpu(m->bytes_allocated) -
6826				le32_to_cpu(m->bytes_in_use)) & ~7;
6827add_mapping_pairs_to_attr:
6828		max_size += attr_len - mp_ofs;
6829		err = ntfs_attr_record_resize(m, a, max_size + mp_ofs);
6830		/*
6831		 * We worked out the exact size we can extend to so the resize
6832		 * cannot fail.
6833		 */
6834		if (err)
6835			panic("%s(): err (ntfs_attr_record_resize())\n",
6836					__FUNCTION__);
6837		/*
6838		 * If the new size and the old size are the same we cannot add
6839		 * anything to this extent so do not bother rebuilding the
6840		 * mapping pairs array and go straight to creating the next
6841		 * extent.
6842		 */
6843		if (attr_len == le32_to_cpu(a->length)) {
6844start_new_attr:
6845			stop_vcn = sle64_to_cpu(a->highest_vcn) + 1;
6846			goto skip_mpa_build;
6847		}
6848build_mpa:
6849		mp_rebuilt = TRUE;
6850		/* Generate the mapping pairs directly into the attribute. */
6851		err = ntfs_mapping_pairs_build(vol, (s8*)a + mp_ofs,
6852				le32_to_cpu(a->length) - mp_ofs, rl,
6853				lowest_vcn, -1, &stop_vcn);
6854		if (err && err != ENOSPC) {
6855			if (start < 0 || start >= alloc_size)
6856				ntfs_error(vol->mp, "Cannot extend allocation "
6857						"of mft_no 0x%llx, attribute "
6858						"type 0x%x, because building "
6859						"the mapping pairs array "
6860						"failed (error %d).",
6861						(unsigned long long)ni->mft_no,
6862						(unsigned)le32_to_cpu(ni->type),
6863						err);
6864			err = EIO;
6865			/*
6866			 * Need to set @a->highest_vcn to enable correct error
6867			 * recovery.
6868			 */
6869			if (!is_first)
6870				a->highest_vcn = cpu_to_sle64(sle64_to_cpu(
6871						a->lowest_vcn) - 1);
6872			goto undo;
6873		}
6874		/* Update the highest_vcn. */
6875		a->highest_vcn = cpu_to_sle64(stop_vcn - 1);
6876		/*
6877		 * We have finished with this extent so update the current
6878		 * allocated size and attribute length to reflect this.  We
6879		 * need to do this to enable error handling and recovery.
6880		 */
6881		alloc_size = stop_vcn << vol->cluster_size_shift;
6882		attr_len = le32_to_cpu(a->length);
6883		/*
6884		 * If the mapping pairs build succeeded, i.e. the current
6885		 * attribute extent contains the end of the runlist, we are
6886		 * done and only need to update the attribute sizes in the base
6887		 * attribute extent so go and do that.
6888		 */
6889		if (!err)
6890			goto update_sizes;
6891		/*
6892		 * We have finished with this extent mft record thus we release
6893		 * it after ensuring the changes make it to disk later.  We do
6894		 * this by hand as we want to keep the current attribute list
6895		 * attribute entry as we will be inserting the entry for the
6896		 * next attribute extent immediately after it.
6897		 */
6898		NInoSetMrecNeedsDirtying(actx->ni);
6899skip_mpa_build:
6900		/* Get the size of the remaining mapping pairs array. */
6901		rl = ntfs_rl_find_vcn_nolock(rl, stop_vcn);
6902		if (!rl)
6903			panic("%s(): !rl (skip_mpa_build)\n", __FUNCTION__);
6904		if (!rl->length)
6905			panic("%s(): !rl->length (skip_mpa_build)\n",
6906					__FUNCTION__);
6907		if (rl->lcn < LCN_HOLE)
6908			panic("%s(): rl->lcn < LCN_HOLE (skip_mpa_build)\n",
6909					__FUNCTION__);
6910		err = ntfs_get_size_for_mapping_pairs(vol, rl, stop_vcn, -1,
6911				&mp_size);
6912		if (err) {
6913			if (start < 0 || start >= alloc_size)
6914				ntfs_error(vol->mp, "Cannot complete "
6915						"extension of allocation of "
6916						"mft_no 0x%llx, attribute type "
6917						"0x%x, because determining "
6918						"the size for the mapping "
6919						"pairs failed (error %d).",
6920						(unsigned long long)ni->mft_no,
6921						(unsigned)le32_to_cpu(ni->type),
6922						err);
6923			err = EIO;
6924			goto undo;
6925		}
6926		/* We only release extent mft records. */
6927		if (actx->ni != base_ni)
6928			ntfs_extent_mft_record_unmap(actx->ni);
6929		/*
6930		 * We now need to allocate a new extent mft record, attach it
6931		 * to the base ntfs inode and set up the search context to
6932		 * point to it, then create a new attribute extent in it of
6933		 * either maximum size or the left to do mapping pairs size and
6934		 * then build the mapping pairs array in it.  Finally, add an
6935		 * attribute list attribute entry for the new attribute extent.
6936		 */
6937		err = ntfs_mft_record_alloc(vol, NULL, NULL, base_ni,
6938				&actx->ni, &m, &a);
6939		if (err) {
6940			/*
6941			 * Make it safe to release the attribute search
6942			 * context.
6943			 */
6944			actx->ni = base_ni;
6945			if (start < 0 || start >= alloc_size)
6946				ntfs_error(vol->mp, "Cannot extend allocation "
6947						"of mft_no 0x%llx, attribute "
6948						"type 0x%x, because "
6949						"allocating a new extent mft "
6950						"record failed (error %d),",
6951						(unsigned long long)ni->mft_no,
6952						(unsigned)le32_to_cpu(ni->type),
6953						err);
6954			goto undo;
6955		}
6956		actx->m = m;
6957		actx->a = a;
6958		/* We are no longer working on the extent we started with. */
6959		is_first = FALSE;
6960		/*
6961		 * Get the size needed for the remaining mapping pairs array
6962		 * and make space for an attribute large enough to hold it.  If
6963		 * there is not enough space to do so make the maximum amount
6964		 * of space available.
6965		 */
6966		lowest_vcn = stop_vcn;
6967		/*
6968		 * Calculate the offset into the new attribute at which the
6969		 * mapping pairs array begins.  The mapping pairs array is
6970		 * placed after the name aligned to an 8-byte boundary which in
6971		 * turn is placed immediately after the non-resident attribute
6972		 * record itself.
6973		 */
6974		mp_ofs = offsetof(ATTR_RECORD, compressed_size) + ((name_size +
6975				7) & ~7);
6976		err = ntfs_attr_record_make_space(m, a, mp_ofs + mp_size);
6977		if (err) {
6978			if (err != ENOSPC)
6979				panic("%s(): err != ENOSPC\n", __FUNCTION__);
6980			max_size = (le32_to_cpu(m->bytes_allocated) -
6981					le32_to_cpu(m->bytes_in_use)) & ~7;
6982			if (max_size < mp_ofs)
6983				panic("%s(): max_size < mp_ofs\n",
6984						__FUNCTION__);
6985			err = ntfs_attr_record_make_space(m, a, max_size);
6986			/*
6987			 * We worked out the exact maximum size so the call
6988			 * cannot fail.
6989			 */
6990			if (err)
6991				panic("%s(): err ("
6992						"ntfs_attr_record_make_space()"
6993						")\n", __FUNCTION__);
6994		}
6995		/*
6996		 * Now setup the new attribute record.  The entire attribute
6997		 * has been zeroed and the length of the attribute record has
6998		 * been set.
6999		 *
7000		 * Before we proceed with setting up the attribute, add an
7001		 * attribute list attribute entry for the created attribute
7002		 * extent.
7003		 */
7004		al_entry = actx->al_entry = (ATTR_LIST_ENTRY*)(
7005				(u8*)actx->al_entry +
7006				le16_to_cpu(actx->al_entry->length));
7007		al_entry_len = (offsetof(ATTR_LIST_ENTRY, name) + name_size +
7008				7) & ~7;
7009		new_al_size = base_ni->attr_list_size + al_entry_len;
7010		/* Out of bounds checks. */
7011		if ((u8*)al_entry < base_ni->attr_list || (u8*)al_entry >
7012				base_ni->attr_list + new_al_size ||
7013				(u8*)al_entry + al_entry_len >
7014				base_ni->attr_list + new_al_size) {
7015			/* Inode is corrupt. */
7016			if (start < 0 || start >= alloc_size)
7017				ntfs_error(vol->mp, "Cannot complete "
7018						"extension of allocation of "
7019						"mft_no 0x%llx, attribute type "
7020						"0x%x, because the inode is "
7021						"corrupt.  Run chkdsk.",
7022						(unsigned long long)ni->mft_no,
7023						(unsigned)
7024						le32_to_cpu(ni->type));
7025			err = EIO;
7026			goto free_undo;
7027		}
7028		err = ntfs_attr_size_bounds_check(vol, AT_ATTRIBUTE_LIST,
7029				new_al_size);
7030		if (err) {
7031			if (err == ERANGE) {
7032				if (start < 0 || start >= alloc_size)
7033					ntfs_error(vol->mp, "Cannot complete "
7034							"extension of "
7035							"allocation of mft_no "
7036							"0x%llx, attribute "
7037							"type 0x%x, because "
7038							"the attribute list "
7039							"attribute would "
7040							"become to large.  "
7041							"You need to "
7042							"defragment your "
7043							"volume and then try "
7044							"again.",
7045							(unsigned long long)
7046							ni->mft_no, (unsigned)
7047							le32_to_cpu(ni->type));
7048				err = ENOSPC;
7049			} else {
7050				if (start < 0 || start >= alloc_size)
7051					ntfs_error(vol->mp, "Cannot complete "
7052							"extension of "
7053							"allocation of mft_no "
7054							"0x%llx, attribute "
7055							"type 0x%x, because "
7056							"the attribute list "
7057							"attribute is unknown "
7058							"on the volume.  The "
7059							"volume is corrupt.  "
7060							"Run chkdsk.",
7061							(unsigned long long)
7062							ni->mft_no, (unsigned)
7063							le32_to_cpu(ni->type));
7064				NVolSetErrors(vol);
7065				err = EIO;
7066			}
7067			goto free_undo;
7068		}
7069		/*
7070		 * Reallocate the memory buffer if needed and create space for
7071		 * the new entry.
7072		 */
7073		new_al_alloc = (new_al_size + NTFS_ALLOC_BLOCK - 1) &
7074				~(NTFS_ALLOC_BLOCK - 1);
7075		if (new_al_alloc > base_ni->attr_list_alloc) {
7076			u8 *tmp, *al, *al_end;
7077			unsigned al_entry_ofs;
7078
7079			tmp = OSMalloc(new_al_alloc, ntfs_malloc_tag);
7080			if (!tmp) {
7081				if (start < 0 || start >= alloc_size)
7082					ntfs_error(vol->mp, "Cannot complete "
7083							"extension of "
7084							"allocation of mft_no "
7085							"0x%llx, attribute "
7086							"type 0x%x, because "
7087							"there is not enough "
7088							"memory to extend "
7089							"the attribute list "
7090							"attribute.",
7091							(unsigned long long)
7092							ni->mft_no, (unsigned)
7093							le32_to_cpu(ni->type));
7094				err = ENOMEM;
7095				goto free_undo;
7096			}
7097			al = base_ni->attr_list;
7098			al_entry_ofs = (u8*)al_entry - al;
7099			al_end = al + base_ni->attr_list_size;
7100			memcpy(tmp, al, al_entry_ofs);
7101			if ((u8*)al_entry < al_end)
7102				memcpy(tmp + al_entry_ofs + al_entry_len,
7103						al + al_entry_ofs,
7104						base_ni->attr_list_size -
7105						al_entry_ofs);
7106			al_entry = actx->al_entry = (ATTR_LIST_ENTRY*)(tmp +
7107					al_entry_ofs);
7108			OSFree(base_ni->attr_list, base_ni->attr_list_alloc,
7109					ntfs_malloc_tag);
7110			base_ni->attr_list_alloc = new_al_alloc;
7111			base_ni->attr_list = tmp;
7112		} else if ((u8*)al_entry < base_ni->attr_list +
7113				base_ni->attr_list_size)
7114			memmove((u8*)al_entry + al_entry_len, al_entry,
7115					base_ni->attr_list_size -
7116					((u8*)al_entry - base_ni->attr_list));
7117		base_ni->attr_list_size = new_al_size;
7118		/* Set up the attribute extent and the attribute list entry. */
7119		al_entry->type = a->type = ni->type;
7120		al_entry->length = cpu_to_le16(al_entry_len);
7121		a->non_resident = 1;
7122		al_entry->name_length = a->name_length = ni->name_len;
7123		a->name_offset = const_cpu_to_le16(offsetof(ATTR_RECORD,
7124				compressed_size));
7125		al_entry->name_offset = offsetof(ATTR_LIST_ENTRY, name);
7126		al_entry->instance = a->instance = m->next_attr_instance;
7127		/*
7128		 * Increment the next attribute instance number in the mft
7129		 * record as we consumed the old one.
7130		 */
7131		m->next_attr_instance = cpu_to_le16((le16_to_cpu(
7132				m->next_attr_instance) + 1) & 0xffff);
7133		al_entry->lowest_vcn = a->lowest_vcn =
7134				cpu_to_sle64(lowest_vcn);
7135		al_entry->mft_reference = MK_LE_MREF(actx->ni->mft_no,
7136				actx->ni->seq_no);
7137		a->mapping_pairs_offset = cpu_to_le16(mp_ofs);
7138		/* Copy the attribute name into place. */
7139		if (name_size) {
7140			memcpy((u8*)a + offsetof(ATTR_RECORD, compressed_size),
7141					ni->name, name_size);
7142			memcpy(&al_entry->name, ni->name, name_size);
7143		}
7144		/* For tidiness, zero out the unused space. */
7145		if (al_entry_len > offsetof(ATTR_LIST_ENTRY, name) + name_size)
7146			memset((u8*)al_entry +
7147					offsetof(ATTR_LIST_ENTRY, name) +
7148					name_size, 0, al_entry_len -
7149					(offsetof(ATTR_LIST_ENTRY, name) +
7150					name_size));
7151		/*
7152		 * Need to set @a->highest_vcn to enable correct error
7153		 * recovery.
7154		 */
7155		a->highest_vcn = cpu_to_sle64(lowest_vcn - 1);
7156		/*
7157		 * Extend the attribute list attribute and copy in the modified
7158		 * value from the cache.
7159		 */
7160		err = ntfs_attr_list_sync_extend(base_ni, base_m,
7161				(u8*)al_entry - base_ni->attr_list, actx);
7162		if (err || actx->is_error) {
7163			/*
7164			 * If @actx->is_error indicates error this is fatal as
7165			 * we cannot build the mapping pairs array into it as
7166			 * it is not mapped.
7167			 *
7168			 * However, we may still be able to recover from this
7169			 * situation by freeing the extent mft record and thus
7170			 * deleting the attribute record.  This only works when
7171			 * this is the only attribute record in the mft record
7172			 * and when we just created this extent attribute
7173			 * record.  We can easily determine if this is the only
7174			 * attribute in the mft record by scanning through the
7175			 * cached attribute list attribute.
7176			 */
7177			if (!err)
7178				err = actx->error;
7179			ntfs_error(vol->mp, "Failed to %s mft_no 0x%llx (error "
7180					"%d).", actx->is_error ?
7181					"remap extent mft record of" :
7182					"extend and sync attribute list "
7183					"attribute to",
7184					(unsigned long long)base_ni->mft_no,
7185					err);
7186			goto undo;
7187		}
7188		/*
7189		 * Finally, proceed to building the mapping pairs array into
7190		 * the attribute record.
7191		 */
7192		goto build_mpa;
7193	}
7194	/*
7195	 * We now know that the attribute is in the base mft record.
7196	 *
7197	 * For performance reasons we want to keep the first extent of the
7198	 * unnamed $DATA attribute of files and the $I30 named
7199	 * $INDEX_ALLOCATION and $BITMAP attributes of directories in the base
7200	 * mft record even if this means that the first extent will be nearly
7201	 * empty.  This ensures that loading an inode is faster and thus stat()
7202	 * and getattrlist() will be faster.
7203	 *
7204	 * If the attribute is one of the above described ones then we keep the
7205	 * existing extent as it is (unless it is actually empty in which case
7206	 * we add at least some mapping data to it) and start a new extent in a
7207	 * new extent mft record.
7208	 *
7209	 * In all other cases we move the attribute to a new extent mft record
7210	 * and retry the attribute resize as it may now fit.
7211	 */
7212	if (a->lowest_vcn || (!S_ISDIR(base_ni->mode) &&
7213			(ni->type != AT_DATA || ni->name_len)) ||
7214			(S_ISDIR(base_ni->mode) &&
7215			(!ni->name_len || ni->name != I30)))
7216		goto move_attr;
7217	max_size = (le32_to_cpu(m->bytes_allocated) -
7218			le32_to_cpu(m->bytes_in_use)) & ~7;
7219	al_entry_len = le16_to_cpu(actx->al_entry->length);
7220	/*
7221	 * A single mapping pair can be up to 17 bytes in size so we need at
7222	 * least that much free space.  But we need to align the attribute
7223	 * length to 8 bytes thus the 17 becomes 24.
7224	 *
7225	 * Further, we will be adding at least one attribute list attribute
7226	 * entry thus we want to definitely have space for that to happen.  If
7227	 * the attribute list attribute is non-resident we may have to add
7228	 * another mapping pair which would as above be 24 bytes or if it is
7229	 * resident we would have to add an actual attribute list entry which
7230	 * would be the same size as the one for the current attribute record.
7231	 * As this is guaranteed to be larger than 24 bytes we use the larger
7232	 * size as the minimum to leave free.
7233	 *
7234	 * Thus the minimum of free space we require before adding any mapping
7235	 * pairs to the current attribute record is 24 + @al_entry_len.
7236	 *
7237	 * There may be a lot of free space so it would be silly to only use
7238	 * the minimum.  On one hand we would like to consume as much of the
7239	 * free space as possible to keep the number of attribute extents to a
7240	 * minimum.  On the other hand we would like to keep enough spare space
7241	 * for four attribute list attribute entries (this is an arbitrary
7242	 * choice) to simplify future expansion of the attribute list
7243	 * attribute.
7244	 */
7245	if (!*((u8*)a + mp_ofs)) {
7246		/*
7247		 * There are no mapping pairs in this attribute record thus we
7248		 * either have to add some mapping pairs or if the available
7249		 * space is less than our minimum we have to move the attribute
7250		 * record out into a new extent mft record.
7251		 */
7252		if (max_size < 24 + al_entry_len)
7253			goto move_attr;
7254		/*
7255		 * We have our minimum amount of space and possibly a lot more.
7256		 * If we have less than our desired spare space use our minimum
7257		 * and if we have more than that use everything except the
7258		 * desired spare space.
7259		 */
7260		if (max_size < 24 + (4 * al_entry_len))
7261			max_size = 24;
7262		else
7263			max_size -= 4 * al_entry_len;
7264	} else {
7265		/*
7266		 * Check if it would be sensible to add at least some mapping
7267		 * pairs to the current attribute record.
7268		 *
7269		 * If the amount of free space is less than the desired spare
7270		 * space we leave this attribute record be and start a new
7271		 * extent and if we have more than that use everything except
7272		 * the desired spare space.
7273		 */
7274		if (max_size < 24 + (4 * al_entry_len))
7275			goto start_new_attr;
7276		max_size -= 4 * al_entry_len;
7277	}
7278	/*
7279	 * We want to add some mapping pairs to the current attribute before
7280	 * starting the next one.
7281	 *
7282	 * @max_size is already set to the number of bytes to consume from the
7283	 * free space in the mft record and it is guaranteed that the mft
7284	 * record has at least that much free space.
7285	 */
7286	goto add_mapping_pairs_to_attr;
7287update_sizes:
7288	/*
7289	 * We now have extended the allocated size of the attribute.  Reflect
7290	 * this in the ntfs_inode structure and the attribute record.
7291	 */
7292	if (a->lowest_vcn) {
7293		/*
7294		 * We are not in the first attribute extent, switch to it, but
7295		 * first ensure the changes will make it to disk later.
7296		 */
7297		NInoSetMrecNeedsDirtying(actx->ni);
7298		ntfs_attr_search_ctx_reinit(actx);
7299		err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, 0,
7300				NULL, 0, actx);
7301		if (err) {
7302			if (start < 0 || start >= alloc_size)
7303				ntfs_error(vol->mp, "Cannot complete "
7304						"extension of allocation of "
7305						"mft_no 0x%llx, attribute type "
7306						"0x%x, because lookup of "
7307						"first attribute extent "
7308						"failed (error %d).",
7309						(unsigned long long)
7310						base_ni->mft_no, (unsigned)
7311						le32_to_cpu(ni->type), err);
7312			if (err == ENOENT)
7313				err = EIO;
7314			goto undo_do_trunc;
7315		}
7316		/* @m is not used any more so no need to set it. */
7317		a = actx->a;
7318	}
7319	/*
7320	 * If we created a hole and the attribute is not marked as sparse, mark
7321	 * it as sparse now.
7322	 */
7323	if (is_sparse && !NInoSparse(ni)) {
7324		err = ntfs_attr_sparse_set(base_ni, ni, actx);
7325		if (err) {
7326			ntfs_error(vol->mp, "Failed to set the attribute to "
7327					"be sparse (error %d).", err);
7328			goto undo_do_trunc;
7329		}
7330		/*
7331		 * The attribute may have been moved to make space for the
7332		 * compressed size so @a is now invalid.
7333		 */
7334		a = actx->a;
7335	}
7336	lck_spin_lock(&ni->size_lock);
7337	ni->allocated_size = new_alloc_size;
7338	a->allocated_size = cpu_to_sle64(new_alloc_size);
7339	if (NInoSparse(ni) || (ni->type != AT_INDEX_ALLOCATION &&
7340			NInoCompressed(ni))) {
7341		ni->compressed_size += nr_allocated << vol->cluster_size_shift;
7342		a->compressed_size = cpu_to_sle64(ni->compressed_size);
7343	}
7344	lck_spin_unlock(&ni->size_lock);
7345	if (ni->name == I30 && ni->type == AT_INDEX_ALLOCATION) {
7346		lck_spin_lock(&base_ni->size_lock);
7347		base_ni->allocated_size = new_alloc_size;
7348		lck_spin_unlock(&base_ni->size_lock);
7349	}
7350alloc_done:
7351	if (new_data_size > sle64_to_cpu(a->data_size)) {
7352		if (!ubc_setsize(ni->vn, new_data_size)) {
7353			ntfs_error(vol->mp, "Failed to set size in UBC.");
7354			/*
7355			 * This can only happen if a previous resize failed and
7356			 * the UBC size was already out of date in which case
7357			 * we can just leave it out of date and continue to
7358			 * completion returning an error.  FIXME: We could roll
7359			 * back the changes to the metadata at some point but
7360			 * it does not seem worth it at the moment given that
7361			 * the error can only happen if there already was an
7362			 * error thus it is very unlikely.
7363			 */
7364			err = EIO;
7365		}
7366		lck_spin_lock(&ni->size_lock);
7367		ni->data_size = new_data_size;
7368		a->data_size = cpu_to_sle64(new_data_size);
7369		lck_spin_unlock(&ni->size_lock);
7370		if (ni->name == I30 && ni->type == AT_INDEX_ALLOCATION) {
7371			lck_spin_lock(&base_ni->size_lock);
7372			base_ni->data_size = new_data_size;
7373			lck_spin_unlock(&base_ni->size_lock);
7374		}
7375	}
7376dirty_done:
7377	/* Ensure the changes make it to disk. */
7378	NInoSetMrecNeedsDirtying(actx->ni);
7379	/*
7380	 * We have modified the size.  If the ntfs inode is the base inode,
7381	 * cause the sizes to be written to all the directory index entries
7382	 * pointing to the base inode when the inode is written to disk.  Do
7383	 * not do this for directories as they have both sizes set to zero in
7384	 * their index entries.
7385	 */
7386	if (ni == base_ni && !S_ISDIR(ni->mode))
7387		NInoSetDirtySizes(ni);
7388done:
7389	ntfs_attr_search_ctx_put(actx);
7390	ntfs_mft_record_unmap(base_ni);
7391	lck_rw_unlock_exclusive(&ni->rl.lock);
7392	ntfs_debug("Done, new_allocated_size 0x%llx.",
7393			(unsigned long long)new_alloc_size);
7394	if (dst_alloc_size)
7395		*dst_alloc_size = new_alloc_size;
7396	return err;
7397free_undo:
7398	/* We have not yet added an attribute list entry for the new extent. */
7399	al_entry_added = FALSE;
7400	goto free_extent;
7401undo:
7402	ntfs_attr_search_ctx_reinit(actx);
7403	if (is_first && !mp_rebuilt)
7404		goto undo_alloc;
7405	/* Look up the attribute extent we were working on. */
7406	if (ntfs_attr_lookup(ni->type, ni->name, ni->name_len, lowest_vcn,
7407			NULL, 0, actx)) {
7408		/* There is nothing we can do now, bail out. */
7409		ntfs_error(vol->mp, "Failed to find current attribute extent "
7410				"in error code path.  Leaving inconsistent "
7411				"metadata.  Run chkdsk.");
7412		NVolSetErrors(vol);
7413		goto err_out;
7414	}
7415	if (is_first)
7416		actx->a->highest_vcn = cpu_to_sle64(
7417				(alloc_size >> vol->cluster_size_shift) - 1);
7418undo_alloc:
7419	ll = alloc_size >> vol->cluster_size_shift;
7420	if (ntfs_cluster_free(ni, ll, -1, actx, &nr_freed)) {
7421		ntfs_error(vol->mp, "Failed to release allocated cluster(s) "
7422				"in error code path.  Run chkdsk to recover "
7423				"the lost cluster(s).");
7424		NVolSetErrors(vol);
7425		/*
7426		 * Still need to know how many real clusters are effectively
7427		 * truncated from the attribute extension.
7428		 */
7429		nr_freed = ntfs_rl_get_nr_real_clusters(&ni->rl, ll, -1);
7430	}
7431	m = actx->m;
7432	a = actx->a;
7433undo_hole:
7434	/*
7435	 * If the runlist truncation fails and/or the search context is no
7436	 * longer valid, we cannot resize the attribute record or build the
7437	 * mapping pairs array thus we mark the volume dirty and tell the user
7438	 * to run chkdsk.
7439	 */
7440	if (ntfs_rl_truncate_nolock(vol, &ni->rl, ll) || actx->is_error) {
7441		ntfs_error(vol->mp, "Failed to %s in error code path.  Run "
7442				"chkdsk.", actx->is_error ?
7443				"restore attribute search context" :
7444				"truncate attribute runlist");
7445		NVolSetErrors(vol);
7446	} else if (is_first) {
7447		if (mp_rebuilt) {
7448			/* We are working on the original extent, restore it. */
7449			if (ntfs_attr_record_resize(m, a, attr_len)) {
7450				ntfs_error(vol->mp, "Failed to restore "
7451						"attribute record in error "
7452						"code path.  Run chkdsk.");
7453				NVolSetErrors(vol);
7454			} else /* if (success) */ {
7455				mp_ofs = le16_to_cpu(a->mapping_pairs_offset);
7456				if (ntfs_mapping_pairs_build(vol, (s8*)a +
7457						mp_ofs, attr_len - mp_ofs,
7458						ni->rl.rl, lowest_vcn, -1,
7459						NULL)) {
7460					ntfs_error(vol->mp, "Failed to "
7461							"restore mapping "
7462							"pairs array in error "
7463							"code path.  Run "
7464							"chkdsk.");
7465					NVolSetErrors(vol);
7466				}
7467				if (actx->ni != base_ni)
7468					NInoSetMrecNeedsDirtying(actx->ni);
7469			}
7470		}
7471	} else if (/* !is_first && */ a->highest_vcn ==
7472			cpu_to_sle64(sle64_to_cpu(a->lowest_vcn) - 1)) {
7473		/* We need to delete the attribute list entry, too. */
7474		al_entry_added = TRUE;
7475		/* We are working on a new extent, remove it. */
7476		if (!ntfs_attr_record_is_only_one(m, a)) {
7477			ntfs_attr_record_delete_internal(m, a);
7478			if (actx->ni != base_ni)
7479				NInoSetMrecNeedsDirtying(actx->ni);
7480		} else {
7481free_extent:
7482			if (!ntfs_extent_mft_record_free(base_ni, actx->ni,
7483					m)) {
7484				/*
7485				 * The extent inode no longer exists.  Make it
7486				 * safe to release/reinit the search context.
7487				 */
7488				actx->ni = base_ni;
7489			} else {
7490				ntfs_error(vol->mp, "Failed to free extent "
7491						"mft record 0x%llx of mft_no "
7492						"0x%llx in error code path.  "
7493						"Leaving inconsistent "
7494						"metadata.  Run chkdsk.",
7495						(unsigned long long)
7496						actx->ni->mft_no,
7497						(unsigned long long)
7498						base_ni->mft_no);
7499				NVolSetErrors(vol);
7500			}
7501		}
7502		if (al_entry_added) {
7503			ntfs_attr_list_entry_delete(base_ni, actx->al_entry);
7504			ntfs_attr_search_ctx_reinit(actx);
7505			if (ntfs_attr_list_sync_shrink(base_ni, 0, actx)) {
7506				ntfs_error(vol->mp, "Failed to restore "
7507						"attribute list attribute in "
7508						"base inode 0x%llx.  Leaving "
7509						"inconsistent metadata.  "
7510						"Run chkdsk.",
7511						(unsigned long long)
7512						base_ni->mft_no);
7513				NVolSetErrors(vol);
7514			}
7515		}
7516	}
7517undo_do_trunc:
7518	lck_spin_lock(&ni->size_lock);
7519	if (alloc_size == ni->allocated_size) {
7520		lck_spin_unlock(&ni->size_lock);
7521		goto undo_skip_update_sizes;
7522	}
7523	lck_spin_unlock(&ni->size_lock);
7524	ntfs_attr_search_ctx_reinit(actx);
7525	/* Look up the first attribute extent. */
7526	if (ntfs_attr_lookup(ni->type, ni->name, ni->name_len, 0, NULL, 0,
7527			actx)) {
7528		/* There is nothing we can do now, bail out. */
7529		ntfs_error(vol->mp, "Failed to find first attribute extent in "
7530				"error code path.  Leaving inconsistent "
7531				"metadata.  Run chkdsk.");
7532		NVolSetErrors(vol);
7533		goto err_out;
7534	}
7535	a = actx->a;
7536	lck_spin_lock(&ni->size_lock);
7537	ni->allocated_size = alloc_size;
7538	a->allocated_size = cpu_to_sle64(alloc_size);
7539	if (NInoSparse(ni) || (ni->type != AT_INDEX_ALLOCATION &&
7540			NInoCompressed(ni))) {
7541		ni->compressed_size += (nr_allocated - nr_freed) <<
7542				vol->cluster_size_shift;
7543		a->compressed_size = cpu_to_sle64(ni->compressed_size);
7544	}
7545	lck_spin_unlock(&ni->size_lock);
7546	if (ni->name == I30 && ni->type == AT_INDEX_ALLOCATION) {
7547		lck_spin_lock(&base_ni->size_lock);
7548		base_ni->allocated_size = alloc_size;
7549		lck_spin_unlock(&base_ni->size_lock);
7550	}
7551	/* Ensure the changes make it to disk. */
7552	if (actx->ni != base_ni)
7553		NInoSetMrecNeedsDirtying(actx->ni);
7554	/*
7555	 * We have modified the size.  If the ntfs inode is the base inode,
7556	 * cause the sizes to be written to all the directory index entries
7557	 * pointing to the base inode when the inode is written to disk.  Do
7558	 * not do this for directories as they have both sizes set to zero in
7559	 * their index entries.
7560	 */
7561	if (ni == base_ni && !S_ISDIR(ni->mode))
7562		NInoSetDirtySizes(ni);
7563undo_skip_update_sizes:
7564	ntfs_attr_search_ctx_put(actx);
7565	NInoSetMrecNeedsDirtying(base_ni);
7566	ntfs_mft_record_unmap(base_ni);
7567	lck_rw_unlock_exclusive(&ni->rl.lock);
7568	/*
7569	 * Things are now consistent, try to truncate the attribute back to its
7570	 * old size which will cause the allocation to be restored to its old
7571	 * size.
7572	 *
7573	 * TODO: We should support partial allocations and when we do so we
7574	 * should only put the allocated size back if the error was not ENOSPC
7575	 * and partial allocations are acceptable for this attribute.  In that
7576	 * case would also need to update @ni->data_size, @a->data_size, and
7577	 * the size in the vnode @ni->vn via ubc_setsize().
7578	 */
7579	if (!is_first) {
7580		lck_spin_lock(&ni->size_lock);
7581		ll = ni->data_size;
7582		lck_spin_unlock(&ni->size_lock);
7583		if (ntfs_attr_resize(ni, ll, 0, ictx)) {
7584			ntfs_error(vol->mp, "Failed to undo partial "
7585					"allocation in inode 0x%llx in error "
7586					"code path.",
7587					(unsigned long long)base_ni->mft_no);
7588			NVolSetErrors(vol);
7589		}
7590	}
7591conv_err_out:
7592	ntfs_debug("Failed (error %d).", err);
7593	return err;
7594err_out:
7595	if (actx)
7596		ntfs_attr_search_ctx_put(actx);
7597	if (base_m)
7598		ntfs_mft_record_unmap(base_ni);
7599	lck_rw_unlock_exclusive(&ni->rl.lock);
7600	goto conv_err_out;
7601trunc_err_out:
7602	mp_rebuilt = FALSE;
7603	if (is_sparse) {
7604		ll = alloc_size >> vol->cluster_size_shift;
7605		/*
7606		 * Silence compiler warning about possible use of uninitalized
7607		 * variable.
7608		 */
7609		attr_len = 0;
7610		goto undo_hole;
7611	}
7612	goto err_out;
7613}
7614
7615/**
7616 * ntfs_attr_resize - called to change the size of an ntfs attribute inode
7617 * @ni:		ntfs inode for which to change the size
7618 * @new_size:	new size in bytes to which to resize the ntfs attribute @ni
7619 * @ioflags:	flags further describing the resize request
7620 * @ictx:	index context or NULL
7621 *
7622 * Resize the attribute described by the ntfs inode @ni to @new_size bytes.
7623 *
7624 * Note: We only support size changes for normal attributes at present, i.e.
7625 * not compressed and not encrypted.
7626 *
7627 * The flags in @ioflags further describe the resize request.  The following
7628 * ioflags are currently defined in OS X kernel (a lot of them are not
7629 * applicable to resize requests however):
7630 *	IO_UNIT		- Do i/o as atomic unit.
7631 *	IO_APPEND	- Append write to end.
7632 *	IO_SYNC		- Do i/o synchronously.
7633 *	IO_NODELOCKED	- Underlying node already locked.
7634 *	IO_NDELAY	- FNDELAY flag set in file table.
7635 *	IO_NOZEROFILL	- F_SETSIZE fcntl uses this to prevent zero filling.
7636 *	IO_TAILZEROFILL	- Zero fills at the tail of write.
7637 *	IO_HEADZEROFILL	- Zero fills at the head of write.
7638 *	IO_NOZEROVALID	- Do not zero fill if valid page.
7639 *	IO_NOZERODIRTY	- Do not zero fill if page is dirty.
7640 *	IO_CLOSE	- The i/o was issued from close path.
7641 *	IO_NOCACHE	- Same effect as VNOCACHE_DATA, but only for this i/o.
7642 *	IO_RAOFF	- Same effect as VRAOFF, but only for this i/o.
7643 *	IO_DEFWRITE	- Defer write if vfs.defwrite is set.
7644 *	IO_PASSIVE	- This is background i/o so do not throttle other i/o.
7645 * In particular the only flags that are used in the kernel when calling
7646 * vnode_setsize() are IO_SYNC and IO_NOZEROFILL.
7647 *
7648 * TODO: The @ioflags are currently ignored.
7649 *
7650 * If @ictx is not NULL, the resize is for an index allocation or bitmap
7651 * attribute extension.  In this case, if there is not enough space in the mft
7652 * record for the extended index allocation/bitmap attribute, the index root is
7653 * moved to an index block if it is not empty to create more space in the mft
7654 * record.
7655 *
7656 * Return 0 on success and errno on error.
7657 *
7658 * Locking: - Caller must hold @ni->lock on the inode for writing.
7659 *	    - If called for a shrinking operation, the tail of the new final
7660 *	      partial page will be zeroed by the call to ubc_setsize() thus it
7661 *	      must not be locked / mapped or the ubc_setsize() call would
7662 *	      deadlock.
7663 */
7664errno_t ntfs_attr_resize(ntfs_inode *ni, s64 new_size, int ioflags,
7665		ntfs_index_context *ictx)
7666{
7667	s64 old_size, nr_freed, new_alloc_size, old_alloc_size, compressed_size;
7668	VCN highest_vcn, old_highest_vcn, lowest_vcn;
7669	ntfs_inode *eni, *base_ni;
7670	ntfs_volume *vol = ni->vol;
7671	ntfs_attr_search_ctx *actx;
7672	MFT_RECORD *m;
7673	ATTR_RECORD *a;
7674	ATTR_LIST_ENTRY *al_entry;
7675	u8 *del_al_start, *al_end;
7676	int size_change, alloc_change;
7677	unsigned mp_size, attr_len, arec_size;
7678	errno_t err;
7679	BOOL need_ubc_setsize = TRUE;
7680	static const char es[] = "  Leaving inconsistent metadata.  Unmount "
7681			"and run chkdsk.";
7682
7683	ntfs_debug("Entering for mft_no 0x%llx.",
7684			(unsigned long long)ni->mft_no);
7685	/*
7686	 * Cannot be called for directory inodes as metadata access happens via
7687	 * the corresponding index inodes.
7688	 */
7689	if (S_ISDIR(ni->mode))
7690		panic("%s(): Called for directory inode.\n", __FUNCTION__);
7691	base_ni = ni;
7692	if (NInoAttr(ni))
7693		base_ni = ni->base_ni;
7694	/*
7695	 * We are going to change the size thus we need the ntfs inode lock
7696	 * taken for exclusive access which is already done by the caller.
7697	 *
7698	 * When shrinking start by changing the size in the UBC of the vnode.
7699	 * This will cause all pages in the VM beyond the new size to be thrown
7700	 * away and the last page to be pushed out to disk and its end
7701	 * invalidated.
7702	 *
7703	 * We guarantee that the size in the UBC in the vnode will always be
7704	 * smaller or equal to the data_size in the ntfs inode thus no need to
7705	 * check the data_size.
7706	 */
7707	old_size = ubc_getsize(ni->vn);
7708	if (new_size < old_size) {
7709		err = ubc_setsize(ni->vn, new_size);
7710		if (!err) {
7711			ntfs_error(vol->mp, "Failed to shrink size in UBC.");
7712			err = EIO;
7713			goto err;
7714		}
7715		need_ubc_setsize = FALSE;
7716	}
7717retry_resize:
7718	/*
7719	 * Lock the runlist for writing and map the mft record to ensure it is
7720	 * safe to modify the attribute runlist and sizes.
7721	 */
7722	lck_rw_lock_exclusive(&ni->rl.lock);
7723	err = ntfs_mft_record_map(base_ni, &m);
7724	if (err) {
7725		ntfs_error(vol->mp, "Failed to map mft record for mft_no "
7726				"0x%llx (error %d).",
7727				(unsigned long long)ni->mft_no, err);
7728		goto unl_err;
7729	}
7730	actx = ntfs_attr_search_ctx_get(base_ni, m);
7731	if (!actx) {
7732		ntfs_error(vol->mp, "Failed to allocate a search context (not "
7733				"enough memory).");
7734		err = ENOMEM;
7735		goto unm_err;
7736	}
7737	err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, 0, NULL, 0,
7738			actx);
7739	if (err) {
7740		if (err == ENOENT) {
7741			ntfs_error(vol->mp, "Open attribute is missing from "
7742					"mft record.  Inode 0x%llx is "
7743					"corrupt.  Run chkdsk.",
7744					(unsigned long long)ni->mft_no);
7745			err = EIO;
7746		} else
7747			ntfs_error(vol->mp, "Failed to lookup attribute "
7748					"(error %d).", err);
7749		goto put_err;
7750	}
7751	m = actx->m;
7752	a = actx->a;
7753	if (old_size != ntfs_attr_size(a)) {
7754		/*
7755		 * A failed truncate caused the ubc size to get out of sync.
7756		 * The current size of the attribute value is the correct old
7757		 * size.
7758		 */
7759		old_size = ntfs_attr_size(a);
7760	}
7761	/* Calculate the new allocated size. */
7762	if (NInoNonResident(ni))
7763		new_alloc_size = (new_size + vol->cluster_size - 1) &
7764				~(s64)vol->cluster_size_mask;
7765	else
7766		new_alloc_size = (new_size + 7) & ~7;
7767	/* The current allocated size is the old allocated size. */
7768	lck_spin_lock(&ni->size_lock);
7769	old_alloc_size = ni->allocated_size;
7770	compressed_size = ni->compressed_size;
7771	lck_spin_unlock(&ni->size_lock);
7772	/*
7773	 * The change in the file size.  This will be 0 if no change, >0 if the
7774	 * size is growing, and <0 if the size is shrinking.
7775	 */
7776	size_change = -1;
7777	if (new_size - old_size >= 0) {
7778		size_change = 1;
7779		if (new_size == old_size)
7780			size_change = 0;
7781	}
7782	if (need_ubc_setsize && size_change < 0) {
7783		/*
7784		 * A previous truncate failed thus we did not catch that this
7785		 * is a shrinking resize earlier on.
7786		 */
7787		err = ubc_setsize(ni->vn, new_size);
7788		if (!err) {
7789			ntfs_error(vol->mp, "Failed to shrink size in UBC.");
7790			err = EIO;
7791			goto put_err;
7792		}
7793		need_ubc_setsize = FALSE;
7794	}
7795	/* As above for the allocated size. */
7796	alloc_change = -1;
7797	if (new_alloc_size - old_alloc_size >= 0) {
7798		alloc_change = 1;
7799		if (new_alloc_size == old_alloc_size)
7800			alloc_change = 0;
7801	}
7802	/*
7803	 * If neither the size nor the allocation are being changed there is
7804	 * nothing to do.
7805	 */
7806	if (!size_change && !alloc_change)
7807		goto unm_done;
7808	/* If the size is changing, check if new size is allowed in $AttrDef. */
7809	if (size_change) {
7810		err = ntfs_attr_size_bounds_check(vol, ni->type, new_size);
7811		if (err) {
7812			if (err == ERANGE) {
7813				ntfs_error(vol->mp, "Resize would cause the "
7814						"mft_no 0x%llx to %simum size "
7815						"for its attribute type "
7816						"(0x%x).  Aborting resize.",
7817						(unsigned long long)ni->mft_no,
7818						size_change > 0 ? "exceed "
7819						"the max" : "go under the min",
7820						(unsigned)
7821						le32_to_cpu(ni->type));
7822				err = EFBIG;
7823			} else {
7824				ntfs_error(vol->mp, "Mft_no 0x%llx has "
7825						"unknown attribute type "
7826						"0x%x.  Aborting resize.",
7827						(unsigned long long)ni->mft_no,
7828						(unsigned)
7829						le32_to_cpu(ni->type));
7830				err = EIO;
7831			}
7832			goto put_err;
7833		}
7834	}
7835	/*
7836	 * The index root attribute, i.e. directory indexes and index inodes
7837	 * can be marked compressed or encrypted but this means to create
7838	 * compressed/encrypted files, not that the attribute is
7839	 * compressed/encrypted.
7840	 */
7841	if (ni->type != AT_INDEX_ALLOCATION &&
7842			(NInoCompressed(ni) || NInoEncrypted(ni))) {
7843		ntfs_warning(vol->mp, "Changes in inode size are not "
7844				"supported yet for %s attributes, ignoring.",
7845				NInoCompressed(ni) ? "compressed" :
7846				"encrypted");
7847		err = ENOTSUP;
7848		goto put_err;
7849	}
7850	if (a->non_resident)
7851		goto do_non_resident_resize;
7852	if (NInoNonResident(ni))
7853		panic("%s(): NInoNonResident(ni)\n", __FUNCTION__);
7854	arec_size = (le16_to_cpu(a->value_offset) + new_size + 7) & ~7;
7855	/* Resize the attribute record to best fit the new attribute size. */
7856	if (new_size < vol->mft_record_size &&
7857			!ntfs_resident_attr_value_resize(m, a, new_size)) {
7858		/* The resize succeeded! */
7859		NInoSetMrecNeedsDirtying(actx->ni);
7860		lck_spin_lock(&ni->size_lock);
7861		/* Update the sizes in the ntfs inode and all is done. */
7862		ni->allocated_size = le32_to_cpu(a->length) -
7863				le16_to_cpu(a->value_offset);
7864		ni->data_size = le32_to_cpu(a->value_length);
7865		/*
7866		 * Note ntfs_resident_attr_value_resize() has already done any
7867		 * necessary data clearing in the attribute record.  When the
7868		 * file is being shrunk ubc_setsize() will already have zeroed
7869		 * the last partial page, i.e. since this is the resident case
7870		 * this is the page with index 0.  However, when the file is
7871		 * being expanded, the page cache page data between the old
7872		 * data_size, i.e. old_size, and the new_size has not been
7873		 * zeroed.  Fortunately, we do not need to zero it either since
7874		 * on one hand it will either already be zero due to pagein
7875		 * clearing partial page data beyond the data_size in which
7876		 * case there is nothing to do or in the case of the file being
7877		 * mmap()ped at the same time, POSIX specifies that the
7878		 * behaviour is unspecified thus we do not have to do anything.
7879		 * This means that in our implementation in the rare case that
		 * the file is mmap()ped and a write occurred into the mmap()ped
7881		 * region just beyond the file size and we now extend the file
7882		 * size to incorporate this dirty region outside the file size,
7883		 * a pageout of the page would result in this data being
7884		 * written to disk instead of being cleared.  Given POSIX
7885		 * specifies that this corner case is undefined, we choose to
7886		 * leave it like that as this is much simpler for us as we
7887		 * cannot lock the relevant page now since we are holding too
7888		 * many ntfs locks which would result in lock reversal
7889		 * deadlocks.
7890		 */
7891		ni->initialized_size = new_size;
7892		lck_spin_unlock(&ni->size_lock);
7893		goto unm_done;
7894	}
7895	/* If the above resize failed, this must be an attribute extension. */
7896	if (size_change < 0)
7897		panic("%s(): size_change < 0\n", __FUNCTION__);
7898	/*
7899	 * Not enough space in the mft record.  If this is an index related
7900	 * extension, check if the index root attribute is in the same mft
7901	 * record as the attribute being extended and if it is and it is not
7902	 * empty move its entries into an index allocation block.  Note we do
7903	 * not check whether that actually creates enough space because how
7904	 * much space is needed exactly is very hard to determine in advance
7905	 * (due to potential need for associated attribute list attribute
7906	 * extensions) and also because even if it does not create enough space
7907	 * it will still help and save work later on when working for example
7908	 * on the attribute list attribute.
7909	 */
7910	if (ictx) {
7911		long delta;
7912		INDEX_ROOT *ir;
7913		INDEX_HEADER *ih;
7914		INDEX_ENTRY *ie, *first_ie;
7915		ntfs_index_context *root_ictx;
7916		ntfs_attr_search_ctx root_actx;
7917
7918		/*
7919		 * This must be an index bitmap extension.  An index allocation
7920		 * extension is also possible but not here as that cannot be
7921		 * resident.
7922		 */
7923		if (ni->type != AT_BITMAP)
7924			panic("%s(): ni->type != AT_BITMAP\n", __FUNCTION__);
7925		ntfs_attr_search_ctx_init(&root_actx, actx->ni, m);
7926		err = ntfs_attr_find_in_mft_record(AT_INDEX_ROOT, ni->name,
7927				ni->name_len, NULL, 0, &root_actx);
7928		if (err) {
7929			if (err != ENOENT) {
7930				ntfs_error(vol->mp, "Failed to find index "
7931						"root attribute in mft_no "
7932						"0x%llx (error %d).  Inode is "
7933						"corrupt.  Run chkdsk.",
7934						(unsigned long long)ni->mft_no,
7935						err);
7936				NVolSetErrors(vol);
7937			}
7938			/*
7939			 * The index root is in a different mft record so we
7940			 * cannot gain anything by moving out its entries.  Set
7941			 * @ictx to NULL so we do not waste our time trying
7942			 * again.
7943			 */
7944			ictx = NULL;
7945			goto ictx_done;
7946		}
7947		/*
7948		 * We found the index root in the same mft record as the
7949		 * attribute to be extended.  Check whether it is empty or not.
7950		 */
7951		ir = (INDEX_ROOT*)((u8*)root_actx.a +
7952				le16_to_cpu(root_actx.a->value_offset));
7953		ih = &ir->index;
7954		first_ie = ie = (INDEX_ENTRY*)((u8*)ih +
7955				le32_to_cpu(ih->entries_offset));
7956		while (!(ie->flags & INDEX_ENTRY_END))
7957			ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length));
7958		/*
7959		 * If there are no entries other than the end entry we cannot
7960		 * gain anything by moving out the entries from the index root.
7961		 * Set @ictx to NULL so we do not waste our time trying again.
7962		 */
7963		if (ie == first_ie) {
7964			ictx = NULL;
7965			goto ictx_done;
7966		}
7967		/*
7968		 * We cannot have gotten this far if the current index context
7969		 * is locked and/or it is the index root.
7970		 *
7971		 * Also, we need to drop the runlist lock we are holding as it
7972		 * may need to be taken when moving the entries from the index
7973		 * root to the index allocation block.
7974		 */
7975		if (ictx->is_locked)
7976			panic("%s(): ictx->is_locked\n", __FUNCTION__);
7977		if (ictx->is_root)
7978			panic("%s(): ictx->is_root\n", __FUNCTION__);
7979		lck_rw_unlock_exclusive(&ni->rl.lock);
7980		/* Find the index root by walking up the tree path. */
7981		root_ictx = ictx;
7982		while (!root_ictx->is_root) {
7983			root_ictx = root_ictx->up;
7984			/*
7985			 * If we go all the way round to the beginning without
7986			 * finding the root something has gone badly wrong.
7987			 */
7988			if (root_ictx == ictx)
7989				panic("%s(): root_ictx == ictx\n",
7990						__FUNCTION__);
7991		}
7992		/*
7993		 * We need a proper deallocatable attribute search context thus
7994		 * switch the one pointing to the attribute to be resized to
7995		 * point to the index root.  Note we are not updating
7996		 * @actx->al_entry as this is not going to be touched at all.
7997		 * Having said that set it to NULL just in case.
7998		 */
7999		actx->a = root_actx.a;
8000		actx->al_entry = NULL;
8001		/*
8002		 * Lock the index root node.  We already have the index root
8003		 * attribute thus only need to do the revalidation part of
8004		 * re-locking.
8005		 */
8006		root_ictx->is_locked = 1;
8007		root_ictx->actx = actx;
8008		root_ictx->bytes_free = le32_to_cpu(m->bytes_allocated) -
8009				le32_to_cpu(m->bytes_in_use);
8010		root_ictx->ir = ir;
8011		delta = (u8*)ih - (u8*)root_ictx->index;
8012		if (delta) {
8013			INDEX_ENTRY **entries;
8014			unsigned u;
8015
8016			root_ictx->index = ih;
8017			root_ictx->entry = (INDEX_ENTRY*)(
8018					(u8*)root_ictx->entry + delta);
8019			entries = root_ictx->entries;
8020			for (u = 0; u < root_ictx->nr_entries; u++)
8021				entries[u] = (INDEX_ENTRY*)((u8*)entries[u] +
8022						delta);
8023		}
8024		/*
8025		 * Move the index root entries to an index allocation block.
8026		 *
8027		 * Note we do not need to worry about this causing infinite
8028		 * recursion in the case that we were called from
8029		 * ntfs_index_block_alloc() which was called from
8030		 * ntfs_index_move_root_to_allocation_block() because the
8031		 * latter will have emptied the index root before calling
8032		 * ntfs_index_block_alloc() thus we will bail out above when
8033		 * checking whether the index root is empty the second time
8034		 * round and the recursion will stop there.  This is a very
		 * seldom occurrence thus there is no point in special casing it
8036		 * in the code in a more efficient but more complicated way.
8037		 *
8038		 * A complication is that ntfs_attr_resize() may have been
8039		 * called from ntfs_index_block_alloc() and in this case when
8040		 * we call ntfs_index_move_root_to_allocation_block() it will
8041		 * call ntfs_index_block_alloc() again which will cause a
8042		 * deadlock (or with lock debugging enabled panic()) because
8043		 * ntfs_index_block_alloc() takes the bitmap inode lock for
8044		 * writing.  To avoid this ntfs_index_block_alloc() sets
8045		 * @ictx->bmp_is_locked and we need to set
		 * @root_ictx->bmp_is_locked to the same value so that when
8047		 * ntfs_index_move_root_to_allocation_block() calls
8048		 * ntfs_index_block_alloc() the latter will know not to take
8049		 * the bitmap inode lock again.
8050		 */
8051		root_ictx->bmp_is_locked = ictx->bmp_is_locked;
8052		err = ntfs_index_move_root_to_allocation_block(root_ictx);
8053		if (root_ictx != ictx)
8054			root_ictx->bmp_is_locked = 0;
8055		if (err) {
8056			ntfs_error(vol->mp, "Failed to move index root to "
8057					"index allocation block (error %d).",
8058					err);
8059			if (root_ictx->is_locked)
8060				ntfs_index_ctx_unlock(root_ictx);
8061			/*
8062			 * This is a disaster as it means the index context is
8063			 * no longer valid thus we have to bail out all the
8064			 * way.
8065			 */
8066			goto err;
8067		}
8068		/* Unlock the newly created index block. */
8069		if (root_ictx->is_root)
8070			panic("%s(): root_ictx->is_root\n", __FUNCTION__);
8071		if (!root_ictx->is_locked)
8072			panic("%s(): !root_ictx->is_locked\n", __FUNCTION__);
8073		ntfs_index_ctx_unlock(root_ictx);
8074		/*
8075		 * We are done.  The index root is now empty thus the mft
8076		 * record should now have enough space.  Because we dropped the
8077		 * mft record when moving the index root entries into the index
8078		 * allocation block we need to restart the attribute resize
8079		 * again.
8080		 *
8081		 * But first we set @ictx to NULL so we do not get here again
8082		 * in the case that there still is not enough free space.  This
8083		 * is not a disaster as we can just carry on doing other
8084		 * rearrangements to free up enough space in the mft record.
8085		 */
8086		ictx = NULL;
8087		goto retry_resize;
8088	}
8089ictx_done:
8090	/*
8091	 * We have to drop all the locks so we can call
8092	 * ntfs_attr_make_non_resident().
8093	 */
8094	ntfs_attr_search_ctx_put(actx);
8095	ntfs_mft_record_unmap(base_ni);
8096	lck_rw_unlock_exclusive(&ni->rl.lock);
8097	/*
8098	 * Not enough space in the mft record, try to make the attribute
8099	 * non-resident and if successful restart the truncation process.
8100	 */
8101	err = ntfs_attr_make_non_resident(ni);
8102	if (!err)
8103		goto retry_resize;
8104	/*
8105	 * Could not make non-resident.  If this is due to this not being
8106	 * permitted for this attribute type try to make other attributes
8107	 * non-resident and/or move this or other attributes out of the mft
8108	 * record this attribute is in.  Otherwise fail.
8109	 */
8110	if (err != EPERM) {
8111		if (err != ENOSPC) {
8112			ntfs_error(vol->mp, "Cannot truncate mft_no 0x%llx, "
8113					"attribute type 0x%x, because the "
8114					"conversion from resident to "
8115					"non-resident attribute failed (error "
8116					"%d).", (unsigned long long)ni->mft_no,
8117					(unsigned)le32_to_cpu(ni->type), err);
8118			if (err != ENOMEM) {
8119				NVolSetErrors(vol);
8120				err = EIO;
8121			}
8122		}
8123		goto err;
8124	}
8125	/*
8126	 * To make space in the mft record we would like to try to make other
8127	 * attributes non-resident if that would save space.
8128	 *
8129	 * FIXME: We cannot do this at present unless the attribute is the
8130	 * attribute being resized as there could be an ntfs inode matching
8131	 * this attribute in memory and it would become out of date with its
8132	 * metadata if we touch its attribute record.
8133	 *
8134	 * FIXME: We do not need to do this if this is the attribute being
8135	 * resized as we already tried to make the attribute non-resident and
8136	 * it did not work or we would never have gotten here in the first
8137	 * place.
8138	 *
8139	 * Thus we have to either move other attributes to extent mft records
8140	 * thus making more space in the base mft record or we have to move the
8141	 * attribute being resized to an extent mft record thus giving it more
8142	 * space.  In any case we need to have an attribute list attribute so
8143	 * start by adding it if it does not yet exist.
8144	 *
8145	 * Before we start, we can check whether it is possible to fit the
8146	 * attribute to be resized inside an mft record.  If not then there is
8147	 * no point in proceeding.
8148	 *
8149	 * This should never really happen as the attribute size should never
8150	 * be allowed to grow so much and such requests should never be made by
8151	 * the driver and if they are they should be caught by the call to
8152	 * ntfs_attr_size_bounds_check().
8153	 */
8154	if (arec_size > vol->mft_record_size - sizeof(MFT_RECORD)) {
8155		ntfs_error(vol->mp, "Cannot truncate mft_no 0x%llx, attribute "
8156				"type 0x%x, because the attribute may not be "
8157				"non-resident and the requested size exceeds "
8158				"the maximum possible resident attribute "
8159				"record size.", (unsigned long long)ni->mft_no,
8160				(unsigned)le32_to_cpu(ni->type));
8161		/* Use POSIX conformant truncate(2) error code. */
8162		err = EFBIG;
8163		goto err;
8164	}
8165	/*
8166	 * The resident attribute can fit in an mft record.  Now have to decide
8167	 * whether to make other attributes non-resident/move other attributes
8168	 * out of the mft record or whether to move the attribute record to be
8169	 * resized out to a new mft record.
8170	 *
8171	 * TODO: We never call ntfs_attr_resize() for attributes that cannot be
8172	 * non-resident thus we never get here thus we simply panic() here to
8173	 * remind us that we need to implement this code if we ever start
8174	 * calling this function for attributes that must remain resident.
8175	 */
8176	panic("%s(): Attribute may not be non-resident.\n", __FUNCTION__);
8177do_non_resident_resize:
8178	if (!NInoNonResident(ni))
8179		panic("%s(): !NInoNonResident(ni)\n", __FUNCTION__);
8180	/*
8181	 * If the size is shrinking, need to reduce the initialized_size and
8182	 * the data_size before reducing the allocation.
8183	 */
8184	if (size_change < 0) {
8185		/*
8186		 * Make the valid size smaller (the UBC size is already
8187		 * up-to-date).
8188		 */
8189		lck_spin_lock(&ni->size_lock);
8190		if (new_size < ni->initialized_size) {
8191			ni->initialized_size = new_size;
8192			a->initialized_size = cpu_to_sle64(new_size);
8193			lck_spin_unlock(&ni->size_lock);
8194			if (ni->name == I30 &&
8195					ni->type == AT_INDEX_ALLOCATION) {
8196				lck_spin_lock(&base_ni->size_lock);
8197				base_ni->initialized_size = new_size;
8198				lck_spin_unlock(&base_ni->size_lock);
8199			}
8200		} else
8201			lck_spin_unlock(&ni->size_lock);
8202		/*
8203		 * If the size is shrinking it makes no sense for the
8204		 * allocation to be growing.
8205		 */
8206		if (alloc_change > 0)
8207			panic("%s(): alloc_change > 0\n", __FUNCTION__);
8208	} else if (/*size_change >= 0 && */ alloc_change > 0){
8209		/*
8210		 * The file size is growing or staying the same but the
8211		 * allocation can be shrinking, growing or staying the same.
8212		 *
		 * If the allocation is shrinking or staying the same we fall
		 * down into the same code path as the size shrinking case,
		 * i.e. the allocation shrinking code below.
8216		 *
8217		 * Only if the allocation is growing do we need to extend the
8218		 * allocation and possibly update the data size here.  If we
8219		 * are updating the data size, since we are not touching the
8220		 * initialized_size we do not need to worry about the actual
8221		 * data on disk.  And as far as the VM pages are concerned,
8222		 * there will be no pages beyond the old data size and any
8223		 * partial region in the last page between the old and new data
8224		 * size (or the end of the page if the new data size is outside
8225		 * the page) does not need to be modified as explained above
8226		 * for the resident attribute resize case.  To do this, we
8227		 * simply drop the locks we hold and leave all the work to our
8228		 * friendly helper ntfs_attr_extend_allocation().
8229		 *
8230		 * Note by setting @data_start to -1 (last parameter to
8231		 * ntfs_attr_extend_allocation()) we guarantee that the
8232		 * allocation is not partial.
8233		 */
8234		ntfs_attr_search_ctx_put(actx);
8235		ntfs_mft_record_unmap(base_ni);
8236		lck_rw_unlock_exclusive(&ni->rl.lock);
8237		err = ntfs_attr_extend_allocation(ni, new_size,
8238				size_change > 0 ? new_size : -1, -1, ictx,
8239				NULL, FALSE);
8240		if (err)
8241			goto err;
8242		goto done;
8243	}
8244	/* alloc_change <= 0 */
8245	/* If the actual size is changing need to update it now. */
8246	if (size_change) {
8247		lck_spin_lock(&ni->size_lock);
8248		ni->data_size = new_size;
8249		a->data_size = cpu_to_sle64(new_size);
8250		lck_spin_unlock(&ni->size_lock);
8251		if (ni->name == I30 && ni->type == AT_INDEX_ALLOCATION) {
8252			lck_spin_lock(&base_ni->size_lock);
8253			base_ni->data_size = new_size;
8254			lck_spin_unlock(&base_ni->size_lock);
8255		}
8256	}
8257	/* Ensure the modified mft record is written out. */
8258	NInoSetMrecNeedsDirtying(actx->ni);
8259	/* If the allocated size is not changing, we are done. */
8260	if (!alloc_change)
8261		goto unm_done;
8262	/*
8263	 * Free the clusters.  Note we cannot recover once this is done because
8264	 * someone else can allocate the clusters at any point after we free
8265	 * them.  Thus any errors will lead to a more or less corrupt file
8266	 * system depending on how consistent we can make the volume after an
8267	 * error occurs.
8268	 */
8269	err = ntfs_cluster_free(ni, new_alloc_size >>
8270			vol->cluster_size_shift, -1, actx, &nr_freed);
8271	m = actx->m;
8272	a = actx->a;
8273	if (err) {
8274		ntfs_error(vol->mp, "Failed to release cluster(s) (error "
8275				"%d).  Unmount and run chkdsk to recover the "
8276				"lost cluster(s).", err);
8277		NVolSetErrors(vol);
8278	} else {
8279		/*
8280		 * Truncate the runlist.  The call to ntfs_cluster_free() has
8281		 * already ensured that all needed runlist fragments have been
8282		 * mapped so we do not need to worry about mapping runlist
8283		 * fragments here.  Note given we have managed to read all the
8284		 * runlist fragments already the chances of us failing anywhere
8285		 * in the below code is very small indeed.  Only running out of
8286		 * memory or a disk/sector failure between the above
8287		 * ntfs_cluster_free() call and the below calls can cause us to
8288		 * fail here.
8289		 *
8290		 * FIXME: Note that this is not quite true as if
8291		 * ntfs_cluster_free() aborts with an error it may not have
8292		 * gotten round to mapping the runlist fragments.  If this
8293		 * happens ntfs_rl_truncate_nolock() could end up doing a lot
8294		 * of weird things so we only call it if the
8295		 * ntfs_cluster_free() succeeded for now.
8296		 */
8297		err = ntfs_rl_truncate_nolock(vol, &ni->rl, new_alloc_size >>
8298				vol->cluster_size_shift);
8299	}
8300	/*
8301	 * If the runlist truncation failed and/or the search context is no
8302	 * longer valid, we cannot resize the attribute record or build the
8303	 * mapping pairs array thus we abort.
8304	 */
8305	if (err || actx->is_error) {
8306		if (actx->is_error)
8307			err = actx->error;
8308		ntfs_error(vol->mp, "Failed to %s (error %d).%s",
8309				actx->is_error ?
8310				"restore attribute search context" :
8311				"truncate attribute runlist", err, es);
8312		err = EIO;
8313		goto bad_out;
8314	}
8315	/*
8316	 * The runlist is now up to date.  If this attribute is sparse we need
8317	 * to check if it is still sparse and if not we need to change it to a
8318	 * non-sparse file.  And if it is still sparse we need to update the
8319	 * compressed size which we postpone till later so we can do it at the
8320	 * same time as the update of the allocated size.
8321	 *
8322	 * To determine whether the attribute is still sparse we compare the
8323	 * new compressed size to the new allocated size.  If the two have now
8324	 * become the same the attribute is no longer sparse.  If the
8325	 * compressed size is still smaller than the allocated size the
8326	 * attribute is still sparse.
8327	 */
8328	compressed_size -= nr_freed << vol->cluster_size_shift;
8329	if (NInoSparse(ni) && compressed_size >= new_alloc_size) {
8330		if (compressed_size > new_alloc_size)
8331			panic("%s(): compressed_size > new_alloc_size\n",
8332					__FUNCTION__);
8333		/* Switch the attribute to not be sparse any more. */
8334		ntfs_attr_sparse_clear(base_ni, ni, actx);
8335	}
8336	/* Update the allocated/compressed size. */
8337	lck_spin_lock(&ni->size_lock);
8338	ni->allocated_size = new_alloc_size;
8339	a->allocated_size = cpu_to_sle64(new_alloc_size);
8340	if (NInoSparse(ni) || (ni->type != AT_INDEX_ALLOCATION &&
8341			NInoCompressed(ni))) {
8342		if (nr_freed) {
8343			if (compressed_size < 0)
8344				panic("%s(): compressed_size < 0\n",
8345						__FUNCTION__);
8346			ni->compressed_size = compressed_size;
8347			a->compressed_size = cpu_to_sle64(ni->compressed_size);
8348		}
8349	}
8350	lck_spin_unlock(&ni->size_lock);
8351	if (ni->name == I30 && ni->type == AT_INDEX_ALLOCATION) {
8352		lck_spin_lock(&base_ni->size_lock);
8353		base_ni->allocated_size = new_alloc_size;
8354		lck_spin_unlock(&base_ni->size_lock);
8355	}
8356	/*
8357	 * We have the base attribute extent in @actx and we have set it up
8358	 * already with the new allocated size.  If the truncation point is not
8359	 * in the base extent, need to switch to the extent containing the
8360	 * truncation point now so we can update its attribute record, too.
8361	 * But before doing so need to ensure the modified mft record is
8362	 * written out.
8363	 */
8364	highest_vcn = new_alloc_size >> vol->cluster_size_shift;
8365	old_highest_vcn = sle64_to_cpu(a->highest_vcn) + 1;
8366	ntfs_debug("highest_vcn 0x%llx, old_highest_vcn 0x%llx.",
8367			(unsigned long long)highest_vcn,
8368			(unsigned long long)old_highest_vcn);
8369	if (highest_vcn >= old_highest_vcn) {
8370		NInoSetMrecNeedsDirtying(actx->ni);
8371		err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
8372				highest_vcn, NULL, 0, actx);
8373		if (err) {
8374			if (err == ENOENT)
8375				ntfs_error(vol->mp, "Attribute extent is "
8376						"missing from mft_no 0x%llx.  "
8377						"Run chkdsk.",
8378						(unsigned long long)
8379						ni->mft_no);
8380			else
8381				ntfs_error(vol->mp, "Failed to lookup "
8382						"attribute extent in mft_no "
8383						"0x%llx (error %d).%s",
8384						(unsigned long long)
8385						ni->mft_no, err, es);
8386			err = EIO;
8387			goto bad_out;
8388		}
8389		m = actx->m;
8390		a = actx->a;
8391		old_highest_vcn = sle64_to_cpu(a->highest_vcn) + 1;
8392		ntfs_debug("Switched to extent attribute record, "
8393				"old_highest_vcn is now 0x%llx.",
8394				(unsigned long long)old_highest_vcn);
8395	}
8396	/*
8397	 * If the truncation point is at the very beginning of this attribute
8398	 * extent and the extent is not the base extent we need to remove the
8399	 * entire extent and hence do not need to waste time truncating it.
8400	 *
8401	 * If this is the base extent we have to truncate it to zero allocated
8402	 * size and if the truncation point is in the middle of the extent we
8403	 * need to truncate it to the truncation point.
8404	 */
8405	lowest_vcn = sle64_to_cpu(a->lowest_vcn);
8406	ntfs_debug("lowest_vcn 0x%llx.", (unsigned long long)lowest_vcn);
8407	if (!lowest_vcn || highest_vcn != lowest_vcn) {
8408		/*
8409		 * Get the size for the shrunk mapping pairs array for the
8410		 * runlist fragment starting at the lowest_vcn of this extent.
8411		 */
8412		err = ntfs_get_size_for_mapping_pairs(vol,
8413				ni->rl.elements ? ni->rl.rl : NULL, lowest_vcn,
8414				-1, &mp_size);
8415		if (err) {
8416			ntfs_error(vol->mp, "Cannot shrink allocation of "
8417					"mft_no 0x%llx, attribute type 0x%x, "
8418					"because determining the size for the "
8419					"mapping pairs failed (error %d).%s",
8420					(unsigned long long)ni->mft_no,
8421					(unsigned)le32_to_cpu(ni->type), err,
8422					es);
8423			NInoSetMrecNeedsDirtying(actx->ni);
8424			err = EIO;
8425			goto bad_out;
8426		}
8427		/*
8428		 * Generate the mapping pairs array directly into the attribute
8429		 * record.
8430		 */
8431		err = ntfs_mapping_pairs_build(vol, (s8*)a +
8432				le16_to_cpu(a->mapping_pairs_offset), mp_size,
8433				ni->rl.elements ? ni->rl.rl : NULL, lowest_vcn,
8434				-1, NULL);
8435		if (err) {
8436			ntfs_error(vol->mp, "Cannot shrink allocation of "
8437					"mft_no 0x%llx, attribute type 0x%x, "
8438					"because building the mapping pairs "
8439					"failed (error %d).%s",
8440					(unsigned long long)ni->mft_no,
8441					(unsigned)le32_to_cpu(ni->type), err,
8442					es);
8443			NInoSetMrecNeedsDirtying(actx->ni);
8444			err = EIO;
8445			goto bad_out;
8446		}
8447		/* Update the highest_vcn to the new truncation point. */
8448		a->highest_vcn = cpu_to_sle64(highest_vcn - 1);
8449		/*
8450		 * Shrink the attribute record for the new mapping pairs array.
8451		 * Note, this cannot fail since we are making the attribute
8452		 * smaller thus by definition there is enough space to do so.
8453		 */
8454		attr_len = le32_to_cpu(a->length);
8455		err = ntfs_attr_record_resize(m, a, mp_size +
8456				le16_to_cpu(a->mapping_pairs_offset));
8457		if (err)
8458			panic("%s(): err\n", __FUNCTION__);
8459	}
8460	/* If there is no attribute list we are done. */
8461	if (!NInoAttrList(base_ni)) {
8462		/* Ensure the modified mft record is written out. */
8463		NInoSetMrecNeedsDirtying(base_ni);
8464		goto unm_done;
8465	}
8466	/*
8467	 * If the current extent is not the base extent and it has a lowest_vcn
8468	 * equal to the new highest_vcn, we need to delete the current extent.
8469	 *
8470	 * Also need to delete all subsequent attribute extents if any exist.
8471	 * We know that some exist if the old highest_vcn of the current extent
8472	 * is lower than the old end of the attribute.
8473	 *
8474	 * When deleting the attribute extents, free the extent mft records if
8475	 * the only attribute record in the mft record is the attribute extent
8476	 * being deleted.  In this case do not need to actually modify the
8477	 * attribute record at all, just mark the mft record as not in use and
8478	 * clear its bit in the mft bitmap.  For each deleted attribute extent
8479	 * also need to delete the corresponding attribute list attribute
8480	 * entry but we postpone this until we have dealt with all the extents
8481	 * first.
8482	 *
8483	 * When finished, check the attribute list attribute and if it no
8484	 * longer references any mft records other than the base mft record
8485	 * delete the attribute list attribute altogether.
8486	 */
8487	al_end = base_ni->attr_list + base_ni->attr_list_size;
8488	del_al_start = (u8*)actx->al_entry;
8489	if (lowest_vcn && highest_vcn == lowest_vcn) {
8490		/*
8491		 * We need to delete the current extent thus manually
8492		 * reinitialize the attribute search context without unmapping
8493		 * the current extent.
8494		 */
8495		eni = actx->ni;
8496		actx->ni = base_ni;
8497		ntfs_attr_search_ctx_reinit(actx);
8498		al_entry = (ATTR_LIST_ENTRY*)del_al_start;
8499		goto delete_attr;
8500	}
8501	/* Ensure the modified mft record is written out. */
8502	NInoSetMrecNeedsDirtying(actx->ni);
8503	del_al_start += le16_to_cpu(((ATTR_LIST_ENTRY*)del_al_start)->length);
8504	al_entry = (ATTR_LIST_ENTRY*)del_al_start;
8505	/*
8506	 * Reinitialize the attribute search context thus unmapping the current
8507	 * extent if it is not in the base mft record.
8508	 */
8509	ntfs_attr_search_ctx_reinit(actx);
8510	/*
8511	 * Check if there are more extents by looking at the highest vcn of the
8512	 * current extent which is in @old_highest_vcn.  If it is below the old
8513	 * allocated size it means that @al_entry points to the attribute list
8514	 * entry describing the next attribute extent.
8515	 */
8516	while (old_highest_vcn < (old_alloc_size >> vol->cluster_size_shift)) {
8517		/* Sanity checks. */
8518		if ((u8*)al_entry + sizeof(ATTR_LIST_ENTRY) >= al_end ||
8519				(u8*)al_entry < base_ni->attr_list) {
8520			ntfs_error(vol->mp, "Attribute list attribute is "
8521					"corrupt in mft_no 0x%llx.  Run "
8522					"chkdsk.",
8523					(unsigned long long)base_ni->mft_no);
8524			err = EIO;
8525			goto bad_out;
8526		}
8527		/*
8528		 * Map the mft record containing the next extent if it is not
8529		 * the base mft record which is already mapped and described by
8530		 * the attribute search context @actx.
8531		 */
8532		if (MREF_LE(al_entry->mft_reference) == base_ni->mft_no) {
8533			/* We want the base mft record. */
8534			if (MSEQNO_LE(al_entry->mft_reference) !=
8535					base_ni->seq_no) {
8536				ntfs_error(vol->mp, "Found stale mft "
8537						"reference in attribute list "
8538						"attribute of mft_no 0x%llx.  "
8539						"Inode is corrupt.  Run "
8540						"chkdsk.", (unsigned long long)
8541						base_ni->mft_no);
8542				err = EIO;
8543				goto bad_out;
8544			}
8545			eni = base_ni;
8546			m = actx->m;
8547		} else {
8548			/* We want an extent mft record. */
8549			err = ntfs_extent_mft_record_map(base_ni,
8550					le64_to_cpu(al_entry->mft_reference),
8551					&eni, &m);
8552			if (err) {
8553				ntfs_error(vol->mp, "Failed to map extent mft "
8554						"record 0x%llx of mft_no "
8555						"0x%llx.  Inode is corrupt.  "
8556						"Run chkdsk.",
8557						(unsigned long long)MREF_LE(
8558						al_entry->mft_reference),
8559						(unsigned long long)
8560						base_ni->mft_no);
8561				err = EIO;
8562				goto bad_out;
8563			}
8564		}
8565		/* Locate the attribute extent in the mft record. */
8566		a = (ATTR_RECORD*)((u8*)m + le16_to_cpu(m->attrs_offset));
8567		do {
8568			/* Sanity checks. */
8569			if ((u8*)a < (u8*)m || (u8*)a > (u8*)m +
8570					le32_to_cpu(m->bytes_allocated))
8571				goto corrupt_err;
8572			/*
8573			 * We cannot reach the end of the attributes without
8574			 * finding the attribute extent we are looking for.
8575			 */
8576			if (a->type == AT_END || !a->length)
8577				goto corrupt_err;
8578			/*
8579			 * The attribute instance is unique thus if we find the
8580			 * correct instance we have found the attribute extent.
8581			 */
8582			if (al_entry->instance == a->instance) {
8583				/*
8584				 * If the type and/or the name are mismatched
8585				 * between the attribute list entry and the
8586				 * attribute record, there is corruption.
8587				 */
8588				if (al_entry->type != a->type)
8589					goto corrupt_err;
8590				if (!ntfs_are_names_equal((ntfschar*)((u8*)a +
8591						le16_to_cpu(a->name_offset)),
8592						a->name_length,
8593						(ntfschar*)((u8*)al_entry +
8594						al_entry->name_offset),
8595						al_entry->name_length,
8596						NVolCaseSensitive(vol),
8597						vol->upcase, vol->upcase_len))
8598					goto corrupt_err;
8599				/* We found the attribute extent. */
8600				break;
8601			}
8602			/* Proceed to the next attribute in the mft record. */
8603			a = (ATTR_RECORD*)((u8*)a + le32_to_cpu(a->length));
8604		} while (1);
8605		/* Record the highest_vcn of the new extent. */
8606		old_highest_vcn = sle64_to_cpu(a->highest_vcn) + 1;
8607delete_attr:
8608		/*
8609		 * If this is the only attribute record in the mft record, free
8610		 * the mft record.  Note if this is the case it is not possible
8611		 * for the mft record to be the base record as it would at
8612		 * least have to contain the attribute record for the attribute
8613		 * list attribute so no need to check for this case.
8614		 *
8615		 * If it is not the only attribute record in the mft record,
8616		 * delete the attribute record from the mft record.
8617		 */
8618		if ((u8*)m + le16_to_cpu(m->attrs_offset) == (u8*)a &&
8619				((ATTR_RECORD*)((u8*)a +
8620				le32_to_cpu(a->length)))->type == AT_END) {
8621			err = ntfs_extent_mft_record_free(base_ni, eni, m);
8622			if (err) {
8623				ntfs_error(vol->mp, "Failed to free extent "
8624						"mft_no 0x%llx (error %d).  "
8625						"Unmount and run chkdsk to "
8626						"recover the lost inode.",
8627						(unsigned long long)
8628						eni->mft_no, err);
8629				NVolSetErrors(vol);
8630				if (eni != base_ni) {
8631					NInoSetMrecNeedsDirtying(eni);
8632					ntfs_extent_mft_record_unmap(eni);
8633				}
8634			}
8635		} else {
8636			ntfs_attr_record_delete_internal(m, a);
8637			/* Unmap the mft record if it is not the base record. */
8638			if (eni != base_ni) {
8639				NInoSetMrecNeedsDirtying(eni);
8640				ntfs_extent_mft_record_unmap(eni);
8641			}
8642		}
8643		/* Go to the next entry in the attribute list attribute. */
8644		al_entry = (ATTR_LIST_ENTRY*)((u8*)al_entry +
8645				le16_to_cpu(al_entry->length));
8646	}
8647	/*
8648	 * There are no more extents.  If we deleted any attribute extents we
8649	 * need to remove their attribute list attribute entries now.
8650	 */
8651	if ((u8*)al_entry != del_al_start) {
8652		unsigned al_ofs;
8653		BOOL have_extent_records;
8654
8655		al_ofs = del_al_start - base_ni->attr_list;
8656		ntfs_attr_list_entries_delete(base_ni,
8657				(ATTR_LIST_ENTRY*)del_al_start, al_entry);
8658		/*
8659		 * Scan all entries in the attribute list attribute.  If there
8660		 * are no more references to extent mft records, delete the
8661		 * attribute list attribute.
8662		 *
8663		 * Otherwise truncate the attribute list attribute and update
8664		 * its value from the in memory copy.
8665		 */
8666		err = ntfs_attr_list_is_needed(base_ni, NULL,
8667				&have_extent_records);
8668		if (err)
8669			goto put_err;
8670		if (!have_extent_records) {
8671			/*
8672			 * There are no extent mft records left in use thus
8673			 * delete the attribute list attribute.
8674			 */
8675			err = ntfs_attr_list_delete(base_ni, actx);
8676			if (err)
8677				goto put_err;
8678		} else {
8679			/*
8680			 * There still are extent mft records left in use thus
8681			 * update the attribute list attribute size and write
8682			 * the modified data to disk.
8683			 */
8684			err = ntfs_attr_list_sync_shrink(base_ni, al_ofs, actx);
8685			if (err)
8686				goto put_err;
8687		}
8688	}
8689unm_done:
8690	ntfs_attr_search_ctx_put(actx);
8691	ntfs_mft_record_unmap(base_ni);
8692	lck_rw_unlock_exclusive(&ni->rl.lock);
8693	/* Set the UBC size if not set yet. */
8694	if (need_ubc_setsize && !ubc_setsize(ni->vn, new_size)) {
8695		ntfs_error(vol->mp, "Failed to set the size in UBC.");
8696		err = EIO;
8697		/*
8698		 * This should never fail and if it does it can only happen as
8699		 * the result of a previous resize having failed.  Thus we do
8700		 * not try to roll back the metadata changes and simply bail
8701		 * out.
8702		 */
8703		goto err;
8704	}
8705done:
8706	/*
8707	 * If we have modified the size of the base inode, cause the sizes to
8708	 * be written to all the directory index entries pointing to the base
8709	 * inode when the inode is written to disk.  Do not do this for
8710	 * directories as they have both sizes set to zero in their index
8711	 * entries.
8712	 */
8713	if (ni == base_ni && !S_ISDIR(ni->mode) &&
8714			(size_change || alloc_change))
8715		NInoSetDirtySizes(ni);
8716	// TODO:/FIXME: We have to clear the S_ISUID and S_ISGID bits in the
8717	// file mode. - Only to be done on success and (size_change ||
8718	// alloc_change).
8719	/*
8720	 * Update the last_data_change_time (mtime) and last_mft_change_time
8721	 * (ctime) on the base ntfs inode @base_ni unless this is an attribute
8722	 * inode update in which case only update the ctime as named stream/
8723	 * extended attribute semantics expect on OS X.
8724	 *
8725	 * FIXME: For open(O_TRUNC) it is correct to always change the
8726	 * {m,c}time.  But for {,f}truncate() we have to only set {m,c}time if
8727	 * a change happened, i.e. only if size_change is true.  Problem is we
8728	 * cannot know from which code path we are being called as both system
8729	 * calls on OS X call vnode_setattr() which calls VNOP_SETATTR() which
8730	 * calls ntfs_vnop_setattr() which then calls us...  For now at least
8731	 * we always update the times thus we follow open(O_TRUNC) semantics
8732	 * and disobey {,f}truncate() semantics.
8733	 */
8734	base_ni->last_mft_change_time = ntfs_utc_current_time();
8735	if (ni == base_ni)
8736		base_ni->last_data_change_time = base_ni->last_mft_change_time;
8737	NInoSetDirtyTimes(base_ni);
8738	/*
8739	 * If this is not a directory or it is an encrypted directory, set the
8740	 * needs archiving bit except for the core system files.
8741	 */
8742	if (!S_ISDIR(base_ni->mode) || NInoEncrypted(base_ni)) {
8743		BOOL need_set_archive_bit = TRUE;
8744		if (vol->major_ver >= 2) {
8745			if (ni->mft_no <= FILE_Extend)
8746				need_set_archive_bit = FALSE;
8747		} else {
8748			if (ni->mft_no <= FILE_UpCase)
8749				need_set_archive_bit = FALSE;
8750		}
8751		if (need_set_archive_bit) {
8752			base_ni->file_attributes |= FILE_ATTR_ARCHIVE;
8753			NInoSetDirtyFileAttributes(base_ni);
8754		}
8755	}
8756	ntfs_debug("Done.");
8757	return 0;
8758corrupt_err:
8759	ntfs_error(vol->mp, "Mft record 0x%llx of mft_no 0x%llx is corrupt.  "
8760			"Unmount and run chkdsk.",
8761			(unsigned long long)eni->mft_no,
8762			(unsigned long long)base_ni->mft_no);
8763	if (eni != base_ni)
8764		ntfs_extent_mft_record_unmap(eni);
8765	err = EIO;
8766bad_out:
8767	if (err != ENOMEM && err != ENOTSUP)
8768		NVolSetErrors(vol);
8769put_err:
8770	ntfs_attr_search_ctx_put(actx);
8771unm_err:
8772	ntfs_mft_record_unmap(base_ni);
8773unl_err:
8774	lck_rw_unlock_exclusive(&ni->rl.lock);
8775err:
8776	/* Reset the UBC size. */
8777	if (!ubc_setsize(ni->vn, old_size))
8778		ntfs_error(vol->mp, "Failed to restore UBC size.  Leaving UBC "
8779				"size out of sync with attribute data size.");
8780	ntfs_debug("Failed (error %d).", err);
8781	return err;
8782}
8783
8784/**
8785 * ntfs_attr_set - fill (a part of) an attribute with a byte
8786 * @ni:		ntfs inode describing the attribute to fill
8787 * @ofs:	offset inside the attribute at which to start to fill
8788 * @cnt:	number of bytes to fill
8789 * @val:	the unsigned 8-bit value with which to fill the attribute
8790 *
8791 * Fill @cnt bytes of the attribute described by the ntfs inode @ni starting at
8792 * byte offset @ofs inside the attribute with the constant byte @val.
8793 *
8794 * This function is effectively like memset() applied to an ntfs attribute.
8795 * Note this function actually only operates on the page cache pages belonging
8796 * to the ntfs attribute and it marks them dirty after doing the memset().
8797 * Thus it relies on the vm dirty page write code paths to cause the modified
8798 * pages to be written to the mft record/disk.
8799 *
8800 * Return 0 on success and errno on error.  An error code of ESPIPE means that
8801 * @ofs + @cnt were outside the end of the attribute and no write was
8802 * performed.
8803 *
8804 * Note: This function does not take care of the initialized size!
8805 *
8806 * Locking: - Caller must hold an iocount reference on the vnode of the ntfs
8807 *	      inode @ni.
8808 *	    - Caller must hold @ni->lock for reading or writing.
8809 */
8810errno_t ntfs_attr_set(ntfs_inode *ni, s64 ofs, const s64 cnt, const u8 val)
8811{
8812	s64 end, data_size;
8813	ntfs_volume *vol = ni->vol;
8814	upl_t upl;
8815	upl_page_info_array_t pl;
8816	u8 *kaddr;
8817	unsigned start_ofs, end_ofs, size;
8818	errno_t err;
8819
8820	ntfs_debug("Entering for ofs 0x%llx, cnt 0x%llx, val 0x%x.",
8821			(unsigned long long)ofs, (unsigned long long)cnt,
8822			(unsigned)val);
8823	if (ofs < 0)
8824		panic("%s(): ofs < 0\n", __FUNCTION__);
8825	if (cnt < 0)
8826		panic("%s(): cnt < 0\n", __FUNCTION__);
8827	if (!cnt)
8828		goto done;
8829	/*
8830	 * FIXME: Compressed and encrypted attributes are not supported when
8831	 * writing and we should never have gotten here for them.
8832	 */
8833	if (NInoCompressed(ni))
8834		panic("%s(): Inode is compressed.\n", __FUNCTION__);
8835	if (NInoEncrypted(ni))
8836		panic("%s(): Inode is encrypted.\n", __FUNCTION__);
8837	/* Work out the starting index and page offset. */
8838	start_ofs = (unsigned)ofs & PAGE_MASK;
8839	/* Work out the ending index and page offset. */
8840	end = ofs + cnt;
8841	end_ofs = (unsigned)end & PAGE_MASK;
8842	/* If the end is outside the inode size return ESPIPE. */
8843	lck_spin_lock(&ni->size_lock);
8844	data_size = ni->data_size;
8845	lck_spin_unlock(&ni->size_lock);
8846	if (end > data_size) {
8847		ntfs_error(vol->mp, "Request exceeds end of attribute.");
8848		return ESPIPE;
8849	}
8850	ofs &= ~PAGE_MASK_64;
8851	end &= ~PAGE_MASK_64;
8852	/* If there is a first partial page, need to do it the slow way. */
8853	if (start_ofs) {
8854		err = ntfs_page_map(ni, ofs, &upl, &pl, &kaddr, TRUE);
8855		if (err) {
8856			ntfs_error(vol->mp, "Failed to read first partial "
8857					"page (ofs 0x%llx).",
8858					(unsigned long long)ofs);
8859			return err;
8860		}
8861		/*
8862		 * If the last page is the same as the first page, need to
8863		 * limit the write to the end offset.
8864		 */
8865		size = PAGE_SIZE;
8866		if (ofs == end)
8867			size = end_ofs;
8868		memset(kaddr + start_ofs, val, size - start_ofs);
8869		ntfs_page_unmap(ni, upl, pl, TRUE);
8870		ofs += PAGE_SIZE;
8871		if (ofs >= (end + end_ofs))
8872			goto done;
8873	}
8874	/*
8875	 * Do the whole pages the fast way.
8876	 *
8877	 * TODO: It may be possible to optimize this loop by creating a
8878	 * sequence of large page lists by hand, mapping them, then running the
8879	 * memset, then unmapping them and committing them.  This incurs a
8880	 * higher cpu time because of the larger mapping required but incurs
8881	 * many fewer calls into the ubc thus less locks will need to be taken
8882	 * which may well speed things up a lot.  It will need to be
8883	 * benchmarked to determine which is actually faster so leaving it the
8884	 * easier way for now.
8885	 */
8886	for (; ofs < end; ofs += PAGE_SIZE) {
8887		/* Find or create the current page. */
8888		err = ntfs_page_grab(ni, ofs, &upl, &pl, &kaddr, TRUE);
8889		if (err) {
8890			ntfs_error(vol->mp, "Failed to grab page (ofs "
8891					"0x%llx).", (unsigned long long)ofs);
8892			return err;
8893		}
8894		memset(kaddr, val, PAGE_SIZE);
8895		ntfs_page_unmap(ni, upl, pl, TRUE);
8896	}
8897	/* If there is a last partial page, need to do it the slow way. */
8898	if (end_ofs) {
8899		err = ntfs_page_map(ni, ofs, &upl, &pl, &kaddr, TRUE);
8900		if (err) {
8901			ntfs_error(vol->mp, "Failed to read last partial page "
8902					"(ofs 0x%llx).",
8903					(unsigned long long)ofs);
8904			return err;
8905		}
8906		memset(kaddr, val, end_ofs);
8907		ntfs_page_unmap(ni, upl, pl, TRUE);
8908	}
8909done:
8910	ntfs_debug("Done.");
8911	return 0;
8912}
8913
8914/**
8915 * ntfs_resident_attr_read - read from an attribute which is resident
8916 * @ni:		resident ntfs inode describing the attribute from which to read
8917 * @ofs:	byte offset in attribute at which to start reading
8918 * @cnt:	number of bytes to copy into the destination buffer @buf
8919 * @buf:	destination buffer into which to copy attribute data
8920 *
8921 * Map the base mft record of the ntfs inode @ni, find the attribute it
8922 * describes, and copy @cnt bytes from byte offset @ofs into the destination
8923 * buffer @buf.  If @buf is bigger than the attribute size, zero the remainder.
8924 *
8925 * We do not need to worry about compressed attributes because when they are
8926 * resident the data is not actually compressed and we do not need to worry
8927 * about encrypted attributes because encrypted attributes cannot be resident.
8928 *
8929 * Return 0 on success and errno on error.  Note that a return value of EAGAIN
8930 * means that someone converted the attribute to non-resident before we took
8931 * the necessary locks to read from the resident attribute thus we could not
8932 * perform the read.  The caller needs to cope with this and perform a
8933 * non-resident read instead.
8934 */
8935errno_t ntfs_resident_attr_read(ntfs_inode *ni, const s64 ofs, const u32 cnt,
8936		u8 *buf)
8937{
8938	s64 max_size;
8939	ntfs_inode *base_ni;
8940	MFT_RECORD *m;
8941	ntfs_attr_search_ctx *ctx;
8942	ATTR_RECORD *a;
8943	unsigned attr_len, init_len, bytes;
8944	errno_t err;
8945
8946	base_ni = ni;
8947	if (NInoAttr(ni))
8948		base_ni = ni->base_ni;
8949	/* Map, pin, and lock the mft record. */
8950	err = ntfs_mft_record_map(base_ni, &m);
8951	if (err)
8952		goto err;
8953	/*
8954	 * If a parallel write made the attribute non-resident, drop the mft
8955	 * record and return EAGAIN.
8956	 */
8957	if (NInoNonResident(ni)) {
8958		err = EAGAIN;
8959		goto unm_err;
8960	}
8961	ctx = ntfs_attr_search_ctx_get(base_ni, m);
8962	if (!ctx) {
8963		err = ENOMEM;
8964		goto unm_err;
8965	}
8966	err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, 0, NULL, 0,
8967			ctx);
8968	if (err) {
8969		if (err == ENOENT)
8970			err = EIO;
8971		goto put_err;
8972	}
8973	a = ctx->a;
8974	lck_spin_lock(&ni->size_lock);
8975	/* These can happen when we race with a shrinking truncate. */
8976	attr_len = le32_to_cpu(a->value_length);
8977	if (attr_len > ni->data_size)
8978		attr_len = ni->data_size;
8979	max_size = ubc_getsize(ni->vn);
8980	if (attr_len > max_size)
8981		attr_len = max_size;
8982	init_len = attr_len;
8983	if (init_len > ni->initialized_size)
8984		init_len = ni->initialized_size;
8985	lck_spin_unlock(&ni->size_lock);
8986	/*
8987	 * If we are reading from the initialized attribute part, copy the data
8988	 * over into the destination buffer.
8989	 */
8990	bytes = cnt;
8991	if (init_len > ofs) {
8992		u32 available = init_len - ofs;
8993		if (bytes > available)
8994			bytes = available;
8995		memcpy(buf, (u8*)a + le16_to_cpu(a->value_offset) + ofs, bytes);
8996	}
8997	/* Zero the remainder of the destination buffer if any. */
8998	if (bytes < cnt)
8999		bzero(buf + bytes, cnt - bytes);
9000put_err:
9001	ntfs_attr_search_ctx_put(ctx);
9002unm_err:
9003	ntfs_mft_record_unmap(base_ni);
9004err:
9005	return err;
9006}
9007
9008/**
9009 * ntfs_resident_attr_write - write to an attribute which is resident
9010 * @ni:		resident ntfs inode describing the attribute to which to write
9011 * @buf:	source buffer from which to copy attribute data
9012 * @cnt:	number of bytes to copy into the attribute from the buffer
9013 * @ofs:	byte offset in attribute at which to start writing
9014 *
9015 * Map the base mft record of the ntfs inode @ni, find the attribute it
9016 * describes, and copy @cnt bytes from the buffer @buf into the attribute value
9017 * at byte offset @ofs.
9018 *
9019 * We do not need to worry about compressed attributes because when they are
9020 * resident the data is not actually compressed and we do not need to worry
9021 * about encrypted attributes because encrypted attributes cannot be resident.
9022 *
9023 * Return 0 on success and errno on error.  Note that a return value of EAGAIN
9024 * means that someone converted the attribute to non-resident before we took
9025 * the necessary locks to write to the resident attribute thus we could not
9026 * perform the write.  The caller needs to cope with this and perform a
9027 * non-resident write instead.
9028 */
9029errno_t ntfs_resident_attr_write(ntfs_inode *ni, u8 *buf, u32 cnt,
9030		const s64 ofs)
9031{
9032	ntfs_inode *base_ni;
9033	MFT_RECORD *m;
9034	ntfs_attr_search_ctx *ctx;
9035	ATTR_RECORD *a;
9036	errno_t err;
9037	u32 attr_len;
9038
9039	base_ni = ni;
9040	if (NInoAttr(ni))
9041		base_ni = ni->base_ni;
9042	/* Map, pin, and lock the mft record. */
9043	err = ntfs_mft_record_map(base_ni, &m);
9044	if (err)
9045		goto err;
9046	/*
9047	 * If a parallel write made the attribute non-resident, drop the mft
9048	 * record and return EAGAIN.
9049	 */
9050	if (NInoNonResident(ni)) {
9051		err = EAGAIN;
9052		goto unm_err;
9053	}
9054	ctx = ntfs_attr_search_ctx_get(base_ni, m);
9055	if (!ctx) {
9056		err = ENOMEM;
9057		goto unm_err;
9058	}
9059	err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, 0, NULL, 0,
9060			ctx);
9061	if (err) {
9062		if (err == ENOENT)
9063			err = EIO;
9064		goto put_err;
9065	}
9066	a = ctx->a;
9067	if (a->non_resident)
9068		panic("%s(): a->non_resident\n", __FUNCTION__);
9069	lck_spin_lock(&ni->size_lock);
9070	/* These can happen when we race with a shrinking truncate. */
9071	attr_len = le32_to_cpu(a->value_length);
9072	if (ofs > attr_len) {
9073		ntfs_error(ni->vol->mp, "Cannot write past end of resident "
9074				"attribute.");
9075		lck_spin_unlock(&ni->size_lock);
9076		err = EINVAL;
9077		goto put_err;
9078	}
9079	if (ofs + cnt > attr_len) {
9080		ntfs_error(ni->vol->mp, "Truncating resident write.");
9081		cnt = attr_len - ofs;
9082	}
9083	if (ofs + cnt > ni->initialized_size)
9084		ni->initialized_size = ofs + cnt;
9085	lck_spin_unlock(&ni->size_lock);
9086	/* Copy the data over from the destination buffer. */
9087	memcpy((u8*)a + le16_to_cpu(a->value_offset) + ofs, buf, cnt);
9088	/* Mark the mft record dirty to ensure it gets written out. */
9089	NInoSetMrecNeedsDirtying(ctx->ni);
9090put_err:
9091	ntfs_attr_search_ctx_put(ctx);
9092unm_err:
9093	ntfs_mft_record_unmap(base_ni);
9094err:
9095	return err;
9096}
9097