1/*
2 * ntfs_lcnalloc.c - NTFS kernel cluster (de)allocation code.
3 *
4 * Copyright (c) 2006-2011 Anton Altaparmakov.  All Rights Reserved.
5 * Portions Copyright (c) 2006-2011 Apple Inc.  All Rights Reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright notice,
11 *    this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright notice,
13 *    this list of conditions and the following disclaimer in the documentation
14 *    and/or other materials provided with the distribution.
15 * 3. Neither the name of Apple Inc. ("Apple") nor the names of its
16 *    contributors may be used to endorse or promote products derived from this
17 *    software without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
20 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 * DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
23 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
24 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
26 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
28 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 *
30 * ALTERNATIVELY, provided that this notice and licensing terms are retained in
31 * full, this file may be redistributed and/or modified under the terms of the
32 * GNU General Public License (GPL) Version 2, in which case the provisions of
33 * that version of the GPL will apply to you instead of the license terms
34 * above.  You can obtain a copy of the GPL Version 2 at
35 * http://developer.apple.com/opensource/licenses/gpl-2.txt.
36 */
37
38#include <sys/buf.h>
39#include <sys/errno.h>
40#include <sys/stat.h>
41#include <sys/ucred.h>
42#include <sys/vnode.h>
43
44#include <string.h>
45
46#include <libkern/OSMalloc.h>
47
48#include <kern/debug.h>
49#include <kern/locks.h>
50
51#include "ntfs.h"
52#include "ntfs_attr.h"
53#include "ntfs_bitmap.h"
54#include "ntfs_debug.h"
55#include "ntfs_inode.h"
56#include "ntfs_lcnalloc.h"
57#include "ntfs_page.h"
58#include "ntfs_runlist.h"
59#include "ntfs_types.h"
60#include "ntfs_volume.h"
61
/*
 * Forward declaration so that ntfs_cluster_alloc() can roll back a partial
 * allocation on error; see the full documentation above the definition below.
 */
static errno_t ntfs_cluster_free_from_rl_nolock(ntfs_volume *vol,
		ntfs_rl_element *rl, const VCN start_vcn, s64 count,
		s64 *nr_freed);
65
66/**
67 * ntfs_cluster_alloc - allocate clusters on an ntfs volume
68 * @vol:		mounted ntfs volume on which to allocate the clusters
69 * @start_vcn:		vcn to use for the first allocated cluster
70 * @count:		number of clusters to allocate
71 * @start_lcn:		starting lcn at which to allocate the clusters (or -1)
72 * @zone:		zone from which to allocate the clusters
73 * @is_extension:	if true, this is an attribute extension
74 * @runlist:		destination runlist to return the allocated clusters in
75 *
76 * Allocate @count clusters preferably starting at cluster @start_lcn or at the
77 * current allocator position if @start_lcn is -1, on the mounted ntfs volume
78 * @vol.  @zone is either DATA_ZONE for allocation of normal clusters or
79 * MFT_ZONE for allocation of clusters for the master file table, i.e. the
80 * $MFT/$DATA attribute.
81 *
82 * @start_vcn specifies the vcn of the first allocated cluster.  This makes
83 * merging the resulting runlist with the old runlist easier.
84 *
85 * If @is_extension is true, the caller is allocating clusters to extend an
86 * attribute and if it is false, the caller is allocating clusters to fill a
87 * hole in an attribute.  Practically the difference is that if @is_extension
88 * is true the returned runlist will be terminated with LCN_ENOENT and if
89 * @is_extension is false the runlist will be terminated with
90 * LCN_RL_NOT_MAPPED.
91 *
92 * On success return 0 and set up @runlist to describe the allocated clusters.
93 *
94 * On error return the error code.
95 *
96 * Notes on the allocation algorithm
97 * =================================
98 *
99 * There are two data zones.  First is the area between the end of the mft zone
100 * and the end of the volume, and second is the area between the start of the
101 * volume and the start of the mft zone.  On unmodified/standard NTFS 1.x
102 * volumes, the second data zone does not exist due to the mft zone being
103 * expanded to cover the start of the volume in order to reserve space for the
104 * mft bitmap attribute.
105 *
106 * This is not the prettiest function but the complexity stems from the need of
107 * implementing the mft vs data zoned approach and from the fact that we have
108 * access to the lcn bitmap in portions of up to 8192 bytes at a time, so we
109 * need to cope with crossing over boundaries of two buffers.  Further, the
110 * fact that the allocator allows for caller supplied hints as to the location
111 * of where allocation should begin and the fact that the allocator keeps track
112 * of where in the data zones the next natural allocation should occur,
113 * contribute to the complexity of the function.  But it should all be
114 * worthwhile, because this allocator should: 1) be a full implementation of
115 * the MFT zone approach used by Windows NT, 2) cause reduction in
116 * fragmentation, and 3) be speedy in allocations (the code is not optimized
117 * for speed, but the algorithm is, so further speed improvements are probably
118 * possible).
119 *
120 * FIXME: We should be monitoring cluster allocation and increment the MFT zone
121 * size dynamically but this is something for the future.  We will just cause
122 * heavier fragmentation by not doing it and I am not even sure Windows would
123 * grow the MFT zone dynamically, so it might even be correct not to do this.
124 * The overhead in doing dynamic MFT zone expansion would be very large and
125 * unlikely worth the effort. (AIA)
126 *
127 * TODO: I have added in double the required zone position pointer wrap around
128 * logic which can be optimized to having only one of the two logic sets.
129 * However, having the double logic will work fine, but if we have only one of
130 * the sets and we get it wrong somewhere, then we get into trouble, so
131 * removing the duplicate logic requires _very_ careful consideration of _all_
132 * possible code paths.  So at least for now, I am leaving the double logic -
133 * better safe than sorry... (AIA)
134 *
135 * Locking: - The volume lcn bitmap must be unlocked on entry and is unlocked
136 *	      on return.
137 *	    - This function takes the volume lcn bitmap lock for writing and
138 *	      modifies the bitmap contents.
139 *	    - The lock of the runlist @runlist is not touched thus the caller
140 *	      is responsible for locking it for writing if needed.
141 */
errno_t ntfs_cluster_alloc(ntfs_volume *vol, const VCN start_vcn,
		const s64 count, const LCN start_lcn,
		const NTFS_CLUSTER_ALLOCATION_ZONES zone,
		const BOOL is_extension, ntfs_runlist *runlist)
{
	LCN zone_start, zone_end, bmp_pos, bmp_initial_pos, last_read_pos, lcn;
	LCN prev_lcn = 0, prev_run_len = 0, mft_zone_size;
	s64 clusters, data_size;
	ntfs_inode *lcnbmp_ni;
	ntfs_rl_element *rl = NULL;
	upl_t upl = NULL;
	upl_page_info_array_t pl;
	u8 *b, *byte;
	int rlpos, rlsize, bsize;
	errno_t err;
	u8 pass, done_zones, search_zone, bit;
	BOOL need_writeback = FALSE;

	ntfs_debug("Entering for start_vcn 0x%llx, count 0x%llx, start_lcn "
			"0x%llx, zone %s_ZONE.", (unsigned long long)start_vcn,
			(unsigned long long)count,
			(unsigned long long)start_lcn,
			zone == MFT_ZONE ? "MFT" : "DATA");
	/* Argument sanity checks; violations are bugs in the caller. */
	if (!vol)
		panic("%s(): !vol\n", __FUNCTION__);
	lcnbmp_ni = vol->lcnbmp_ni;
	if (!lcnbmp_ni)
		panic("%s(): !lcnbmp_ni\n", __FUNCTION__);
	if (start_vcn < 0)
		panic("%s(): start_vcn < 0\n", __FUNCTION__);
	if (count < 0)
		panic("%s(): count < 0\n", __FUNCTION__);
	if (start_lcn < -1)
		panic("%s(): start_lcn < -1\n", __FUNCTION__);
	if (zone < FIRST_ZONE)
		panic("%s(): zone < FIRST_ZONE\n", __FUNCTION__);
	if (zone > LAST_ZONE)
		panic("%s(): zone > LAST_ZONE\n", __FUNCTION__);
	/* Return success and an empty runlist if @count is zero. */
	if (!count) {
		if (runlist->alloc)
			OSFree(runlist->rl, runlist->alloc, ntfs_malloc_tag);
		runlist->rl = NULL;
		runlist->elements = 0;
		runlist->alloc = 0;
		return 0;
	}
	/* Take the lcnbmp lock for writing. */
	lck_rw_lock_exclusive(&vol->lcnbmp_lock);
	err = vnode_get(lcnbmp_ni->vn);
	if (err) {
		ntfs_error(vol->mp, "Failed to get vnode for $Bitmap.");
		lck_rw_unlock_exclusive(&vol->lcnbmp_lock);
		return err;
	}
	lck_rw_lock_shared(&lcnbmp_ni->lock);
	/*
	 * If no specific @start_lcn was requested, use the current data zone
	 * position, otherwise use the requested @start_lcn but make sure it
	 * lies outside the mft zone.  Also set done_zones to 0 (no zones done)
	 * and pass depending on whether we are starting inside a zone (1) or
	 * at the beginning of a zone (2).  If requesting from the MFT_ZONE,
	 * we either start at the current position within the mft zone or at
	 * the specified position.  If the latter is out of bounds then we start
	 * at the beginning of the MFT_ZONE.
	 */
	done_zones = 0;
	pass = 1;
	/*
	 * zone_start and zone_end are the current search range.  search_zone
	 * is 1 for mft zone, 2 for data zone 1 (end of mft zone till end of
	 * volume) and 4 for data zone 2 (start of volume till start of mft
	 * zone).
	 */
	zone_start = start_lcn;
	if (zone_start < 0) {
		if (zone == DATA_ZONE)
			zone_start = vol->data1_zone_pos;
		else
			zone_start = vol->mft_zone_pos;
		if (!zone_start) {
			/*
			 * Zone starts at beginning of volume which means a
			 * single pass is sufficient.
			 */
			pass = 2;
		}
	} else if (zone == DATA_ZONE && zone_start >= vol->mft_zone_start &&
			zone_start < vol->mft_zone_end) {
		zone_start = vol->mft_zone_end;
		/*
		 * Starting at beginning of data1_zone which means a single
		 * pass in this zone is sufficient.
		 */
		pass = 2;
	} else if (zone == MFT_ZONE && (zone_start < vol->mft_zone_start ||
			zone_start >= vol->mft_zone_end)) {
		zone_start = vol->mft_lcn;
		if (!vol->mft_zone_end)
			zone_start = 0;
		/*
		 * Starting at beginning of volume which means a single pass
		 * is sufficient.
		 */
		pass = 2;
	}
	if (zone == MFT_ZONE) {
		zone_end = vol->mft_zone_end;
		search_zone = 1;
	} else /* if (zone == DATA_ZONE) */ {
		/* Skip searching the mft zone. */
		done_zones |= 1;
		if (zone_start >= vol->mft_zone_end) {
			zone_end = vol->nr_clusters;
			search_zone = 2;
		} else {
			zone_end = vol->mft_zone_start;
			search_zone = 4;
		}
	}
	/*
	 * bmp_pos is the current bit position inside the bitmap.  We use
	 * bmp_initial_pos to determine whether or not to do a zone switch.
	 */
	bmp_pos = bmp_initial_pos = zone_start;
	/* Loop until all clusters are allocated, i.e. clusters == 0. */
	clusters = count;
	rlpos = rlsize = 0;
	/* Consistency check between the ubc and the recorded bitmap size. */
	lck_spin_lock(&lcnbmp_ni->size_lock);
	data_size = ubc_getsize(lcnbmp_ni->vn);
	if (data_size != lcnbmp_ni->data_size)
		panic("%s(): data_size != lcnbmp_ni->data_size\n",
				__FUNCTION__);
	lck_spin_unlock(&lcnbmp_ni->size_lock);
	while (1) {
		ntfs_debug("Start of outer while loop: done_zones 0x%x, "
				"search_zone %d, pass %d, zone_start 0x%llx, "
				"zone_end 0x%llx, bmp_initial_pos 0x%llx, "
				"bmp_pos 0x%llx, rlpos %d, rlsize %d.",
				done_zones, search_zone, pass,
				(unsigned long long)zone_start,
				(unsigned long long)zone_end,
				(unsigned long long)bmp_initial_pos,
				(unsigned long long)bmp_pos, rlpos, rlsize);
		/* Loop until we run out of free clusters. */
		/* Byte offset in the bitmap of the current bit position. */
		last_read_pos = bmp_pos >> 3;
		ntfs_debug("last_read_pos 0x%llx.",
				(unsigned long long)last_read_pos);
		if (last_read_pos > data_size) {
			ntfs_debug("End of attribute reached.  "
					"Skipping to zone_pass_done.");
			goto zone_pass_done;
		}
		/* Release the previously mapped bitmap page, if any. */
		if (upl) {
			ntfs_page_unmap(lcnbmp_ni, upl, pl, need_writeback);
			if (need_writeback) {
				ntfs_debug("Marking page dirty.");
				need_writeback = FALSE;
			}
		}
		/* Map the bitmap page containing the current byte offset. */
		err = ntfs_page_map(lcnbmp_ni, last_read_pos & ~PAGE_MASK_64,
				&upl, &pl, &b, TRUE);
		if (err) {
			ntfs_error(vol->mp, "Failed to map page.");
			upl = NULL;
			goto out;
		}
		/*
		 * Point @b at the current byte within the mapped page and set
		 * @bsize to the number of bitmap bits usable from this page
		 * (clamped to the end of the bitmap, then converted from
		 * bytes to bits).
		 */
		bsize = last_read_pos & PAGE_MASK;
		b += bsize;
		bsize = PAGE_SIZE - bsize;
		if (last_read_pos + bsize > data_size)
			bsize = data_size - last_read_pos;
		bsize <<= 3;
		/* Split bmp_pos into a byte-aligned base and bit offset @lcn. */
		lcn = bmp_pos & 7;
		bmp_pos &= ~(LCN)7;
		ntfs_debug("Before inner while loop: bsize %d, lcn 0x%llx, "
				"bmp_pos 0x%llx, need_writeback is %s.", bsize,
				(unsigned long long)lcn,
				(unsigned long long)bmp_pos,
				need_writeback ? "true" : "false");
		while (lcn < bsize && lcn + bmp_pos < zone_end) {
			byte = b + (lcn >> 3);
			ntfs_debug("In inner while loop: bsize %d, lcn "
					"0x%llx, bmp_pos 0x%llx, "
					"need_writeback is %s, byte ofs 0x%x, "
					"*byte 0x%x.", bsize,
					(unsigned long long)lcn,
					(unsigned long long)bmp_pos,
					need_writeback ? "true" : "false",
					(unsigned)(lcn >> 3), (unsigned)*byte);
			/* Skip full bytes. */
			if (*byte == 0xff) {
				lcn = (lcn + 8) & ~(LCN)7;
				ntfs_debug("Continuing while loop 1.");
				continue;
			}
			bit = 1 << (lcn & 7);
			ntfs_debug("bit 0x%x.", bit);
			/* If the bit is already set, go onto the next one. */
			if (*byte & bit) {
				lcn++;
				ntfs_debug("Continuing while loop 2.");
				continue;
			}
			/*
			 * Allocate more memory if needed, including space for
			 * the terminator element.
			 * ntfs_malloc_nofs() operates on whole pages only.
			 */
			if ((rlpos + 2) * (int)sizeof(*rl) > rlsize) {
				ntfs_rl_element *rl2;

				ntfs_debug("Reallocating memory.");
				rl2 = OSMalloc(rlsize + NTFS_ALLOC_BLOCK,
						ntfs_malloc_tag);
				if (!rl2) {
					err = ENOMEM;
					ntfs_error(vol->mp, "Failed to "
							"allocate memory.");
					goto out;
				}
				if (!rl)
					ntfs_debug("First free bit is at LCN "
							"0x%llx.",
							(unsigned long long)
							(lcn + bmp_pos));
				else {
					/*
					 * Copy the old runs across and free
					 * the old, smaller buffer.
					 */
					memcpy(rl2, rl, rlsize);
					OSFree(rl, rlsize, ntfs_malloc_tag);
				}
				rl = rl2;
				rlsize += NTFS_ALLOC_BLOCK;
				ntfs_debug("Reallocated memory, rlsize 0x%x.",
						rlsize);
			}
			/* Allocate the bitmap bit. */
			*byte |= bit;
			/* Keep the cached free cluster count in sync. */
			vol->nr_free_clusters--;
			if (vol->nr_free_clusters < 0)
				vol->nr_free_clusters = 0;
			/* We need to write this bitmap page to disk. */
			need_writeback = TRUE;
			ntfs_debug("*byte 0x%x, need_writeback is set.",
					(unsigned)*byte);
			/*
			 * Coalesce with previous run if adjacent LCNs.
			 * Otherwise, append a new run.
			 */
			ntfs_debug("Adding run (lcn 0x%llx, len 0x%llx), "
					"prev_lcn 0x%llx, lcn 0x%llx, "
					"bmp_pos 0x%llx, prev_run_len 0x%llx, "
					"rlpos %d.",
					(unsigned long long)(lcn + bmp_pos),
					1ULL, (unsigned long long)prev_lcn,
					(unsigned long long)lcn,
					(unsigned long long)bmp_pos,
					(unsigned long long)prev_run_len,
					rlpos);
			if (prev_lcn == lcn + bmp_pos - prev_run_len && rlpos) {
				ntfs_debug("Coalescing to run (lcn 0x%llx, "
						"len 0x%llx).",
						(unsigned long long)
						rl[rlpos - 1].lcn,
						(unsigned long long)
						rl[rlpos - 1].length);
				rl[rlpos - 1].length = ++prev_run_len;
				ntfs_debug("Run now (lcn 0x%llx, len 0x%llx), "
						"prev_run_len 0x%llx.",
						(unsigned long long)
						rl[rlpos - 1].lcn,
						(unsigned long long)
						rl[rlpos - 1].length,
						(unsigned long long)
						prev_run_len);
			} else {
				if (rlpos) {
					ntfs_debug("Adding new run, (previous "
							"run lcn 0x%llx, "
							"len 0x%llx).",
							(unsigned long long)
							rl[rlpos - 1].lcn,
							(unsigned long long)
							rl[rlpos - 1].length);
					rl[rlpos].vcn = rl[rlpos - 1].vcn +
							prev_run_len;
				} else {
					ntfs_debug("Adding new run, is first "
							"run.");
					rl[rlpos].vcn = start_vcn;
				}
				rl[rlpos].lcn = prev_lcn = lcn + bmp_pos;
				rl[rlpos].length = prev_run_len = 1;
				rlpos++;
			}
			/* Done? */
			if (!--clusters) {
				LCN tc;
				/*
				 * Update the current zone position.  Positions
				 * of already scanned zones have been updated
				 * during the respective zone switches.
				 */
				tc = lcn + bmp_pos + 1;
				ntfs_debug("Done. Updating current zone "
						"position, tc 0x%llx, "
						"search_zone %d.",
						(unsigned long long)tc,
						search_zone);
				switch (search_zone) {
				case 1:
					ntfs_debug("Before checks, "
							"vol->mft_zone_pos "
							"0x%llx.",
							(unsigned long long)
							vol->mft_zone_pos);
					if (tc >= vol->mft_zone_end) {
						vol->mft_zone_pos =
								vol->mft_lcn;
						if (!vol->mft_zone_end)
							vol->mft_zone_pos = 0;
					} else if ((bmp_initial_pos >=
							vol->mft_zone_pos ||
							tc > vol->mft_zone_pos)
							&& tc >= vol->mft_lcn)
						vol->mft_zone_pos = tc;
					ntfs_debug("After checks, "
							"vol->mft_zone_pos "
							"0x%llx.",
							(unsigned long long)
							vol->mft_zone_pos);
					break;
				case 2:
					ntfs_debug("Before checks, "
							"vol->data1_zone_pos "
							"0x%llx.",
							(unsigned long long)
							vol->data1_zone_pos);
					if (tc >= vol->nr_clusters)
						vol->data1_zone_pos =
							     vol->mft_zone_end;
					else if ((bmp_initial_pos >=
						    vol->data1_zone_pos ||
						    tc > vol->data1_zone_pos)
						    && tc >= vol->mft_zone_end)
						vol->data1_zone_pos = tc;
					ntfs_debug("After checks, "
							"vol->data1_zone_pos "
							"0x%llx.",
							(unsigned long long)
							vol->data1_zone_pos);
					break;
				case 4:
					ntfs_debug("Before checks, "
							"vol->data2_zone_pos "
							"0x%llx.",
							(unsigned long long)
							vol->data2_zone_pos);
					if (tc >= vol->mft_zone_start)
						vol->data2_zone_pos = 0;
					else if (bmp_initial_pos >=
						      vol->data2_zone_pos ||
						      tc > vol->data2_zone_pos)
						vol->data2_zone_pos = tc;
					ntfs_debug("After checks, "
							"vol->data2_zone_pos "
							"0x%llx.",
							(unsigned long long)
							vol->data2_zone_pos);
					break;
				default:
					panic("%s(): Reached default case in "
							"switch!\n",
							__FUNCTION__);
				}
				ntfs_debug("Finished.  Going to out.");
				goto out;
			}
			lcn++;
		}
		/* Advance past the bits examined in this mapped page. */
		bmp_pos += bsize;
		ntfs_debug("After inner while loop: bsize 0x%x, lcn 0x%llx, "
				"bmp_pos 0x%llx, need_writeback is %s.", bsize,
				(unsigned long long)lcn,
				(unsigned long long)bmp_pos,
				need_writeback ? "true" : "false");
		if (bmp_pos < zone_end) {
			ntfs_debug("Continuing outer while loop, "
					"bmp_pos 0x%llx, zone_end 0x%llx.",
					(unsigned long long)bmp_pos,
					(unsigned long long)zone_end);
			continue;
		}
zone_pass_done:	/* Finished with the current zone pass. */
		ntfs_debug("At zone_pass_done, pass %d.", pass);
		if (pass == 1) {
			/*
			 * Now do pass 2, scanning the first part of the zone
			 * we omitted in pass 1.
			 */
			pass = 2;
			zone_end = zone_start;
			switch (search_zone) {
			case 1: /* mft_zone */
				zone_start = vol->mft_zone_start;
				break;
			case 2: /* data1_zone */
				zone_start = vol->mft_zone_end;
				break;
			case 4: /* data2_zone */
				zone_start = 0;
				break;
			default:
				panic("%s(): Reached default case in switch "
						"(2)!\n", __FUNCTION__);
			}
			/* Sanity check. */
			if (zone_end < zone_start)
				zone_end = zone_start;
			bmp_pos = zone_start;
			ntfs_debug("Continuing outer while loop, pass 2, "
					"zone_start 0x%llx, zone_end 0x%llx, "
					"bmp_pos 0x%llx.",
					(unsigned long long)zone_start,
					(unsigned long long)zone_end,
					(unsigned long long)bmp_pos);
			continue;
		} /* pass == 2 */
done_zones_check:
		ntfs_debug("At done_zones_check, search_zone %d, done_zones "
				"before 0x%x, done_zones after 0x%x.",
				search_zone, done_zones,
				done_zones | search_zone);
		done_zones |= search_zone;
		if (done_zones < 7) {
			ntfs_debug("Switching zone.");
			/* Now switch to the next zone we haven't done yet. */
			pass = 1;
			switch (search_zone) {
			case 1:
				ntfs_debug("Switching from mft zone to data1 "
						"zone.");
				/* Update mft zone position. */
				if (rlpos) {
					LCN tc;

					ntfs_debug("Before checks, "
							"vol->mft_zone_pos "
							"0x%llx.",
							(unsigned long long)
							vol->mft_zone_pos);
					tc = rl[rlpos - 1].lcn +
							rl[rlpos - 1].length;
					if (tc >= vol->mft_zone_end) {
						vol->mft_zone_pos =
								vol->mft_lcn;
						if (!vol->mft_zone_end)
							vol->mft_zone_pos = 0;
					} else if ((bmp_initial_pos >=
							vol->mft_zone_pos ||
							tc > vol->mft_zone_pos)
							&& tc >= vol->mft_lcn)
						vol->mft_zone_pos = tc;
					ntfs_debug("After checks, "
							"vol->mft_zone_pos "
							"0x%llx.",
							(unsigned long long)
							vol->mft_zone_pos);
				}
				/* Switch from mft zone to data1 zone. */
switch_to_data1_zone:		search_zone = 2;
				zone_start = bmp_initial_pos =
						vol->data1_zone_pos;
				zone_end = vol->nr_clusters;
				if (zone_start == vol->mft_zone_end)
					pass = 2;
				if (zone_start >= zone_end) {
					vol->data1_zone_pos = zone_start =
							vol->mft_zone_end;
					pass = 2;
				}
				break;
			case 2:
				ntfs_debug("Switching from data1 zone to "
						"data2 zone.");
				/* Update data1 zone position. */
				if (rlpos) {
					LCN tc;

					ntfs_debug("Before checks, "
							"vol->data1_zone_pos "
							"0x%llx.",
							(unsigned long long)
							vol->data1_zone_pos);
					tc = rl[rlpos - 1].lcn +
							rl[rlpos - 1].length;
					if (tc >= vol->nr_clusters)
						vol->data1_zone_pos =
							     vol->mft_zone_end;
					else if ((bmp_initial_pos >=
						    vol->data1_zone_pos ||
						    tc > vol->data1_zone_pos)
						    && tc >= vol->mft_zone_end)
						vol->data1_zone_pos = tc;
					ntfs_debug("After checks, "
							"vol->data1_zone_pos "
							"0x%llx.",
							(unsigned long long)
							vol->data1_zone_pos);
				}
				/* Switch from data1 zone to data2 zone. */
				search_zone = 4;
				zone_start = bmp_initial_pos =
						vol->data2_zone_pos;
				zone_end = vol->mft_zone_start;
				if (!zone_start)
					pass = 2;
				if (zone_start >= zone_end) {
					vol->data2_zone_pos = zone_start =
							bmp_initial_pos = 0;
					pass = 2;
				}
				break;
			case 4:
				ntfs_debug("Switching from data2 zone to "
						"data1 zone.");
				/* Update data2 zone position. */
				if (rlpos) {
					LCN tc;

					ntfs_debug("Before checks, "
							"vol->data2_zone_pos "
							"0x%llx.",
							(unsigned long long)
							vol->data2_zone_pos);
					tc = rl[rlpos - 1].lcn +
							rl[rlpos - 1].length;
					if (tc >= vol->mft_zone_start)
						vol->data2_zone_pos = 0;
					else if (bmp_initial_pos >=
						      vol->data2_zone_pos ||
						      tc > vol->data2_zone_pos)
						vol->data2_zone_pos = tc;
					ntfs_debug("After checks, "
							"vol->data2_zone_pos "
							"0x%llx.",
							(unsigned long long)
							vol->data2_zone_pos);
				}
				/* Switch from data2 zone to data1 zone. */
				goto switch_to_data1_zone;
			default:
				panic("%s(): Reached default case in switch "
						"(3)!\n", __FUNCTION__);
			}
			ntfs_debug("After zone switch, search_zone %d, "
					"pass %d, bmp_initial_pos 0x%llx, "
					"zone_start 0x%llx, zone_end 0x%llx.",
					search_zone, pass,
					(unsigned long long)bmp_initial_pos,
					(unsigned long long)zone_start,
					(unsigned long long)zone_end);
			bmp_pos = zone_start;
			if (zone_start == zone_end) {
				ntfs_debug("Empty zone, going to "
						"done_zones_check.");
				/* Empty zone. Don't bother searching it. */
				goto done_zones_check;
			}
			ntfs_debug("Continuing outer while loop.");
			continue;
		} /* done_zones == 7 */
		ntfs_debug("All zones are finished.");
		/*
		 * All zones are finished!  If DATA_ZONE, shrink mft zone.  If
		 * MFT_ZONE, we have really run out of space.
		 */
		mft_zone_size = vol->mft_zone_end - vol->mft_zone_start;
		ntfs_debug("vol->mft_zone_start 0x%llx, vol->mft_zone_end "
				"0x%llx, mft_zone_size 0x%llx.",
				(unsigned long long)vol->mft_zone_start,
				(unsigned long long)vol->mft_zone_end,
				(unsigned long long)mft_zone_size);
		if (zone == MFT_ZONE || mft_zone_size <= 0) {
			ntfs_debug("No free clusters left, going to out.");
			/* Really no more space left on device. */
			err = ENOSPC;
			goto out;
		} /* zone == DATA_ZONE && mft_zone_size > 0 */
		/* Halve the mft zone and retry the reclaimed area as data1. */
		ntfs_debug("Shrinking mft zone.");
		zone_end = vol->mft_zone_end;
		mft_zone_size >>= 1;
		if (mft_zone_size > 0)
			vol->mft_zone_end = vol->mft_zone_start + mft_zone_size;
		else /* mft zone and data2 zone no longer exist. */
			vol->data2_zone_pos = vol->mft_zone_start =
					vol->mft_zone_end = 0;
		if (vol->mft_zone_pos >= vol->mft_zone_end) {
			vol->mft_zone_pos = vol->mft_lcn;
			if (!vol->mft_zone_end)
				vol->mft_zone_pos = 0;
		}
		bmp_pos = zone_start = bmp_initial_pos =
				vol->data1_zone_pos = vol->mft_zone_end;
		search_zone = 2;
		pass = 2;
		done_zones &= ~2;
		ntfs_debug("After shrinking mft zone, mft_zone_size 0x%llx, "
				"vol->mft_zone_start 0x%llx, "
				"vol->mft_zone_end 0x%llx, "
				"vol->mft_zone_pos 0x%llx, search_zone 2, "
				"pass 2, dones_zones 0x%x, zone_start 0x%llx, "
				"zone_end 0x%llx, vol->data1_zone_pos 0x%llx, "
				"continuing outer while loop.",
				(unsigned long long)mft_zone_size,
				(unsigned long long)vol->mft_zone_start,
				(unsigned long long)vol->mft_zone_end,
				(unsigned long long)vol->mft_zone_pos,
				done_zones, (unsigned long long)zone_start,
				(unsigned long long)zone_end,
				(unsigned long long)vol->data1_zone_pos);
	}
	ntfs_debug("After outer while loop.");
out:
	ntfs_debug("At out.");
	/* Add runlist terminator element. */
	if (rl) {
		/*
		 * @rl is only allocated immediately before a run is appended,
		 * so rl non-NULL implies rlpos >= 1 and rl[rlpos - 1] is
		 * valid here.
		 */
		rl[rlpos].vcn = rl[rlpos - 1].vcn + rl[rlpos - 1].length;
		rl[rlpos].lcn = is_extension ? LCN_ENOENT : LCN_RL_NOT_MAPPED;
		rl[rlpos].length = 0;
	}
	/* Release the last mapped bitmap page, if any. */
	if (upl) {
		ntfs_page_unmap(lcnbmp_ni, upl, pl, need_writeback);
		if (need_writeback) {
			ntfs_debug("Marking page dirty.");
			need_writeback = FALSE;
		}
	}
	if (!err) {
		/*
		 * We allocated new clusters thus we need to unmap any
		 * underlying i/os that may be inprogress so our new data
		 * cannot get trampled by old data from the previous owner of
		 * the clusters.
		 */
		if (rl) {
			ntfs_rl_element *trl;
			vnode_t dev_vn = vol->dev_vn;
			const u8 cluster_to_block_shift =
					vol->cluster_size_shift -
					vol->sector_size_shift;

			/* Iterate over the runs in the allocated runlist. */
			for (trl = rl; trl->length; trl++) {
				daddr64_t block, end_block;

				/* Skip sparse/unmapped runs (negative lcn). */
				if (trl->lcn < 0)
					continue;
				/* Determine the starting block of this run. */
				block = trl->lcn << cluster_to_block_shift;
				/* Determine the last block of this run. */
				end_block = block + (trl->length <<
						cluster_to_block_shift);
				/*
				 * For each block in this run, invoke
				 * buf_invalblkno() to ensure no i/o against
				 * the block device can happen once we have
				 * allocated the buffers for other purposes.
				 *
				 * FIXME:/TODO: buf_invalblkno() currently
				 * aborts if msleep() returns an error so we
				 * keep calling it until it returns success.
				 */
				for (; block < end_block; block++) {
					do {
						err = buf_invalblkno(dev_vn,
								block,
								BUF_WAIT);
					} while (err);
				}
			}
		}
		/* Drop locks in reverse order of acquisition. */
		lck_rw_unlock_shared(&lcnbmp_ni->lock);
		(void)vnode_put(lcnbmp_ni->vn);
		lck_rw_unlock_exclusive(&vol->lcnbmp_lock);
		/* Hand the allocated runlist to the caller. */
		if (runlist->alloc)
			OSFree(runlist->rl, runlist->alloc, ntfs_malloc_tag);
		runlist->rl = rl;
		runlist->elements = rlpos + 1;
		runlist->alloc = rlsize;
		ntfs_debug("Done.");
		return 0;
	}
	ntfs_error(vol->mp, "Failed to allocate clusters, aborting (error "
			"%d).", err);
	if (rl) {
		errno_t err2;

		if (err == ENOSPC)
			ntfs_debug("Not enough space to complete allocation, "
					"err ENOSPC, first free lcn 0x%llx, "
					"could allocate up to 0x%llx "
					"clusters.",
					(unsigned long long)rl[0].lcn,
					(unsigned long long)(count - clusters));
		/* Deallocate all allocated clusters. */
		ntfs_debug("Attempting rollback...");
		err2 = ntfs_cluster_free_from_rl_nolock(vol, rl, 0, -1, NULL);
		if (err2) {
			ntfs_error(vol->mp, "Failed to rollback (error %d).  "
					"Leaving inconsistent metadata!  "
					"Unmount and run chkdsk.", err2);
			NVolSetErrors(vol);
		}
		/* Free the runlist. */
		OSFree(rl, rlsize, ntfs_malloc_tag);
	} else if (err == ENOSPC)
		ntfs_debug("No space left at all, err ENOSPC, first free lcn "
				"0x%llx.", (long long)vol->data1_zone_pos);
	lck_rw_unlock_shared(&lcnbmp_ni->lock);
	(void)vnode_put(lcnbmp_ni->vn);
	lck_rw_unlock_exclusive(&vol->lcnbmp_lock);
	return err;
}
865
866/**
867 * ntfs_cluster_free_from_rl_nolock - free clusters from runlist
868 * @vol:	mounted ntfs volume on which to free the clusters
869 * @rl:		runlist describing the clusters to free
870 * @start_vcn:	vcn in the runlist @rl at which to start freeing clusters
871 * @count:	number of clusters to free or -1 for all clusters
872 * @nr_freed:	if not NULL return the number of real clusters freed
873 *
874 * Free @count clusters starting at the cluster @start_vcn in the runlist @rl
875 * on the volume @vol.  If @nr_freed is not NULL, *@nr_freed is set to the
876 * number of real clusters (i.e. not counting sparse clusters) that have been
877 * freed.
878 *
879 * If @count is -1, all clusters from @start_vcn to the end of the runlist are
880 * deallocated.  Thus, to completely free all clusters in a runlist, use
881 * @start_vcn = 0 and @count = -1.
882 *
883 * Note, ntfs_cluster_free_from_rl_nolock() does not modify the runlist, so you
884 * have to remove from the runlist or mark sparse the freed runs later.
885 *
886 * Return 0 on success and errno on error.  Note that if at least some clusters
887 * were freed success is always returned even though not all runs have been
888 * freed yet.  This does not matter as it just means that some clusters are
889 * lost until chkdsk is next run and as we schedule chkdsk to run at next boot
 * this should happen soon.  We do emit a warning about this happening
891 * however.
892 *
893 * Locking: - The volume lcn bitmap must be locked for writing on entry and is
894 *	      left locked on return.
895 *	    - The caller must have locked the runlist @rl for reading or
896 *	      writing.
897 *	    - The caller must have taken an iocount reference on the lcnbmp
898 *	      vnode.
899 */
static errno_t ntfs_cluster_free_from_rl_nolock(ntfs_volume *vol,
		ntfs_rl_element *rl, const VCN start_vcn, s64 count,
		s64 *nr_freed)
{
	s64 delta, to_free, real_freed;
	ntfs_inode *lcnbmp_ni = vol->lcnbmp_ni;
	errno_t err;

	ntfs_debug("Entering for start_vcn 0x%llx, count 0x%llx.",
			(unsigned long long)start_vcn,
			(unsigned long long)count);
	/* Sanity checks: these are caller programming errors, hence panic. */
	if (!lcnbmp_ni)
		panic("%s(): !lcnbmp_ni\n", __FUNCTION__);
	if (start_vcn < 0)
		panic("%s(): start_vcn < 0\n", __FUNCTION__);
	if (count < -1)
		panic("%s(): count < -1\n", __FUNCTION__);
	/* @real_freed counts only real (non-sparse) clusters freed. */
	real_freed = 0;
	if (nr_freed)
		*nr_freed = 0;
	/* Locate the runlist element containing @start_vcn. */
	rl = ntfs_rl_find_vcn_nolock(rl, start_vcn);
	/* No run contains @start_vcn, thus there is nothing to free. */
	if (!rl)
		return 0;
	if (rl->lcn < LCN_HOLE) {
		ntfs_error(vol->mp, "First runlist element has invalid lcn, "
				"aborting.");
		return EIO;
	}
	/* Find the starting cluster inside the run that needs freeing. */
	delta = start_vcn - rl->vcn;
	/* The number of clusters in this run that need freeing. */
	to_free = rl->length - delta;
	if (count >= 0 && to_free > count)
		to_free = count;
	/* Only real runs (lcn >= 0) have bitmap bits to clear; holes do not. */
	if (rl->lcn >= 0) {
		/* Do the actual freeing of the clusters in this run. */
		err = ntfs_bitmap_clear_run(lcnbmp_ni, rl->lcn + delta,
				to_free);
		if (err) {
			ntfs_error(vol->mp, "Failed to clear first run "
					"(error %d), aborting.", err);
			return err;
		}
		/* We have freed @to_free real clusters. */
		real_freed = to_free;
		/* Update the cached free cluster count, clamped to sanity. */
		vol->nr_free_clusters += to_free;
		if (vol->nr_free_clusters > vol->nr_clusters)
			vol->nr_free_clusters = vol->nr_clusters;
	}
	/* Go to the next run and adjust the number of clusters left to free. */
	++rl;
	if (count >= 0)
		count -= to_free;
	/*
	 * Loop over the remaining runs, using @count as a capping value, and
	 * free them.
	 */
	for (; rl->length && count; ++rl) {
		if (rl->lcn < LCN_HOLE) {
			/*
			 * Corrupt run: report it and carry on without
			 * clearing its bits.  Per the function contract
			 * partial success is still success; the lost clusters
			 * are left for chkdsk to recover.
			 */
			ntfs_error(vol->mp, "Runlist element has invalid lcn "
					"%lld at vcn 0x%llx, not freeing.  "
					"Run chkdsk to recover the lost "
					"space.", (long long)rl->lcn,
					(unsigned long long)rl->vcn);
			NVolSetErrors(vol);
		}
		/* The number of clusters in this run that need freeing. */
		to_free = rl->length;
		if (count >= 0 && to_free > count)
			to_free = count;
		if (rl->lcn >= 0) {
			/* Do the actual freeing of the clusters in the run. */
			err = ntfs_bitmap_clear_run(lcnbmp_ni, rl->lcn,
					to_free);
			if (err) {
				/*
				 * Again tolerate partial failure: flag the
				 * volume for chkdsk and keep going.
				 */
				ntfs_warning(vol->mp, "Failed to free "
						"clusters in subsequent run.  "
						"Run chkdsk to recover the "
						"lost space.");
				NVolSetErrors(vol);
			} else {
				vol->nr_free_clusters += to_free;
				if (vol->nr_free_clusters > vol->nr_clusters)
					vol->nr_free_clusters =
							vol->nr_clusters;
			}
			/* We have freed @to_free real clusters. */
			real_freed += to_free;
		}
		/* Adjust the number of clusters left to free. */
		if (count >= 0)
			count -= to_free;
	}
	/* Exhausting the runlist with @count still positive is a bug. */
	if (count > 0)
		panic("%s(): count > 0\n", __FUNCTION__);
	/* We are done.  Return the number of actually freed clusters. */
	ntfs_debug("Done.");
	if (nr_freed)
		*nr_freed = real_freed;
	return 0;
}
1001
1002/**
1003 * ntfs_cluster_free_from_rl - free clusters from runlist
1004 * @vol:	mounted ntfs volume on which to free the clusters
1005 * @rl:		runlist describing the clusters to free
1006 * @start_vcn:	vcn in the runlist @rl at which to start freeing clusters
1007 * @count:	number of clusters to free or -1 for all clusters
1008 * @nr_freed:	if not NULL return the number of real clusters freed
1009 *
1010 * Free @count clusters starting at the cluster @start_vcn in the runlist @rl
1011 * on the volume @vol.  If @nr_freed is not NULL, *@nr_freed is set to the
1012 * number of real clusters (i.e. not counting sparse clusters) that have been
1013 * freed.
1014 *
1015 * If @count is -1, all clusters from @start_vcn to the end of the runlist are
1016 * deallocated.  Thus, to completely free all clusters in a runlist, use
1017 * @start_vcn = 0 and @count = -1.
1018 *
1019 * Note, ntfs_cluster_free_from_rl_nolock() does not modify the runlist, so you
1020 * have to remove from the runlist or mark sparse the freed runs later.
1021 *
1022 * Return 0 on success and errno on error.  Note that if at least some clusters
1023 * were freed success is always returned even though not all runs have been
1024 * freed yet.  This does not matter as it just means that some clusters are
1025 * lost until chkdsk is next run and as we schedule chkdsk to run at next boot
 * this should happen soon.  We do emit a warning about this happening
1027 * however.
1028 *
1029 * Locking: - This function takes the volume lcn bitmap lock for writing and
1030 *	      modifies the bitmap contents.
1031 *	    - The caller must have locked the runlist @rl for reading or
1032 *	      writing.
1033 */
1034errno_t ntfs_cluster_free_from_rl(ntfs_volume *vol, ntfs_rl_element *rl,
1035		const VCN start_vcn, s64 count, s64 *nr_freed)
1036{
1037	ntfs_inode *lcnbmp_ni;
1038	vnode_t lcnbmp_vn;
1039	errno_t err;
1040
1041	lcnbmp_ni = vol->lcnbmp_ni;
1042	lcnbmp_vn = lcnbmp_ni->vn;
1043	lck_rw_lock_exclusive(&vol->lcnbmp_lock);
1044	err = vnode_get(lcnbmp_vn);
1045	if (!err) {
1046		lck_rw_lock_shared(&lcnbmp_ni->lock);
1047		err = ntfs_cluster_free_from_rl_nolock(vol, rl, start_vcn,
1048				count, nr_freed);
1049		lck_rw_unlock_shared(&lcnbmp_ni->lock);
1050		(void)vnode_put(lcnbmp_vn);
1051		lck_rw_unlock_exclusive(&vol->lcnbmp_lock);
1052		return err;
1053	}
1054	lck_rw_unlock_exclusive(&vol->lcnbmp_lock);
1055	ntfs_error(vol->mp, "Failed to get vnode for $Bitmap.");
1056	return err;
1057}
1058
1059/**
1060 * ntfs_cluster_free_nolock - free clusters on an ntfs volume
1061 * @ni:		ntfs inode whose runlist describes the clusters to free
1062 * @start_vcn:	vcn in the runlist of @ni at which to start freeing clusters
1063 * @count:	number of clusters to free or -1 for all clusters
1064 * @ctx:	active attribute search context if present or NULL if not
1065 * @nr_freed:	if not NULL return the number of real clusters freed
1066 * @is_rollback:	true if this is a rollback operation
1067 *
1068 * Free @count clusters starting at the cluster @start_vcn in the runlist
1069 * described by the ntfs inode @ni.  If @nr_freed is not NULL, *@nr_freed is
1070 * set to the number of real clusters (i.e. not counting sparse clusters) that
1071 * have been freed.
1072 *
1073 * If @count is -1, all clusters from @start_vcn to the end of the runlist are
1074 * deallocated.  Thus, to completely free all clusters in a runlist, use
1075 * @start_vcn = 0 and @count = -1.
1076 *
1077 * If @ctx is specified, it is an active search context of @ni and its base mft
1078 * record.  This is needed when ntfs_cluster_free_nolock() encounters unmapped
1079 * runlist fragments and allows their mapping.  If you do not have the mft
1080 * record mapped, you can specify @ctx as NULL and ntfs_cluster_free_nolock()
1081 * will perform the necessary mapping and unmapping.
1082 *
 * Note, ntfs_cluster_free_nolock() saves the state of @ctx on entry and
1084 * restores it before returning.  Thus, @ctx will be left pointing to the same
1085 * attribute on return as on entry.  However, the actual pointers in @ctx may
1086 * point to different memory locations on return, so you must remember to reset
1087 * any cached pointers from the @ctx, i.e. after the call to
1088 * ntfs_cluster_free_nolock(), you will probably want to do:
1089 *	m = ctx->m;
1090 *	a = ctx->a;
1091 * Assuming you cache ctx->a in a variable @a of type ATTR_RECORD * and that
1092 * you cache ctx->m in a variable @m of type MFT_RECORD *.
1093 *
1094 * @is_rollback should always be false, it is for internal use to rollback
1095 * errors.  You probably want to use ntfs_cluster_free() instead.
1096 *
1097 * Note, ntfs_cluster_free_nolock() does not modify the runlist, so you have to
1098 * remove from the runlist or mark sparse the freed runs later.
1099 *
1100 * Return 0 on success and errno on error.
1101 *
1102 * WARNING: If @ctx is supplied, regardless of whether success or failure is
1103 *	    returned, you need to check @ctx->is_error and if 1 the @ctx is no
1104 *	    longer valid, i.e. you need to either call
1105 *	    ntfs_attr_search_ctx_reinit() or ntfs_attr_search_ctx_put() on it.
1106 *	    In that case @ctx->error will give you the error code for why the
1107 *	    mapping of the old inode failed.
1108 *	    Also if @ctx is supplied and the current attribute (or the mft
1109 *	    record it is in) has been modified then the caller must call
1110 *	    NInoSetMrecNeedsDirtying(ctx->ni); before calling
1111 *	    ntfs_map_runlist_nolock() or the changes may be lost.
1112 *
1113 * Locking: - The volume lcn bitmap must be locked for writing on entry and is
1114 *	      left locked on return.
1115 *	    - The runlist described by @ni must be locked for writing on entry
1116 *	      and is locked on return.  Note the runlist may be modified when
1117 *	      needed runlist fragments need to be mapped.
1118 *	    - If @ctx is NULL, the base mft record of @ni must not be mapped on
1119 *	      entry and it will be left unmapped on return.
1120 *	    - If @ctx is not NULL, the base mft record must be mapped on entry
1121 *	      and it will be left mapped on return.
1122 *	    - The caller must have taken an iocount reference on the lcnbmp
1123 *	      vnode.
1124 */
static errno_t ntfs_cluster_free_nolock(ntfs_inode *ni, const VCN start_vcn,
		s64 count, ntfs_attr_search_ctx *ctx, s64 *nr_freed,
		const BOOL is_rollback)
{
	s64 delta, to_free, total_freed, real_freed;
	ntfs_volume *vol;
	ntfs_inode *lcnbmp_ni;
	ntfs_rl_element *rl;
	errno_t err, err2;

	vol = ni->vol;
	lcnbmp_ni = vol->lcnbmp_ni;
	/* Sanity checks: these are caller programming errors, hence panic. */
	if (!lcnbmp_ni)
		panic("%s(): !lcnbmp_ni\n", __FUNCTION__);
	if (start_vcn < 0)
		panic("%s(): start_vcn < 0\n", __FUNCTION__);
	if (count < -1)
		panic("%s(): count < -1\n", __FUNCTION__);
	/*
	 * @total_freed counts all processed clusters (including sparse ones)
	 * so a rollback knows how far to undo; @real_freed counts only real
	 * clusters for the caller's benefit.
	 */
	total_freed = real_freed = 0;
	if (nr_freed)
		*nr_freed = 0;
	/*
	 * Look up the run containing @start_vcn, mapping the runlist fragment
	 * (via @ctx if supplied) when it is not mapped yet.
	 */
	err = ntfs_attr_find_vcn_nolock(ni, start_vcn, &rl, ctx);
	if (err) {
		/* Suppress error output while rolling back. */
		if (!is_rollback)
			ntfs_error(vol->mp, "Failed to find first runlist "
					"element (error %d), aborting.", err);
		goto err;
	}
	if (rl->lcn < LCN_HOLE) {
		if (!is_rollback)
			ntfs_error(vol->mp, "First runlist element has "
					"invalid lcn, aborting.");
		err = EIO;
		goto err;
	}
	/* Find the starting cluster inside the run that needs freeing. */
	delta = start_vcn - rl->vcn;
	/* The number of clusters in this run that need freeing. */
	to_free = rl->length - delta;
	if (count >= 0 && to_free > count)
		to_free = count;
	/* Only real runs (lcn >= 0) have bitmap bits; holes are skipped. */
	if (rl->lcn >= 0) {
		/*
		 * Do the actual freeing of the clusters in this run.  When
		 * rolling back, the bits are set again instead of cleared.
		 */
		err = ntfs_bitmap_set_bits_in_run(lcnbmp_ni, rl->lcn + delta,
				to_free, !is_rollback ? 0 : 1);
		if (err) {
			if (!is_rollback)
				ntfs_error(vol->mp, "Failed to clear first run "
						"(error %d), aborting.", err);
			goto err;
		}
		/* We have freed @to_free real clusters. */
		real_freed = to_free;
		/* Keep the cached free cluster count in sync, clamped. */
		if (is_rollback) {
			vol->nr_free_clusters -= to_free;
			if (vol->nr_free_clusters < 0)
				vol->nr_free_clusters = 0;
		} else {
			vol->nr_free_clusters += to_free;
			if (vol->nr_free_clusters > vol->nr_clusters)
				vol->nr_free_clusters = vol->nr_clusters;
		}
	}
	/* Go to the next run and adjust the number of clusters left to free. */
	++rl;
	if (count >= 0)
		count -= to_free;
	/* Keep track of the total "freed" clusters, including sparse ones. */
	total_freed = to_free;
	/*
	 * Loop over the remaining runs, using @count as a capping value, and
	 * free them.
	 */
	for (; count; ++rl) {
		if (rl->lcn < LCN_HOLE) {
			VCN vcn;

			/*
			 * If we have reached the end of the runlist we are
			 * done.  We need this when @count is -1 so that we
			 * detect the end of the runlist.
			 */
			if (rl->lcn == LCN_ENOENT)
				break;
			/* Attempt to map runlist. */
			vcn = rl->vcn;
			err = ntfs_attr_find_vcn_nolock(ni, vcn, &rl, ctx);
			if (err) {
				/*
				 * If we have reached the end of the runlist we
				 * are done.  We need this when @count is -1 so
				 * that we detect the end of the runlist.
				 */
				if (err == ENOENT)
					break;
				if (!is_rollback)
					ntfs_error(vol->mp, "Failed to map "
							"runlist fragment or "
							"failed to find "
							"subsequent runlist "
							"element.");
				goto err;
			}
			/* Mapping succeeded but the run is still invalid. */
			if (rl->lcn < LCN_HOLE) {
				if (!is_rollback)
					ntfs_error(vol->mp, "Runlist element "
							"has invalid lcn "
							"(0x%llx).",
							(unsigned long long)
							rl->lcn);
				err = EIO;
				goto err;
			}
		}
		/* The number of clusters in this run that need freeing. */
		to_free = rl->length;
		if (count >= 0 && to_free > count)
			to_free = count;
		if (rl->lcn >= 0) {
			/* Do the actual freeing of the clusters in the run. */
			err = ntfs_bitmap_set_bits_in_run(lcnbmp_ni, rl->lcn,
					to_free, !is_rollback ? 0 : 1);
			if (err) {
				if (!is_rollback)
					ntfs_error(vol->mp, "Failed to clear "
							"subsequent run.");
				goto err;
			}
			/* We have freed @to_free real clusters. */
			real_freed += to_free;
			/* Keep the cached free cluster count in sync. */
			if (is_rollback) {
				vol->nr_free_clusters -= to_free;
				if (vol->nr_free_clusters < 0)
					vol->nr_free_clusters = 0;
			} else {
				vol->nr_free_clusters += to_free;
				if (vol->nr_free_clusters > vol->nr_clusters)
					vol->nr_free_clusters =
							vol->nr_clusters;
			}
		}
		/* Adjust the number of clusters left to free. */
		if (count >= 0)
			count -= to_free;
		/* Update the total done clusters. */
		total_freed += to_free;
	}
	/* Exhausting the runlist with @count still positive is a bug. */
	if (count > 0)
		panic("%s(): count > 0\n", __FUNCTION__);
	/* We are done.  Return the number of actually freed clusters. */
	ntfs_debug("Done.");
	if (nr_freed)
		*nr_freed = real_freed;
	return 0;
err:
	/* A failed rollback is reported by the original invocation below. */
	if (is_rollback)
		return err;
	/* If no real clusters were freed, no need to rollback. */
	if (!real_freed)
		return err;
	/*
	 * Attempt to rollback and if that succeeds just return the error code.
	 * If rollback fails, set the volume errors flag, emit an error
	 * message, and return the error code.
	 */
	err2 = ntfs_cluster_free_nolock(ni, start_vcn, total_freed, ctx, NULL,
			TRUE);
	if (err2) {
		ntfs_error(vol->mp, "Failed to rollback (error %d).  Leaving "
				"inconsistent metadata!  Unmount and run "
				"chkdsk.", err2);
		NVolSetErrors(vol);
	}
	ntfs_error(vol->mp, "Aborting (error %d).", err);
	return err;
}
1301
1302/**
1303 * ntfs_cluster_free - free clusters on an ntfs volume
1304 * @ni:		ntfs inode whose runlist describes the clusters to free
1305 * @start_vcn:	vcn in the runlist of @ni at which to start freeing clusters
1306 * @count:	number of clusters to free or -1 for all clusters
1307 * @ctx:	active attribute search context if present or NULL if not
1308 * @nr_freed:	if not NULL return the number of real clusters freed
1309 *
1310 * Free @count clusters starting at the cluster @start_vcn in the runlist
1311 * described by the ntfs inode @ni.  If @nr_freed is not NULL, *@nr_freed is
1312 * set to the number of real clusters (i.e. not counting sparse clusters) that
1313 * have been freed.
1314 *
1315 * If @count is -1, all clusters from @start_vcn to the end of the runlist are
1316 * deallocated.  Thus, to completely free all clusters in a runlist, use
1317 * @start_vcn = 0 and @count = -1.
1318 *
1319 * If @ctx is specified, it is an active search context of @ni and its base mft
1320 * record.  This is needed when ntfs_cluster_free() encounters unmapped runlist
1321 * fragments and allows their mapping.  If you do not have the mft record
1322 * mapped, you can specify @ctx as NULL and ntfs_cluster_free() will perform
1323 * the necessary mapping and unmapping.
1324 *
1325 * Note, ntfs_cluster_free() saves the state of @ctx on entry and restores it
1326 * before returning.  Thus, @ctx will be left pointing to the same attribute on
1327 * return as on entry.  However, the actual pointers in @ctx may point to
1328 * different memory locations on return, so you must remember to reset any
1329 * cached pointers from the @ctx, i.e. after the call to ntfs_cluster_free(),
1330 * you will probably want to do:
1331 *	m = ctx->m;
1332 *	a = ctx->a;
1333 * Assuming you cache ctx->a in a variable @a of type ATTR_RECORD * and that
1334 * you cache ctx->m in a variable @m of type MFT_RECORD *.
1335 *
1336 * Note, ntfs_cluster_free() does not modify the runlist, so you have to remove
1337 * from the runlist or mark sparse the freed runs later.
1338 *
1339 * Return 0 on success and errno on error.
1340 *
1341 * WARNING: If @ctx is supplied, regardless of whether success or failure is
1342 *	    returned, you need to check @ctx->is_error and if 1 the @ctx is no
1343 *	    longer valid, i.e. you need to either call
1344 *	    ntfs_attr_search_ctx_reinit() or ntfs_attr_search_ctx_put() on it.
1345 *	    In that case @ctx->error will give you the error code for why the
1346 *	    mapping of the old inode failed.
1347 *	    Also if @ctx is supplied and the current attribute (or the mft
1348 *	    record it is in) has been modified then the caller must call
1349 *	    NInoSetMrecNeedsDirtying(ctx->ni); before calling
1350 *	    ntfs_map_runlist_nolock() or the changes may be lost.
1351 *
1352 * Locking: - The runlist described by @ni must be locked for writing on entry
1353 *	      and is locked on return.  Note the runlist may be modified when
1354 *	      needed runlist fragments need to be mapped.
1355 *	    - The volume lcn bitmap must be unlocked on entry and is unlocked
1356 *	      on return.
1357 *	    - This function takes the volume lcn bitmap lock for writing and
1358 *	      modifies the bitmap contents.
1359 *	    - If @ctx is NULL, the base mft record of @ni must not be mapped on
1360 *	      entry and it will be left unmapped on return.
1361 *	    - If @ctx is not NULL, the base mft record must be mapped on entry
1362 *	      and it will be left mapped on return.
1363 */
1364errno_t ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn, s64 count,
1365		ntfs_attr_search_ctx *ctx, s64 *nr_freed)
1366{
1367	ntfs_volume *vol;
1368	ntfs_inode *lcnbmp_ni;
1369	vnode_t lcnbmp_vn;
1370	errno_t err;
1371
1372	if (!ni)
1373		panic("%s(): !ni\n", __FUNCTION__);
1374	ntfs_debug("Entering for mft_no 0x%llx, start_vcn 0x%llx, count "
1375			"0x%llx.", (unsigned long long)ni->mft_no,
1376			(unsigned long long)start_vcn,
1377			(unsigned long long)count);
1378	vol = ni->vol;
1379	lcnbmp_ni = vol->lcnbmp_ni;
1380	lcnbmp_vn = lcnbmp_ni->vn;
1381	lck_rw_lock_exclusive(&vol->lcnbmp_lock);
1382	err = vnode_get(lcnbmp_vn);
1383	if (!err) {
1384		lck_rw_lock_shared(&lcnbmp_ni->lock);
1385		err = ntfs_cluster_free_nolock(ni, start_vcn, count, ctx,
1386				nr_freed, FALSE);
1387		lck_rw_unlock_shared(&lcnbmp_ni->lock);
1388		(void)vnode_put(lcnbmp_vn);
1389		lck_rw_unlock_exclusive(&vol->lcnbmp_lock);
1390		return err;
1391	}
1392	lck_rw_unlock_exclusive(&vol->lcnbmp_lock);
1393	ntfs_error(vol->mp, "Failed to get vnode for $Bitmap.");
1394	return err;
1395}
1396