1/*
2 * ntfs_compress.c - NTFS kernel compressed attribute operations.
3 *
4 * Copyright (c) 2006-2008 Anton Altaparmakov.  All Rights Reserved.
5 * Portions Copyright (c) 2006-2008 Apple Inc.  All Rights Reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright notice,
11 *    this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright notice,
13 *    this list of conditions and the following disclaimer in the documentation
14 *    and/or other materials provided with the distribution.
15 * 3. Neither the name of Apple Inc. ("Apple") nor the names of its
16 *    contributors may be used to endorse or promote products derived from this
17 *    software without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
20 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 * DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
23 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
24 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
26 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
28 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 *
30 * ALTERNATIVELY, provided that this notice and licensing terms are retained in
31 * full, this file may be redistributed and/or modified under the terms of the
32 * GNU General Public License (GPL) Version 2, in which case the provisions of
33 * that version of the GPL will apply to you instead of the license terms
34 * above.  You can obtain a copy of the GPL Version 2 at
35 * http://developer.apple.com/opensource/licenses/gpl-2.txt.
36 */
37
38#include <sys/errno.h>
39#include <sys/ucred.h>
40#include <sys/ubc.h>
41#include <sys/uio.h>
42#include <sys/types.h>
43
44#include <mach/memory_object_types.h>
45
46#include <string.h>
47
48#include <libkern/OSMalloc.h>
49
50#include <kern/debug.h>
51#include <kern/locks.h>
52
53#include "ntfs.h"
54#include "ntfs_attr.h"
55#include "ntfs_compress.h"
56#include "ntfs_debug.h"
57#include "ntfs_inode.h"
58#include "ntfs_runlist.h"
59#include "ntfs_types.h"
60#include "ntfs_volume.h"
61
/**
 * Compression related constants.
 */
enum {
	/*
	 * Compression block (cb) types as returned by ntfs_get_cb_type().
	 * These are negative so the same return value can also carry a
	 * positive errno on error.
	 */
	NTFS_CB_SPARSE		= -1,
	NTFS_CB_COMPRESSED	= -2,
	NTFS_CB_UNCOMPRESSED	= -3,

	/* Compression sub-block (sb) constants. */
	NTFS_SB_SIZE_MASK	= 0x0fff,	/* Extracts the (length - 3) field of a sb header. */
	NTFS_SB_SIZE		= 0x1000,	/* Decompressed size of a sb (4kiB). */
	NTFS_SB_IS_COMPRESSED	= 0x8000,	/* Sb header flag: this sb is compressed. */

	/* Token types and access mask. */
	NTFS_SYMBOL_TOKEN	= 0,	/* Tag bit clear: literal byte. */
	NTFS_PHRASE_TOKEN	= 1,	/* Tag bit set: (back pointer, length) pair. */
	NTFS_TOKEN_MASK		= 1,	/* Mask to extract a token type bit from a tag. */
};
81
/**
 * ntfs_get_cb_type - determine the type of a compression block
 * @ni:		raw ntfs inode to which the compression block belongs
 * @ofs:	byte offset of start of compression block
 *
 * Determine whether the compression block is sparse, compressed, or
 * uncompressed by looking at the runlist.
 *
 * If the first cluster in the compression block is sparse the whole
 * compression block is sparse.  In that case we return NTFS_CB_SPARSE.
 *
 * If the last cluster in the compression block is sparse the compression block
 * is compressed.  In that case we return NTFS_CB_COMPRESSED.
 *
 * If the whole compression block is backed by real clusters it is not
 * compressed.  In that case we return NTFS_CB_UNCOMPRESSED.
 *
 * Return the compression block type (< 0) and errno (>= 0) on error.
 *
 * Locking: Takes and drops @ni->rl.lock.  The lock is taken shared and only
 * upgraded to exclusive if the runlist turns out to need mapping.
 */
static inline int ntfs_get_cb_type(ntfs_inode *ni, s64 ofs)
{
	VCN start_vcn, vcn, end_vcn;
	LCN lcn;
	s64 clusters;
	ntfs_rl_element *rl;
	int ret = EIO;	/* Default error code for the failure paths below. */
#ifdef DEBUG
	const char *cb_type_str[3] = { "sparse", "compressed", "uncompressed" };
#endif /* DEBUG */
	BOOL is_retry, write_locked;

	ntfs_debug("Entering for compressed file inode 0x%llx, offset 0x%llx.",
			(unsigned long long)ni->mft_no,
			(unsigned long long)ofs);
	/* Convert the byte offset and block size to units of clusters. */
	vcn = start_vcn = ofs >> ni->vol->cluster_size_shift;
	end_vcn = start_vcn + ni->compression_block_clusters;
	write_locked = is_retry = FALSE;
	lck_rw_lock_shared(&ni->rl.lock);
retry_remap:
	rl = ni->rl.rl;
	if (ni->rl.elements) {
next_vcn:
		/* Seek to element containing target vcn. */
		while (rl->length && rl[1].vcn <= vcn)
			rl++;
		lcn = ntfs_rl_vcn_to_lcn(rl, vcn, &clusters);
		/*
		 * If we found a hole we are done.  If we were looking up the
		 * starting vcn the entire compression block must be sparse.
		 * Otherwise only part of the compression block is sparse, thus
		 * it is compressed.
		 */
		if (lcn == LCN_HOLE) {
			ret = NTFS_CB_COMPRESSED;
			if (vcn == start_vcn)
				ret = NTFS_CB_SPARSE;
			goto done;
		}
		/*
		 * If we found a real cluster allocation and it covers the end
		 * of the compression block the compression block is not
		 * compressed.  Otherwise we need to look at the last cluster
		 * in the compression block.  Note there is no need to look at
		 * the intervening clusters because it is not possible to have
		 * sparse clusters in the middle of a compression block but not
		 * at its end and neither is it possible to have a partial
		 * compression block at the end of the attribute.
		 */
		if (lcn >= 0) {
			if (vcn + clusters >= end_vcn) {
				ret = NTFS_CB_UNCOMPRESSED;
				goto done;
			}
			/* Look up the last cluster of the compression block. */
			vcn = end_vcn - 1;
			/*
			 * Reset @is_retry so the last vcn also gets one
			 * mapping attempt if it is not mapped yet.
			 */
			is_retry = FALSE;
			goto next_vcn;
		}
	} else
		lcn = LCN_RL_NOT_MAPPED;
	/* The runlist is not mapped or an error occurred. */
	if (!write_locked) {
		write_locked = TRUE;
		/*
		 * If the shared to exclusive upgrade fails the shared lock
		 * has been dropped, so take the lock exclusive and restart
		 * the lookup as the runlist may have changed in the meantime.
		 */
		if (!lck_rw_lock_shared_to_exclusive(&ni->rl.lock)) {
			lck_rw_lock_exclusive(&ni->rl.lock);
			goto retry_remap;
		}
	}
	if (!is_retry && lcn == LCN_RL_NOT_MAPPED) {
		/* Map the runlist fragment containing @vcn and retry once. */
		ret = ntfs_map_runlist_nolock(ni, vcn, NULL);
		if (!ret) {
			is_retry = TRUE;
			goto retry_remap;
		}
	} else if (!ret)
		/*
		 * We already retried or the runlist is in error and no error
		 * code has been recorded, so report a generic i/o error.
		 */
		ret = EIO;
done:
	if (write_locked)
		lck_rw_unlock_exclusive(&ni->rl.lock);
	else
		lck_rw_unlock_shared(&ni->rl.lock);
	/* A negative @ret is a compression block type, positive is an errno. */
	if (ret < 0)
		ntfs_debug("Done (compression block is %s).",
				cb_type_str[-ret - 1]);
	else
		ntfs_error(ni->vol->mp, "Failed (error %d, lcn %lld).", ret,
				(long long)lcn);
	return ret;
}
190
191/**
192 * ntfs_decompress - decompress a compression block into a destination buffer
193 *
194 * Decompress the compression block @cb_start of size @cb_size into the
195 * destination buffer @dst_start.
196 *
197 * If @pl is not NULL, i.e. a page list is present, skip over any valid pages
198 * in the destination buffer.
199 *
200 * Return 0 on success and errno on error.
201 *
202 * The decompression algorithm:
203 *
204 * Compressed data is organized in logical "compression" blocks (cb).  Each cb
205 * has a size (cb_size) of 2^compression_unit clusters.  In all versions of
206 * Windows, NTFS (NT/2k/XP, NTFS 1.2-3.1), the only valid compression_unit is
207 * 4, IOW, each cb is 2^4 = 16 clusters in size.
208 *
209 * Compression is only supported for cluster sizes between 512 and 4096. Thus a
210 * cb can be between 8 and 64kiB in size.
211 *
212 * Each cb is independent of the other cbs and is thus the minimal unit we have
213 * to parse even if we wanted to decompress only one byte.
214 *
215 * Also, a cb can be totally uncompressed and this would be indicated as a
216 * sparse cb in the runlist.
217 *
218 * Thus, we need to look at the runlist of the compressed data stream, starting
219 * at the beginning of the first cb overlapping @page. So we convert the page
220 * offset into units of clusters (vcn), and round the vcn down to a mutliple of
221 * cb_size clusters.
222 *
223 * We then scan the runlist for the appropriate position. Based on what we find
224 * there, we decide how to proceed.
225 *
226 * If the cb is not compressed at all, we copy the uncompressed data over from
227 * the compressed data.
228 *
229 * If the cb is completely sparse, we just zero out the destination.
230 *
231 * In all other cases we initiate the decompression engine, but first some more
232 * on the compression algorithm.
233 *
234 * Before compression the data of each cb is further divided into 4kiB blocks,
235 * we call them "sub compression" blocks (sb), each including a header
236 * specifying its compressed length.  So we could just scan the cb for the
237 * first sb overlapping page and skip the sbs before that, or we could
238 * decompress the whole cb injecting the superfluous decompressed pages into
239 * the page cache as a form of read ahead (this is what we do when invoked via
240 * VNOP_READ()).
241 *
242 * In either case, we then need to read and decompress all sbs overlapping the
243 * destination, potentially having to decompress one or more other cbs, too.
244 *
245 * Because the sbs follow each other directly, we need to actually read in the
246 * whole cb in order to be able to scan through the cb to find the first sb
247 * overlapping the destination.
248 *
249 * So, we read the whole cb from disk and start at the first sb.
250 *
251 * As mentioned above, each sb is started with a header.  The header is 16 bits
252 * of which the lower twelve bits (i.e. bits 0 to 11) are the length (L) - 3 of
253 * the sb (including the two bytes for the header itself, or L - 1 not counting
254 * the two bytes for the header).  The higher four bits are set to 1011 (0xb)
255 * by the compressor for a compressed block, or to 0000 for an uncompressed
256 * block, but the decompressor only checks the most significant bit taking a 1
257 * to signify a compressed block, and a 0 an uncompressed block.
258 *
259 * So from the header we know how many compressed bytes we need to decompress to
260 * obtain the next 4kiB of uncompressed data and if we did not want to
261 * decompress this sb we could just seek to the next next one using the length
262 * read from the header.  We could then continue seeking until we reach the
263 * first sb overlapping the destination.
264 *
265 * In either case, we will reach a sb which we want to decompress.
266 *
267 * Having dealt with the 16-bit header of the sb, we now have length bytes of
268 * compressed data to decompress.  This compressed stream is further split into
269 * tokens which are organized into groups of eight tokens.  Each token group
270 * (tg) starts with a tag byte, which is an eight bit bitmap, the bits
271 * specifying the type of each of the following eight tokens.  The least
272 * significant bit (LSB) corresponds to the first token and the most
273 * significant bit (MSB) corresponds to the last token.
274 *
275 * The two types of tokens are symbol tokens, specified by a zero bit, and
276 * phrase tokens, specified by a set bit.
277 *
278 * A symbol token (st) is a single byte and is to be taken literally and copied
279 * into the sliding window (the decompressed data).
280 *
281 * A phrase token (pt) is a pointer back into the sliding window (in bytes),
282 * together with a length (again in bytes), starting at the byte the back
283 * pointer is pointing to.  Thus a phrase token defines a sequence of bytes in
284 * the sliding window which need to be copied at the current position into the
285 * sliding window (the decompressed data stream).
286 *
287 * Each pt consists of 2 bytes split into the back pointer (p) and the length
288 * (l), each of variable bit width (but the sum of the widths of p and l is
289 * fixed at 16 bits).  p is at least 4 bits and l is at most 12 bits.
290 *
291 * The most significant bits contain the back pointer (p), while the least
292 * significant bits contain the length (l).
293 *
294 * l is actually stored as the number of bytes minus 3 (unsigned) as anything
295 * shorter than that would be at least as long as the 2 bytes needed for the
296 * actual pt, so no compression would be achieved.
297 *
298 * p is stored as the positive number of bytes minus 1 (unsigned) as going zero
299 * bytes back is meaningless.
300 *
301 * Note that decompression has to occur byte by byte, as it is possible that
302 * some of the bytes pointed to by the pt will only be generated in the sliding
303 * window as the byte sequence pointed to by the pt is being copied into it!
304 *
305 * To give a concrete example: a block full of the letter A would be compressed
306 * by storing the byte A once as a symbol token, followed by a single phrase
307 * token with back pointer -1 (p = 0, therefore go back by -(0 + 1) bytes) and
308 * length 4095 (l=0xffc, therefore length 0xffc + 3 bytes).
309 *
310 * The widths of p and l are determined from the current position within the
311 * decompressed data (dst).  We do not actually care about the widths as such
312 * however, but instead we want the mask (l_mask) with which to AND the pt to
313 * obtain l, and the number of bits (p_shift) by which to right shift the pt to
314 * obtain p.  These are determined using the following algorithm:
315 *
316 * for (i = cur_pos, l_mask = 0xfff, p_shift = 12; i >= 0x10; i >>= 1) {
317 *	l_mask >>= 1;
318 *	p_shift--;
319 * }
320 *
321 * The above is the conventional algorithm.  As an optimization we actually use
322 * a different algorithm as this offers O(1) performance instead of O(n) of the
323 * above conventional algorithm.  Our optimized algorithm first calculates
324 * log2(current destination position in sb) and then uses that to derive p and
325 * l without having to iterate.  We just need an arch-optimized log2() function
326 * now to make it really fast as we for now still have a small loop which we
327 * need to determine the log2().  See the below code for details.
328 *
329 * Note, that as usual in NTFS, the sb header, as well as each pt, are stored
330 * in little endian format.
331 */
332static inline errno_t ntfs_decompress(ntfs_volume *vol, u8 *dst_start,
333		const int dst_ofs_in_cb, int dst_size, u8 *const cb_start,
334		const int cb_size, upl_page_info_t *pl, int cur_pg,
335		const int pages_per_cb)
336{
337	/*
338	 * Pointers into the compressed data, i.e. the compression block (cb),
339	 * and the therein contained sub-blocks (sb).
340	 */
341	u8 *cb;			/* Current position in compression block. */
342	u8 *cb_end;		/* End of compression block. */
343	u8 *cb_sb_start;	/* Beginning of the current sb in the cb. */
344	u8 *cb_sb_end;		/* End of current sb / beginning of next sb. */
345	/* Variables for uncompressed data / destination buffer. */
346	u8 *dst;		/* Current position in destination. */
347	u8 *dst_end;		/* End of destination buffer. */
348	u8 *dst_sb_start;	/* Start of current sub-block in destination. */
349	u8 *dst_sb_end;		/* End of current sub-block in destination. */
350	/* Variables for tag and token parsing. */
351	u8 tag;			/* Current tag. */
352	unsigned token;		/* Loop counter for the eight tokens in tag. */
353	unsigned skip_sbs;
354	BOOL skip_valid_pages;
355
356	ntfs_debug("Entering, compression block size 0x%x bytes.", cb_size);
357	/* Do we need to test for and skip valid pages in the destination? */
358	skip_valid_pages = FALSE;
359	if (pl && pages_per_cb > 1)
360		skip_valid_pages = TRUE;
361	/*
362	 * Do we need to skip any sub-blocks because the destination buffer
363	 * does not begin at the beginning of the compression block?
364	 */
365	skip_sbs = 0;
366	if (dst_ofs_in_cb)
367		skip_sbs = dst_ofs_in_cb / NTFS_SB_SIZE;
368	cb = cb_start;
369	cb_end = cb_start + cb_size;
370	dst = dst_start;
371	dst_end = dst + dst_size;
372next_sb:
373	ntfs_debug("Beginning sub-block at offset 0x%lx in the compression "
374			"block.", (unsigned long)(cb - cb_start));
375	/*
376	 * Have we reached the end of the compression block or the end of the
377	 * decompressed data?  The latter can happen for example if the current
378	 * position in the compression block is one byte before its end so the
379	 * first two checks do not detect it.
380	 */
381	if (cb == cb_end || !le16_to_cpup((le16*)cb) || dst == dst_end) {
382		ntfs_debug("Done.");
383		return 0;
384	}
385	/* Setup offsets for the current sub-block destination. */
386	dst_sb_start = dst;
387	dst_sb_end = dst + NTFS_SB_SIZE;
388	/* Check that we are still within allowed boundaries. */
389	if (dst_sb_end > dst_end)
390		goto err;
391	/* Does the minimum size of a compressed sb overflow valid range? */
392	if (cb + 6 > cb_end)
393		goto err;
394	/* Setup the current sub-block source pointers and validate range. */
395	cb_sb_start = cb;
396	cb_sb_end = cb + (le16_to_cpup((le16*)cb) & NTFS_SB_SIZE_MASK) + 3;
397	if (cb_sb_end > cb_end)
398		goto err;
399	/*
400	 * If the destination buffer does not start at the beginning of the
401	 * compression block, skip sub-blocks until we reach the beginning of
402	 * the destination buffer.
403	 */
404	if (skip_sbs) {
405		skip_sbs--;
406		/* Advance source position to next sub-block. */
407		cb = cb_sb_end;
408		goto next_sb;
409	}
410	/*
411	 * If the destination page corresponding to this sub-block is valid,
412	 * skip the sub-block.
413	 */
414	if (skip_valid_pages) {
415		BOOL skip_sb;
416
417		skip_sb = upl_valid_page(pl, cur_pg);
418		/*
419		 * Advance current page if the destination pointer is going to
420		 * cross a page boundary.  Doing this here unconditionally
421		 * means we do not need to advance it later on when switching
422		 * to the next sub-block thus it saves us one test for
423		 * @skip_valid_pages.
424		 */
425		if (!((dst - dst_start + NTFS_SB_SIZE) & PAGE_MASK))
426			cur_pg++;
427		if (skip_sb) {
428			/* Advance position to next sub-block. */
429			cb = cb_sb_end;
430			dst = dst_sb_end;
431			goto next_sb;
432		}
433	}
434	/* Now, we are ready to process the current sub-block. */
435	if (!(le16_to_cpup((le16*)cb) & NTFS_SB_IS_COMPRESSED)) {
436		/*
437		 * This sb is not compressed, just copy its data into the
438		 * destination buffer.
439		 */
440		ntfs_debug("Found uncompressed sub-block.");
441		/* Advance source position to first data byte. */
442		cb += 2;
443		/* An uncompressed sb must be full size. */
444		if (cb_sb_end - cb != NTFS_SB_SIZE)
445			goto err;
446		/* Copy the sub-block data. */
447		memcpy(dst, cb, NTFS_SB_SIZE);
448		/* Advance position to next sub-block. */
449		cb = cb_sb_end;
450		dst = dst_sb_end;
451		goto next_sb;
452	}
453	/* This sb is compressed, decompress it into the destination buffer. */
454	ntfs_debug("Found compressed sub-block.");
455	/* Forward to the first tag in the sub-block. */
456	cb += 2;
457next_tag:
458	if (cb == cb_sb_end) {
459		/* Check if the decompressed sub-block was not full-length. */
460		if (dst < dst_sb_end) {
461			ntfs_debug("Filling incomplete sub-block with "
462					"zeroes.");
463			/* Zero remainder and update destination position. */
464			bzero(dst, dst_sb_end - dst);
465			dst = dst_sb_end;
466		}
467		/* We have finished the current sub-block. */
468		goto next_sb;
469	}
470	/* Check we are still in range. */
471	if (cb > cb_sb_end || dst > dst_sb_end)
472		goto err;
473	/* Get the next tag and advance to first token. */
474	tag = *cb++;
475	/* Parse the eight tokens described by the tag. */
476	for (token = 0; token < 8; token++, tag >>= 1) {
477		unsigned lg, u, pt, length, max_non_overlap;
478		u8 *dst_back_addr;
479
480		/* Check if we are done / still in range. */
481		if (cb >= cb_sb_end || dst > dst_sb_end)
482			break;
483		/* Determine token type and parse appropriately.*/
484		if ((tag & NTFS_TOKEN_MASK) == NTFS_SYMBOL_TOKEN) {
485			/*
486			 * We have a symbol token, copy the symbol across, and
487			 * advance the source and destination positions.
488			 */
489			*dst++ = *cb++;
490			/* Continue with the next token. */
491			continue;
492		}
493		/*
494		 * We have a phrase token.  Make sure it is not the first tag
495		 * in the sub-block as this is illegal and would confuse the
496		 * code below.
497		 */
498		if (dst == dst_sb_start)
499			goto err;
500		/*
501		 * Determine the number of bytes to go back (p) and the number
502		 * of bytes to copy (l).  We use an optimized algorithm in
503		 * which we first calculate log2(current destination position
504		 * in sb), which allows determination of l and p in O(1) rather
505		 * than O(n).  We just need an arch-optimized log2() function
506		 * now.
507		 */
508		lg = 0;
509		for (u = dst - dst_sb_start - 1; u >= 0x10; u >>= 1)
510			lg++;
511		/* Get the phrase token. */
512		pt = le16_to_cpup((u16*)cb);
513		/*
514		 * Calculate starting position of the byte sequence in the
515		 * destination using the fact that p = (pt >> (12 - lg)) + 1
516		 * and make sure we do not go too far back.
517		 */
518		dst_back_addr = dst - (pt >> (12 - lg)) - 1;
519		if (dst_back_addr < dst_sb_start)
520			goto err;
521		/* Now calculate the length (l) of the byte sequence. */
522		length = (pt & (0xfff >> lg)) + 3;
523		/* Verify destination is in range. */
524		if (dst + length > dst_sb_end)
525			goto err;
526		/* The number of non-overlapping bytes. */
527		max_non_overlap = dst - dst_back_addr;
528		if (length <= max_non_overlap) {
529			/* The byte sequence does not overlap, just copy it. */
530			memcpy(dst, dst_back_addr, length);
531			/* Advance destination pointer. */
532			dst += length;
533		} else {
534			/*
535			 * The byte sequence does overlap, copy non-overlapping
536			 * part and then do a slow byte by byte copy for the
537			 * overlapping part.  Also, advance the destination
538			 * pointer.
539			 */
540			memcpy(dst, dst_back_addr, max_non_overlap);
541			dst += max_non_overlap;
542			dst_back_addr += max_non_overlap;
543			length -= max_non_overlap;
544			while (length--)
545				*dst++ = *dst_back_addr++;
546		}
547		/* Advance source position and continue with the next token. */
548		cb += 2;
549	}
550	/* No tokens left in the current tag.  Continue with the next tag. */
551	goto next_tag;
552err:
553	ntfs_error(vol->mp, "Compressed data is corrupt.  Run chkdsk.");
554	NVolSetErrors(vol);
555	return EOVERFLOW;
556}
557
/**
 * ntfs_read_compressed - read and decompress data from a compressed attribute
 * @ni:			non-raw ntfs inode to which the raw inode belongs
 * @raw_ni:		raw compressed ntfs inode to read from
 * @ofs_start:		byte offset into uncompressed data stream to read from
 * @count:		number of bytes to return in the destination buffer
 * @dst_start:		destination buffer in which to return the data
 * @pl:			page list in which @dst_start resides (or NULL)
 * @ioflags:		flags further describing the read (see ntfs_vnop_read())
 *
 * Read compressed data from the raw inode @raw_ni, decompress it, and return
 * the decompressed data in the destination buffer @dst_start of size @count
 * bytes.
 *
 * Both @ofs_start and @count must be multiples of the system page size (this
 * is enforced via panic()).
 *
 * If @pl is not NULL, it is the page list in which @dst_start resides with
 * @dst_start beginning at offset zero in the first page of the page list.
 *
 * When @pl is not NULL, we have to check each page in the page list for being
 * valid and if it is we have to skip decompression of its data so that we do
 * not overwrite and dirty but not yet comitted data and even in the read-only
 * driver we want to skip decompression in this case as there is no point in
 * decompressing data we already have decompressed and have in cache.
 *
 * This function allocates a temporary buffer to hold a compression block and
 * reads each compression block in sequence into it using cluster_read() which
 * gives us read-ahead on the raw inode.  Once it has the data it decompresses
 * it straight into the destination buffer @dst_start and stops when @count
 * bytes have been decompressed (usually this will be the end of the
 * compression block).
 *
 * Return 0 on success and errno on error.
 */
errno_t ntfs_read_compressed(ntfs_inode *ni, ntfs_inode *raw_ni, s64 ofs_start,
		int count, u8 *dst_start, upl_page_info_t *pl, int ioflags)
{
	s64 ofs, init_size, raw_size, size;
	ntfs_volume *vol = ni->vol;
	u8 *dst, *cb;
	uio_t uio;
	int err, io_count, pages_per_cb, cb_size, cur_pg, cur_pg_ofs, last_pg;
	int cb_type, zero_end_ofs, dst_ofs_in_cb;

	ntfs_debug("Entering for compressed file inode 0x%llx, offset 0x%llx, "
			"count 0x%x, ioflags 0x%x.",
			(unsigned long long)ni->mft_no,
			(unsigned long long)ofs_start, count, ioflags);
	ofs = ofs_start;
	dst = dst_start;
	cb = NULL;
	uio = NULL;
	/*
	 * @zero_end_ofs is the offset into @dst_start from which the tail of
	 * the last page needs to be zeroed (zero means no zeroing needed).
	 */
	zero_end_ofs = last_pg = cur_pg_ofs = cur_pg = 0;
	/*
	 * We can only read from regular files and named streams that are
	 * compressed and non-resident.  We should never be called for anything
	 * else.
	 */
	if (!NInoAttr(raw_ni) || raw_ni->type != AT_DATA ||
			!NInoCompressed(raw_ni) || !NInoNonResident(raw_ni) ||
			!NInoRaw(raw_ni) || NInoEncrypted(raw_ni))
		panic("%s(): Called for incorrect inode type.\n", __FUNCTION__);
	if (ofs & PAGE_MASK || count & PAGE_MASK)
		panic("%s(): Called with offset 0x%llx and count 0x%x at "
				"least one of which is not a multiple of the "
				"system page size 0x%x.\n", __FUNCTION__,
				(unsigned long long)ofs, count, PAGE_SIZE);
	cb_size = raw_ni->compression_block_size;
	/*
	 * Doing this means that we can assume that @pages_per_cb <= 1 implies
	 * that @pl is not NULL in the below code.
	 */
	pages_per_cb = 0xffff;
	if (pl) {
		last_pg = count >> PAGE_SHIFT;
		pages_per_cb = cb_size >> PAGE_SHIFT;
	}
	/*
	 * Zero any completely uninitialized pages thus we are guaranteed only
	 * to have a partial uninitialized page which makes the decompression
	 * code simpler.
	 */
	lck_spin_lock(&ni->size_lock);
	init_size = ni->initialized_size;
	raw_size = ni->allocated_size;
	if (ofs > ni->data_size)
		panic("%s(): Called with offset 0x%llx which is beyond the "
				"data size 0x%llx.\n", __FUNCTION__,
				(unsigned long long)ofs,
				(unsigned long long)ni->data_size);
	lck_spin_unlock(&ni->size_lock);
	/* The initialized size rounded up to the next page boundary. */
	size = (init_size + PAGE_MASK) & ~PAGE_MASK_64;
	/* @ofs is page aligned and @count is at least one page in size. */
	if (ofs + count > init_size) {
		int start_count;

		start_count = count;
		/* Do not zero a partial final page yet. */
		count = size - ofs;
		/*
		 * If the beginning of the i/o exceeds the initialized size we
		 * just need to zero the destination and are done.
		 */
		if (ofs > init_size)
			count = 0;
		if (!pl)
			bzero(dst + count, start_count - count);
		else {
			/* Only zero complete, invalid pages. */
			for (cur_pg = count >> PAGE_SHIFT; cur_pg < last_pg;
					cur_pg++) {
				if (!upl_valid_page(pl, cur_pg))
					bzero(dst + (cur_pg << PAGE_SHIFT),
							PAGE_SIZE);
			}
			last_pg = count >> PAGE_SHIFT;
			cur_pg = 0;
		}
		if (!count) {
			ntfs_debug("Done (request in uninitialized region).");
			return 0;
		}
		/*
		 * Remember where the initialized data ends so the partially
		 * uninitialized tail of the last page can be zeroed at the
		 * end once it has been decompressed.
		 */
		if (init_size < size)
			zero_end_ofs = init_size - ofs;
	}
	/* Offset of the start of the i/o within the first compression block. */
	dst_ofs_in_cb = ofs & (cb_size - 1);
do_next_cb:
	/* Truncate the final request if it is partial. */
	io_count = cb_size - dst_ofs_in_cb;
	if (io_count > count)
		io_count = count;
	/*
	 * If a page list is present and a page is larger than or equal to a
	 * compression block and the current page is valid, skip this whole
	 * page.
	 */
	if (pages_per_cb <= 1 && upl_valid_page(pl, cur_pg)) {
		io_count = PAGE_SIZE - cur_pg_ofs;
		cur_pg_ofs = 0;
		cur_pg++;
		goto next_cb;
	}
	/* Determine the type of the current compression block. */
	cb_type = ntfs_get_cb_type(raw_ni, ofs - dst_ofs_in_cb);
	/* A non-negative return value from ntfs_get_cb_type() is an errno. */
	if (cb_type >= 0) {
		err = cb_type;
		goto err;
	}
	/*
	 * If the compression block is sparse, bzero() the destination buffer
	 * (skipping any valid pages) and go to the next compression block.
	 */
	if (cb_type == NTFS_CB_SPARSE) {
		int stop_pg;

		ntfs_debug("Found sparse compression block.");
		/* Only zero invalid pages. */
		if (!pl || pages_per_cb <= 1) {
			bzero(dst, io_count);
			goto pl_next_cb;
		}
		stop_pg = cur_pg + pages_per_cb;
		if (stop_pg > last_pg)
			stop_pg = last_pg;
		for (; cur_pg < stop_pg; cur_pg++) {
			if (!upl_valid_page(pl, cur_pg))
				bzero(dst_start + (cur_pg << PAGE_SHIFT),
						PAGE_SIZE);
		}
		goto next_cb;
	}
	/*
	 * Create a uio or reset the already created one and point it at the
	 * current attribute offset.
	 */
	if (!uio) {
		uio = uio_create(1, ofs, UIO_SYSSPACE32, UIO_READ);
		if (!uio) {
			ntfs_error(vol->mp, "Not enough memory to allocate "
					"uio.");
			err = ENOMEM;
			goto err;
		}
	}
	/*
	 * If the compression block is not compressed use cluster_read() to
	 * read the uncompressed data straight into our destination buffer.
	 */
	if (cb_type == NTFS_CB_UNCOMPRESSED) {
		int stop_pg;

		ntfs_debug("Found uncompressed compression block.");
		 /*
		  * If no page list is present or the only page is invalid use
		  * cluster_read() to read the uncompressed data straight into
		  * our buffer.
		  */
		if (!pl || pages_per_cb <= 1) {
			/*
			 * Add our destination buffer to the uio so we can read
			 * into it using cluster_read().
			 */
			uio_reset(uio, ofs, UIO_SYSSPACE32, UIO_READ);
			err = uio_addiov(uio, CAST_USER_ADDR_T(dst), io_count);
			if (err)
				panic("%s(): uio_addiov() failed.\n",
						__FUNCTION__);
			err = cluster_read(raw_ni->vn, uio, raw_size, ioflags);
			if (err || uio_resid(uio))
				goto cl_err;
			goto pl_next_cb;
		}
		/*
		 * Page list present and multiple pages per compression block.
		 * Iterate over all the pages reading in all invalid page
		 * ranges straight into our buffer.
		 */
		stop_pg = cur_pg + pages_per_cb;
		if (stop_pg > last_pg)
			stop_pg = last_pg;
		for (; cur_pg < stop_pg; cur_pg++) {
			int start_pg, start_ofs;

			/* Skip over valid page ranges. */
			if (upl_valid_page(pl, cur_pg))
				continue;
			/*
			 * We found an invalid page, determine how many
			 * sequential pages are invalid.
			 */
			start_pg = cur_pg;
			while (cur_pg + 1 < stop_pg) {
				if (upl_valid_page(pl, cur_pg + 1))
					break;
				cur_pg++;
			}
			/*
			 * Add our destination buffer to the uio so we can read
			 * into it using cluster_read().
			 */
			start_ofs = start_pg << PAGE_SHIFT;
			uio_reset(uio, ofs_start + start_ofs, UIO_SYSSPACE32,
					UIO_READ);
			err = uio_addiov(uio, CAST_USER_ADDR_T(
					dst_start + start_ofs),
					(cur_pg + 1 - start_pg) << PAGE_SHIFT);
			if (err)
				panic("%s(): uio_addiov() failed.\n",
						__FUNCTION__);
			err = cluster_read(raw_ni->vn, uio, raw_size, ioflags);
			if (err || uio_resid(uio))
				goto cl_err;
		}
		goto next_cb;
	}
	/*
	 * The compression block is compressed.  Read the compressed data into
	 * our temporary buffer, allocating it if we have not done so yet.
	 */
	ntfs_debug("Found compressed compression block.");
	if (!cb) {
		cb = OSMalloc(cb_size, ntfs_malloc_tag);
		if (!cb) {
			ntfs_error(vol->mp, "Not enough memory to allocate "
					"temporary buffer.");
			err = ENOMEM;
			goto err;
		}
	}
	/*
	 * FIXME: As an optimization we could only read the actual allocated
	 * clusters so cluster_read() does not waste time zeroing the sparse
	 * clusters when there are whole pages worth of them.  Probably not
	 * worth the effort as that may mess around with the read-ahead
	 * streaming detection and having read-ahead messed up is likely to
	 * cause a performance hit outweighing the benefit gained from not
	 * doing the zeroing.  Especially so since we would incur overhead in
	 * determining the number of non-sparse clusters.
	 */
	uio_reset(uio, ofs - dst_ofs_in_cb, UIO_SYSSPACE32, UIO_READ);
	err = uio_addiov(uio, CAST_USER_ADDR_T(cb), cb_size);
	if (err)
		panic("%s(): uio_addiov() failed.\n", __FUNCTION__);
	err = cluster_read(raw_ni->vn, uio, raw_size, ioflags);
	if (err || uio_resid(uio))
		goto cl_err;
	/*
	 * We now have the compressed data.  Decompress it into the destination
	 * buffer skipping any valid pages if a page list is present.
	 */
	err = ntfs_decompress(vol, dst, dst_ofs_in_cb, io_count, cb, cb_size,
			pl, cur_pg, pages_per_cb);
	if (err) {
		ntfs_error(vol->mp, "Failed to decompress data (error %d).",
				err);
		goto err;
	}
pl_next_cb:
	/*
	 * Advance the page tracking.  When a compression block is smaller
	 * than a page (@pages_per_cb is zero) track the intra-page offset and
	 * only advance the page index when a page boundary is crossed.
	 */
	if (pl) {
		cur_pg += pages_per_cb;
		if (!pages_per_cb) {
			cur_pg_ofs += io_count;
			if (cur_pg_ofs >= PAGE_SIZE) {
				cur_pg_ofs = 0;
				cur_pg++;
			}
		}
	}
next_cb:
	ofs += io_count;
	dst += io_count;
	count -= io_count;
	/* Subsequent compression blocks are read from their beginning. */
	dst_ofs_in_cb = 0;
	/* Are we done yet? */
	if (count > 0)
		goto do_next_cb;
	/*
	 * Check if the last page is partially outside the initialized size and
	 * if so zero its uninitialized tail.
	 */
	if (zero_end_ofs) {
		count = PAGE_SIZE - (zero_end_ofs & PAGE_MASK);
		if (!pl || !upl_valid_page(pl, zero_end_ofs >> PAGE_SHIFT))
			bzero(dst_start + zero_end_ofs, count);
	}
	if (uio)
		uio_free(uio);
	if (cb)
		OSFree(cb, cb_size, ntfs_malloc_tag);
	ntfs_debug("Done.");
	return 0;
cl_err:
	/* cluster_read() failed outright or performed a short read. */
	if (err)
		ntfs_error(vol->mp, "Failed to read %scompressed compression "
				"block using cluster_read() (error %d).",
				cb_type == NTFS_CB_COMPRESSED ?  "" : "un",
				err);
	else {
		ntfs_error(vol->mp, "Partial read when reading %scompressed "
				"compression block using cluster_read().",
				cb_type == NTFS_CB_COMPRESSED ? "" : "un");
		err = EIO;
	}
err:
	if (uio)
		uio_free(uio);
	if (cb)
		OSFree(cb, cb_size, ntfs_malloc_tag);
	ntfs_error(vol->mp, "Failed (error %d).", err);
	return err;
}
907