1/*
2 * Copyright (c) 2000-2013 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29/*
30 This file contains x86_64 hand optimized implementation of WKdm memory page decompressor.
31
32	void WKdm_decompress (WK_word* src_buf, WK_word* dest_buf, WK_word* scratch, __unused__ unsigned int words);
33
34	input :
35		src_buf : address of input compressed data buffer
36		dest_buf : address of output decompressed buffer
37		scratch : a 16-byte aligned 4k bytes scratch memory provided by the caller
38		words : this argument is not used in the implementation
39
40	output :
41
42		the input buffer is decompressed and the dest_buf is written with decompressed data.
43
44	Am algorithm description of the WKdm compress and bit stream format can be found in the WKdm Compress x86_64 assembly code WKdmCompress.s
45
46	The bit stream (*src_buf) consists of
47		a. 12 bytes header
48		b. 256 bytes for 1024 packed tags
49		c. (varying number of) words for new words not matched to dictionary word.
50		d. (varying number of) 32-bit words for packed 4-bit dict_indices (for class 1 and 3)
51		e. (varying number of) 32-bit words for packed 10-bit low bits (for class 1)
52
53	where the header (of 3 words) specifies the ending boundaries (in 32-bit words) from the start of the bit stream of c,d,e, respectively.
54
55	The decompressor 1st unpacking the bit stream component b/d/e into temorary buffers. Then it sequentially decodes the decompressed word as follows
56
57		for (i=0;i<1024;i++) {
58			tag = *next_tag++
59			switch (tag) {
60				case 0 : *dest_buf++ = 0; break;
61				case 1 : dict_word = dictionary[*dict_index]; dictionary[*dict_index++] = *dest_buf++ = dict_word&0xfffffc00 | *LowBits++; break;
62				case 2 : x = *new_word++; k = (x>>10)&255; k = hashTable[k]; dictionary[k] = *dest_buf++ = x; break;
63				case 3 : *dest_buf++ = dictionary[*dict_index++];  break;
64			}
65
66 	cclee, 11/30/12
67*/
68
69	.text
70
71	.globl _WKdm_decompress_new
72_WKdm_decompress_new:
73
74	// save registers, and allocate stack memory for local variables
75
76	pushq	%rbp
77	movq	%rsp, %rbp
78	pushq	%r12
79	pushq	%r13
80	pushq	%rbx
81
82	subq	$(64+8+16), %rsp
83
84	movq	%rsi, %r12					// dest_buf
85	movq	%rdx, %r13					// scracht_buf
86
87	// PRELOAD_DICTONARY; dictionary starting address : starting address 0(%rsp)
88#if 1
89	movl	$1, 0(%rsp)
90	movl	$1, 4(%rsp)
91	movl	$1, 8(%rsp)
92	movl	$1, 12(%rsp)
93	movl	$1, 16(%rsp)
94	movl	$1, 20(%rsp)
95	movl	$1, 24(%rsp)
96	movl	$1, 28(%rsp)
97	movl	$1, 32(%rsp)
98	movl	$1, 36(%rsp)
99	movl	$1, 40(%rsp)
100	movl	$1, 44(%rsp)
101	movl	$1, 48(%rsp)
102	movl	$1, 52(%rsp)
103	movl	$1, 56(%rsp)
104	movl	$1, 60(%rsp)
105#else
106	mov		$0x100000001, %rax
107	mov		%rax, (%rsp)
108	mov		%rax, 8(%rsp)
109	mov		%rax, 16(%rsp)
110	mov		%rax, 24(%rsp)
111	mov		%rax, 32(%rsp)
112	mov		%rax, 40(%rsp)
113	mov		%rax, 48(%rsp)
114	mov		%rax, 56(%rsp)
115#endif
116
117	// WK_unpack_2bits(TAGS_AREA_START(src_buf), TAGS_AREA_END(src_buf), tempTagsArray);
118
119	leaq	268(%rdi), %r10				// TAGS_AREA_END
120	leaq	12(%rdi), %rax				// TAGS_AREA_START
121	movq	%r13, %rsi					// tempTagsArray
122	cmpq	%rax, %r10					// TAGS_AREA_END vs TAGS_AREA_START
123	jbe		1f							// if TAGS_AREA_END <= TAGS_AREA_START, skip L_WK_unpack_2bits
124	movq	%r13, %rcx					// next_word
125	xorl	%r8d, %r8d					// i = 0
126	mov		$(50529027<<32)+50529027, %r9
127L_WK_unpack_2bits:
128	movl	12(%rdi,%r8, 4), %eax
129	movl	12(%rdi,%r8, 4), %edx
130	shrl	$2, %eax
131	shlq	$32, %rax
132	orq		%rdx, %rax
133	movq	%rax, %rdx
134	shrq	$4, %rax
135	andq	%r9, %rdx
136	andq	%r9, %rax
137	incq	%r8							// i++
138	movq	%rdx, (%rcx)
139	movq	%rax, 8(%rcx)
140	addq	$16, %rcx					// next_tags += 16
141	cmpq	$64, %r8					// i vs 64
142	jne		L_WK_unpack_2bits			// repeat loop until i==64
1431:
144
145
146	// WK_unpack_4bits(QPOS_AREA_START(src_buf), QPOS_AREA_END(src_buf), tempQPosArray);
147
148	mov		4(%rdi), %eax				// WKdm header qpos end
149	leaq	(%rdi,%rax,4), %r9			// QPOS_AREA_END
150	mov		0(%rdi), %eax				// WKdm header qpos start
151	leaq	(%rdi,%rax,4), %r8			// QPOS_AREA_START
152	leaq	1024(%r13), %rbx			// tempQPosArray
153	cmpq	%r8, %r9					// QPOS_AREA_END vs QPOS_AREA_START
154	jbe		1f							// if QPOS_AREA_END <= QPOS_AREA_START, skip L_WK_unpack_4bits
155	leaq	8(%rbx), %rcx				// next_qpos
156
157	mov		$(252645135<<32)+252645135, %r11
158L_WK_unpack_4bits:
159	movl	(%r8), %eax					// w = *next_word
160	movl	%eax, %edx					// w
161	shlq	$28, %rax
162	orq		%rdx, %rax
163	addq	$4, %r8						// next_word++
164	andq	%r11, %rax
165	movq	%rax, -8(%rcx)
166	addq	$8, %rcx					// next_qpos+=8
167	cmpq	%r8, %r9					// QPOS_AREA_END vs QPOS_AREA_START
168	ja		L_WK_unpack_4bits			// repeat loop until QPOS_AREA_END <= QPOS_AREA_START
169
170
1711:
172
173	// WK_unpack_3_tenbits(LOW_BITS_AREA_START(src_buf), LOW_BITS_AREA_END(src_buf), tempLowBitsArray);
174
175	movl	8(%rdi), %eax				// LOW_BITS_AREA_END offset
176	leaq	(%rdi,%rax,4), %rdi			// LOW_BITS_AREA_END
177	leaq	2048(%r13), %r11			// tempLowBitsArray
178	leaq	4094(%r13), %r13			// final tenbits addr
179	sub		%r9, %rdi					// LOW_BITS_AREA_START vs LOW_BITS_AREA_END
180	jle		1f							// if START>=END, skip L_WK_unpack_3_tenbits
181	movq	%r11, %rcx					// next_low_bits
182L_WK_unpack_3_tenbits:
183	movl	(%r9), %eax					// w = *next_word, 0:c:b:a
184	movl	$(1023<<10), %edx
185	movl	$(1023<<20), %r8d
186	andl	%eax, %edx					// b << 10
187	andl	%eax, %r8d					// c << 20
188	andq	$1023, %rax
189	shll	$6, %edx
190	shlq	$12, %r8
191	orl		%edx, %eax
192	orq		%r8, %rax
193	cmp		%r13, %rcx
194	je		2f
195	mov		%rax, (%rcx)
196	jmp		3f
1972:	mov		%ax, (%rcx)
1983:
199	addq	$4, %r9						// next_word++
200	addq	$6, %rcx					// next_low_bits += 3
201	sub		$4, %rdi
202	jg		L_WK_unpack_3_tenbits		// repeat loop if LOW_BITS_AREA_END > next_word
2031:
204
205
206	#define	next_qpos		%rbx
207	#define	hash			%r8
208	#define	tags_counter	%edi
209	#define	dest_buf		%r12
210	#define next_full_patt	%r10
211
212	leaq	_hashLookupTable_new(%rip), hash	// hash look up table
213	movl	$1024, tags_counter				// tags_counter
214	jmp		L_next
215
216	.align 4,0x90
217L_nonpartital:
218	jl		L_ZERO_TAG
219	cmpb	$2, -1(%rsi)
220	je		L_MISS_TAG
221
222L_EXACT_TAG:
223	movzbl	(next_qpos), %eax				// qpos = *next_qpos
224	incq	next_qpos						// next_qpos++
225	decl	tags_counter					// tags_counter--
226	movl	(%rsp,%rax,4), %eax				// w = dictionary[qpos]
227	movl	%eax, -4(dest_buf)				// *dest_buf = w
228	je		L_done
229
230L_next:
231	incq	%rsi							// next_tag++
232	addq	$4, dest_buf
233	cmpb	$1, -1(%rsi)
234	jne		L_nonpartital
235
236L_PARTIAL_TAG:
237	movzbl	(next_qpos),%edx				// qpos = *next_qpos
238	incq	next_qpos						// next_qpos++
239	movl	(%rsp,%rdx,4), %eax				// read dictionary word
240	andl	$-1024, %eax					// clear lower 10 bits
241	or		(%r11), %ax						// pad the lower 10-bits from *next_low_bits
242	addq	$2, %r11						// next_low_bits++
243	decl	tags_counter					// tags_counter--
244	movl	%eax, (%rsp,%rdx,4)				// *dict_location = newly formed word
245	movl	%eax, -4(dest_buf)				// *dest_buf = newly formed word
246	jg		L_next							// repeat loop until next_tag==tag_area_end
247
248L_done:
249
250	// release stack memory, restore registers, and return
251
252	addq	$(64+8+16), %rsp
253	popq	%rbx
254	popq	%r13
255	popq	%r12
256	leave
257	ret
258
259	.align 4,0x90
260L_MISS_TAG:
261	movl	(next_full_patt), %edx			// w = *next_full_patt
262	movl	(next_full_patt), %eax			// w = *next_full_patt
263	shrl	$10, %edx						// w>>10
264	addq	$4, next_full_patt				// next_full_patt++
265	movzbl	%dl, %edx						// 8-bit hash table index
266	movl	%eax, -4(dest_buf)				// *dest_buf = word
267	movzbl	(hash,%rdx),%edx				// qpos
268	decl	tags_counter					// tags_counter--
269	movl	%eax, (%rsp,%rdx)				// dictionary[qpos] = word
270	jg		L_next							// repeat the loop
271	jmp		L_done
272
273	.align 4,0x90
274L_ZERO_TAG:
275	decl	tags_counter					// tags_counter--
276	movl	$0, -4(dest_buf)					// *dest_buf = 0
277	jg		L_next							// repeat the loop
278	jmp		L_done
279
280
281