; inffasx64.asm is a hand-tuned assembler version of inffast.c - fast decoding
; version for AMD64 on Windows using the Microsoft C compiler
;
; inffasx64.asm was automatically converted from the AMD64 portion of inffas86.c
; inffasx64.asm is called by inffas8664.c, which contains more info.


; To compile this file, I use the options
;   ml64.exe /Flinffasx64 /c /Zi inffasx64.asm
;   with Microsoft Macro Assembler (x64) for AMD64
;
;   ml64.exe ships with Visual Studio 2005 and the Windows 2003 Server DDK
;
;   (you can get the Windows 2003 Server DDK with ml64 and cl.exe for AMD64 from
;      http://www.microsoft.com/whdc/devtools/ddk/default.mspx for a low price)
;

.code

; ---------------------------------------------------------------------------
; Byte offsets into the parameter block ("ar") whose address is passed in rcx.
; The values are exactly the displacements this routine has always used; the
; field names follow the comments on the loads/stores below.  The layout is
; owned by the C caller, so these must stay in sync with it.
; ---------------------------------------------------------------------------
AR_SAVED_RSP	EQU	0	; qword: caller's rsp, stashed by this routine
AR_SAVED_RBP	EQU	8	; qword: caller's rbp, stashed by this routine
AR_IN		EQU	16	; qword: in     (input pointer, read and written back)
AR_LAST		EQU	24	; qword: last   (loop runs while in < last)
AR_OUT		EQU	32	; qword: out    (output pointer, read and written back)
AR_BEG		EQU	40	; qword: beg    (start of output; out - beg = bytes available)
AR_END		EQU	48	; qword: end    (loop runs while out < end)
AR_WINDOW	EQU	56	; qword: window (base of the sliding window buffer)
AR_LCODE	EQU	64	; qword: lcode  (table of 4-byte literal/length entries)
AR_DCODE	EQU	72	; qword: dcode  (table of 4-byte distance entries)
AR_HOLD		EQU	80	; qword: hold   (bit accumulator, read and written back)
AR_BITS		EQU	88	; dword: bits   (valid bits in hold, read and written back)
AR_WSIZE	EQU	92	; dword: wsize  (window size)
AR_WRITE	EQU	96	; dword: write  (window write index)
AR_LMASK	EQU	100	; dword: lmask  (index mask for lcode)
AR_DMASK	EQU	104	; dword: dmask  (index mask for dcode)
AR_STATUS	EQU	116	; dword: status (written by this routine, see below)

; ---------------------------------------------------------------------------
; inffas8664fnc
;
; Fast inflate decoding loop (hand-written counterpart of inffast.c).
;
; ABI:   Microsoft x64.
; In:    rcx = address of the parameter block described by the AR_* offsets.
; Out:   nothing meaningful in a register.  Results are stored back into the
;        parameter block: in, out, bits, hold and status.
;        status values written at AR_STATUS:
;          0 = loop ended because out >= end or in >= last
;          1 = end-of-block code seen
;          2 = invalid literal/length code
;          3 = invalid distance code
;          4 = distance larger than the window size ("too far back")
; Clobb: rax, rcx, rdx, r8, r9, r10, r11, flags.  rsi, rdi, rbx, rbp and
;        r12-r15 are used internally and restored before returning.
;
; Entry layout of one table entry as used here (eax after the table load):
;   al = op, ah = bits, upper 16 bits = val.
;
; Register roles while the loop runs:
;   rsp  = address of the parameter block (NOT a stack pointer in here)
;   rsi  = in          rdi  = out         r9   = last       r10 = end
;   rbp  = lcode       r11  = dcode       rdx  = hold       bl  = bits
;   r12d = lmask       r13d = dmask       r14d = len        r15d = dist
;   r8   = scratch (masked table index, or saved "in" during a copy)
;   rax, rcx = scratch
;
; NOTE(review): the non-volatile registers are saved at negative offsets from
; rsp, and rsp itself is then repointed at the parameter block so that all
; sixteen general registers are usable.  The Microsoft x64 convention has no
; red zone, so nothing guarantees that memory below rsp stays intact if
; anything uses the stack asynchronously while this routine runs.  The routine
; makes no calls and uses no push/pop, which is what the trick relies on.
; Confirm this is acceptable for the target environment before reusing it.
;
; see http://weblogs.asp.net/oldnewthing/archive/2004/01/14/58579.aspx and
; http://msdn.microsoft.com/library/en-us/kmarch/hh/kmarch/64bitAMD_8e951dd2-ee77-4728-8702-55ce4b5dd24a.xml.asp
;
; All registers must be preserved across the call, except for
;   rax, rcx, rdx, r8, r9, r10, and r11, which are scratch.
; ---------------------------------------------------------------------------
inffas8664fnc PROC

	; Save the non-volatile registers this routine uses (see NOTE above).
	mov	[rsp-8], rsi
	mov	[rsp-16], rdi
	mov	[rsp-24], r12
	mov	[rsp-32], r13
	mov	[rsp-40], r14
	mov	[rsp-48], r15
	mov	[rsp-56], rbx

	mov	rax, rcx			; rax = &ar (first argument)

	mov	[rax+AR_SAVED_RBP], rbp		; save rbp and rsp inside ar
	mov	[rax+AR_SAVED_RSP], rsp

	mov	rsp, rax			; rsp now addresses ar

	; Load the decoder state into registers.
	mov	rsi, [rsp+AR_IN]		; rsi  = in
	mov	rdi, [rsp+AR_OUT]		; rdi  = out
	mov	r9, [rsp+AR_LAST]		; r9   = last
	mov	r10, [rsp+AR_END]		; r10  = end
	mov	rbp, [rsp+AR_LCODE]		; rbp  = lcode
	mov	r11, [rsp+AR_DCODE]		; r11  = dcode
	mov	rdx, [rsp+AR_HOLD]		; rdx  = hold
	mov	ebx, [rsp+AR_BITS]		; ebx  = bits
	mov	r12d, [rsp+AR_LMASK]		; r12d = lmask (upper half of r12 zeroed)
	mov	r13d, [rsp+AR_DMASK]		; r13d = dmask (upper half of r13 zeroed)
						; r14d = len  (set while decoding)
						; r15d = dist (set while decoding)

	cld					; string instructions must move forward
	cmp	r10, rdi
	je	L_one_time			; out == end: only one decode left
	cmp	r9, rsi
	jne	L_do_loop			; in != last: enter the normal loop
						; in == last: fall into the one-time path

; One-time entry: refill hold if needed, then join the loop at its second
; decode slot, which returns to L_while_test after a literal.
L_one_time:
	mov	r8, r12				; r8 = lmask
	cmp	bl, 32
	ja	L_get_length_code_one_time	; more than 32 bits held: no refill

	lodsd					; eax = *(uint *)in, in += 4 (rax upper half zeroed)
	mov	cl, bl				; cl = bits, needed as the shift count
	add	bl, 32				; bits += 32
	shl	rax, cl
	or	rdx, rax			; hold |= new 32 bits << old bits
	jmp	L_get_length_code_one_time

ALIGN 4
L_while_test:
	cmp	r10, rdi
	jbe	L_break_loop			; stop when end <= out
	cmp	r9, rsi
	jbe	L_break_loop			; stop when last <= in

L_do_loop:
	mov	r8, r12				; r8 = lmask
	cmp	bl, 32
	ja	L_get_length_code		; if (32 < bits) no refill needed

	lodsd					; eax = *(uint *)in, in += 4
	mov	cl, bl				; cl = bits, needed as the shift count
	add	bl, 32				; bits += 32
	shl	rax, cl
	or	rdx, rax			; hold |= new 32 bits << old bits

; First decode slot: a literal here falls through to a second decode without
; re-testing the loop bounds or refilling hold.
L_get_length_code:
	and	r8, rdx				; r8 = hold & lmask
	mov	eax, [rbp+r8*4]			; eax = lcode[hold & lmask]

	mov	cl, ah				; cl = this.bits
	sub	bl, ah				; bits -= this.bits
	shr	rdx, cl				; hold >>= this.bits

	test	al, al
	jnz	L_test_for_length_base		; op != 0: not a literal (original note: 45.7%)

	mov	r8, r12				; r8 = lmask, ready for the second decode
	shr	eax, 16				; al = low byte of this.val
	stosb					; *out++ = literal

; Second decode slot (also the target of the one-time entry).
L_get_length_code_one_time:
	and	r8, rdx				; r8 = hold & lmask
	mov	eax, [rbp+r8*4]			; eax = lcode[hold & lmask]

; Consume the entry in eax; also re-entered after a second-level lookup.
L_dolen:
	mov	cl, ah				; cl = this.bits
	sub	bl, ah				; bits -= this.bits
	shr	rdx, cl				; hold >>= this.bits

	test	al, al
	jnz	L_test_for_length_base		; op != 0: not a literal (original note: 45.7%)

	shr	eax, 16				; al = low byte of this.val
	stosb					; *out++ = literal
	jmp	L_while_test

ALIGN 4
L_test_for_length_base:
	mov	r14d, eax			; len = this
	shr	r14d, 16			; len = this.val
	mov	cl, al				; cl = op

	test	al, 16
	jz	L_test_for_second_level_length	; (op & 16) == 0 (original note: 8%)
	and	cl, 15				; op &= 15 -> number of extra bits
	jz	L_decode_distance		; no extra bits for this length

L_add_bits_to_len:
	sub	bl, cl				; bits -= op
	xor	eax, eax
	inc	eax
	shl	eax, cl
	dec	eax				; eax = (1 << op) - 1
	and	eax, edx			; eax = hold & mask
	shr	rdx, cl				; hold >>= op
	add	r14d, eax			; len += extra bits

L_decode_distance:
	mov	r8, r13				; r8 = dmask
	cmp	bl, 32
	ja	L_get_distance_code		; if (32 < bits) no refill needed

	lodsd					; eax = *(uint *)in, in += 4
	mov	cl, bl				; cl = bits, needed as the shift count
	add	bl, 32				; bits += 32
	shl	rax, cl
	or	rdx, rax			; hold |= new 32 bits << old bits

L_get_distance_code:
	and	r8, rdx				; r8 = hold & dmask
	mov	eax, [r11+r8*4]			; eax = dcode[hold & dmask]

; Consume the distance entry in eax; re-entered after a second-level lookup.
L_dodist:
	mov	r15d, eax			; dist = this
	shr	r15d, 16			; dist = this.val
	mov	cl, ah				; cl = this.bits
	sub	bl, ah				; bits -= this.bits
	shr	rdx, cl				; hold >>= this.bits
	mov	cl, al				; cl = this.op

	test	al, 16
	jz	L_test_for_second_level_dist	; (op & 16) == 0
	and	cl, 15				; op &= 15 -> number of extra bits
	jz	L_check_dist_one		; no extra bits: dist may be exactly 1

L_add_bits_to_dist:
	sub	bl, cl				; bits -= op
	xor	eax, eax
	inc	eax
	shl	eax, cl
	dec	eax				; eax = (1 << op) - 1
	and	eax, edx			; eax = hold & mask
	shr	rdx, cl				; hold >>= op
	add	r15d, eax			; dist += extra bits

; Copy len bytes from dist bytes back.  rsi is borrowed as the copy source,
; so "in" is parked in r8 until the copy is done.
L_check_window:
	mov	r8, rsi				; save in
	mov	rax, rdi
	sub	rax, [rsp+AR_BEG]		; nbytes = out - beg

	cmp	eax, r15d
	jb	L_clip_window			; dist > nbytes: source starts in the window (original note: 4.2%)

	mov	ecx, r14d			; ecx = len
	mov	rsi, rdi
	sub	rsi, r15			; from = out - dist

	sar	ecx, 1				; ecx = len / 2, CF = len & 1
	jnc	L_copy_two			; even length: words only

	rep	movsw				; copy len / 2 words
	mov	al, [rsi]			; then the one remaining byte
	mov	[rdi], al
	inc	rdi

	mov	rsi, r8				; restore in
	jmp	L_while_test

L_copy_two:
	rep	movsw				; copy len / 2 words
	mov	rsi, r8				; restore in
	jmp	L_while_test

ALIGN 4
L_check_dist_one:
	cmp	r15d, 1				; dist == 1 is a run of one byte (memset)
	jne	L_check_window
	cmp	[rsp+AR_BEG], rdi		; out == beg: previous byte is not in the output
	je	L_check_window

	mov	ecx, r14d			; ecx = len
	mov	al, [rdi-1]			; al = previous output byte
	mov	ah, al				; ax = that byte twice, for stosw

	sar	ecx, 1				; ecx = len / 2, CF = len & 1
	jnc	L_set_two
	mov	[rdi], al			; odd length: store one byte first
	inc	rdi

L_set_two:
	rep	stosw				; store len / 2 words
	jmp	L_while_test

ALIGN 4
L_test_for_second_level_length:
	test	al, 64
	jnz	L_test_for_end_of_block		; (op & 64) != 0

	; Second-level table: index = val + (hold & ((1 << op) - 1)); cl still holds op.
	xor	eax, eax
	inc	eax
	shl	eax, cl
	dec	eax				; eax = (1 << op) - 1
	and	eax, edx			; eax = hold & mask
	add	eax, r14d			; eax += len (this.val)
	mov	eax, [rbp+rax*4]		; eax = lcode[val + (hold & mask)]
	jmp	L_dolen

ALIGN 4
L_test_for_second_level_dist:
	test	al, 64
	jnz	L_invalid_distance_code		; (op & 64) != 0

	; Second-level table: index = val + (hold & ((1 << op) - 1)); cl still holds op.
	xor	eax, eax
	inc	eax
	shl	eax, cl
	dec	eax				; eax = (1 << op) - 1
	and	eax, edx			; eax = hold & mask
	add	eax, r15d			; eax += dist (this.val)
	mov	eax, [r11+rax*4]		; eax = dcode[val + (hold & mask)]
	jmp	L_dodist

; The match starts before beg, i.e. inside the sliding window.
ALIGN 4
L_clip_window:
	mov	ecx, eax			; ecx = nbytes (out - beg)
	mov	eax, [rsp+AR_WSIZE]		; eax = wsize, ready for the dist compare
	neg	ecx				; ecx = -nbytes

	cmp	eax, r15d
	jb	L_invalid_distance_too_far	; dist > wsize

	add	ecx, r15d			; nbytes = dist - nbytes (bytes to take from window)
	cmp	dword ptr [rsp+AR_WRITE], 0
	jne	L_wrap_around_window		; write != 0

	; write == 0: the wanted bytes are the last nbytes of the window.
	mov	rsi, [rsp+AR_WINDOW]		; from = window
	sub	eax, ecx			; eax = wsize - nbytes
	add	rsi, rax			; from += wsize - nbytes

	mov	eax, r14d			; eax = len
	cmp	r14d, ecx
	jbe	L_do_copy			; len <= nbytes: all of it comes from the window

	sub	eax, ecx			; len -= nbytes
	rep	movsb				; copy nbytes from the window
	mov	rsi, rdi
	sub	rsi, r15			; from = out - dist (rest comes from output)
	jmp	L_do_copy

ALIGN 4
L_wrap_around_window:
	mov	eax, [rsp+AR_WRITE]		; eax = write
	cmp	ecx, eax
	jbe	L_contiguous_in_window		; nbytes <= write: no wrap needed

	; Source wraps: it starts near the end of the window and continues at its start.
	mov	esi, [rsp+AR_WSIZE]		; from  = wsize
	add	rsi, [rsp+AR_WINDOW]		; from += window
	add	rsi, rax			; from += write
	sub	rsi, rcx			; from -= nbytes
	sub	ecx, eax			; nbytes -= write (bytes before the wrap)

	mov	eax, r14d			; eax = len
	cmp	eax, ecx
	jbe	L_do_copy			; len <= nbytes: no wrap reached

	sub	eax, ecx			; len -= nbytes
	rep	movsb				; copy up to the end of the window
	mov	rsi, [rsp+AR_WINDOW]		; from = window (wrapped to the start)
	mov	ecx, [rsp+AR_WRITE]		; nbytes = write
	cmp	eax, ecx
	jbe	L_do_copy			; len <= nbytes: rest fits in the window

	sub	eax, ecx			; len -= nbytes
	rep	movsb				; copy the wrapped part of the window
	mov	rsi, rdi
	sub	rsi, r15			; from = out - dist (rest comes from output)
	jmp	L_do_copy

ALIGN 4
L_contiguous_in_window:
	mov	rsi, [rsp+AR_WINDOW]		; rsi = window
	add	rsi, rax
	sub	rsi, rcx			; from = window + write - nbytes

	mov	eax, r14d			; eax = len
	cmp	eax, ecx
	jbe	L_do_copy			; len <= nbytes: all of it comes from the window

	sub	eax, ecx			; len -= nbytes
	rep	movsb				; copy nbytes from the window
	mov	rsi, rdi
	sub	rsi, r15			; from = out - dist (rest comes from output)
	jmp	L_do_copy			; unconditional: skips the alignment padding

ALIGN 4
L_do_copy:
	mov	ecx, eax			; ecx = remaining len
	rep	movsb

	mov	rsi, r8				; restore in (rsi was the copy source)
	jmp	L_while_test

L_test_for_end_of_block:
	test	al, 32
	jz	L_invalid_literal_length_code
	mov	dword ptr [rsp+AR_STATUS], 1	; status 1: end of block
	jmp	L_break_loop_with_status

L_invalid_literal_length_code:
	mov	dword ptr [rsp+AR_STATUS], 2	; status 2: invalid literal/length code
	jmp	L_break_loop_with_status

L_invalid_distance_code:
	mov	dword ptr [rsp+AR_STATUS], 3	; status 3: invalid distance code
	jmp	L_break_loop_with_status

L_invalid_distance_too_far:
	mov	dword ptr [rsp+AR_STATUS], 4	; status 4: distance exceeds wsize
	jmp	L_break_loop_with_status

L_break_loop:
	mov	dword ptr [rsp+AR_STATUS], 0	; status 0: loop bounds reached

L_break_loop_with_status:
	; Write in, out, bits and hold back into ar, then restore rbp and rsp.
	mov	[rsp+AR_IN], rsi		; in
	mov	[rsp+AR_OUT], rdi		; out
	mov	[rsp+AR_BITS], ebx		; bits
	mov	[rsp+AR_HOLD], rdx		; hold

	mov	rax, [rsp+AR_SAVED_RSP]		; rax = caller's rsp
	mov	rbp, [rsp+AR_SAVED_RBP]
	mov	rsp, rax			; rsp is a real stack pointer again

	; Reload the non-volatile registers saved on entry.
	mov	rsi, [rsp-8]
	mov	rdi, [rsp-16]
	mov	r12, [rsp-24]
	mov	r13, [rsp-32]
	mov	r14, [rsp-40]
	mov	r15, [rsp-48]
	mov	rbx, [rsp-56]

	ret	0

; Leftover operand/clobber list from the GCC inline-asm version this file was
; converted from; kept for reference only:
;          :
;          : "m" (ar)
;          : "memory", "%rax", "%rbx", "%rcx", "%rdx", "%rsi", "%rdi",
;            "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15"
;    );

inffas8664fnc 	ENDP
;_TEXT	ENDS
END
393