1#if (defined __i386__)
2
3/* This assembly was first compiled from inffast.c (assuming POSTINC defined, OFF=0) and then hand-optimized. */
4
5	.cstring
	// Error message strings, each written with an explicit terminating NUL.
	// The error paths of _inflate_fast store the address of one of them at 24(strm).
6LC0:									// address taken in Linvalid_distance_too_far_back
7	.ascii "invalid distance too far back\0"
8LC1:									// address taken in Linvalid_distance_code
9	.ascii "invalid distance code\0"
10LC2:									// address taken in invalid_literal_length_code
11	.ascii "invalid literal/length code\0"
12	.text
13	.align 4,0x90						// align the code that follows, filling with 0x90 bytes
14
15
16#ifdef  INFLATE_STRICT
	// INFLATE_STRICT builds emit ten zero bytes ahead of the entry label.
	// NOTE(review): presumably padding that compensates for the extra dmax instructions so
	// that later labels keep their placement -- confirm with the original author.
17	.byte 0
18	.byte 0
19	.byte 0
20	.byte 0
21	.byte 0
22	.byte 0
23	.byte 0
24	.byte 0
25	.byte 0
26	.byte 0
27#endif
28.globl _inflate_fast
29_inflate_fast:
	/*
	 * i386 fast decode loop. Per the note at the top of this section it was compiled
	 * from inffast.c and then hand optimized.
	 *
	 * Stack arguments : 8(%ebp) = strm, 12(%ebp) = start.
	 * Register roles  : %ebx = strm and %edi = bits for the whole routine;
	 *                   %eax, %ecx, %edx and %esi are scratch.
	 * Saved/restored  : %ebp, %edi, %esi, %ebx.
	 * Written back on exit (see return_unused_bytes): strm->next_in, strm->avail_in,
	 * strm->next_out, strm->avail_out, state->hold and state->bits.
	 * Every other working value lives in the stack slots named by the macros below.
	 */
30
31	// set up ebp to refer to arguments strm and start
32	pushl	%ebp
33	movl	%esp, %ebp
34
35	// push edi/esi/ebx into stack
	// (they land at -4(%ebp), -8(%ebp) and -12(%ebp); the locals below start at -16(%ebp))
36	pushl	%edi
37	pushl	%esi
38	pushl	%ebx
39
40	// allocate for local variables 92-12=80, + 12 to align %esp to 16-byte boundary
41	subl	$92, %esp
42	movl	8(%ebp), %ebx				// strm (1st argument) stays in %ebx
43
44	/* definitions to help code readability */
	/* bits and strm are registers; every other name is a 4-byte slot addressed off %ebp */
45
46	#define	bits	%edi
47	#define	strm	%ebx
48	#define	state	28(strm)		// state = (struct inflate_state FAR *)strm->state;
49	#define	in		-84(%ebp)		// in = strm->next_in - OFF; OFF=0
50	#define	last	-80(%ebp)		// last = in + (strm->avail_in - 5);
51	#define	out		-28(%ebp)		// out = strm->next_out - OFF;
52	#define	beg		-76(%ebp)		// beg = out - (start - strm->avail_out);
53	#define	end		-72(%ebp)		// end = out + (strm->avail_out - 257);
54	#define	wsize	-68(%ebp)		// wsize = state->wsize;
55	#define whave	-64(%ebp)		// whave = state->whave;
56	#define write	-60(%ebp)		// write = state->write;
57	#define window	-56(%ebp)		// window = state->window;
58	#define	hold	-52(%ebp)		// hold = state->hold;
59	#define	lcode	-48(%ebp)		// lcode = state->lencode;
60	#define	dcode	-44(%ebp)		// dcode = state->distcode;
61	#define	lmask	-40(%ebp)		// lmask = (1U << state->lenbits) - 1;
62	#define	dmask	-36(%ebp)		// dmask = (1U << state->distbits) - 1;
63	#define	len		-32(%ebp)		// match length still to be copied
64	#define dmax	-20(%ebp)		// state->dmax; loaded only when INFLATE_STRICT is defined
65	#define	dist	-16(%ebp)		// dist
66	#define	write_wsize	-24(%ebp)	// write+wsize
67	#define	write_1		-88(%ebp)	// write-1 (stored during setup, never read back in the i386 code)
68	#define	op		-92(%ebp)		// op
69
70	movl	(strm), %eax			// strm->next_in
71	movl	%eax, in				// in = strm->next_in - OFF; OFF=0
72
73	subl	$5, %eax				// in - 5;
74	movl	4(strm), %ecx			// strm->avail_in
75	addl	%ecx, %eax				// in + (strm->avail_in - 5);
76	movl	%eax, last				// last = in + (strm->avail_in - 5);
77
78	movl	12(strm), %esi			// strm->next_out
79	movl	%esi, out				// out = strm->next_out - OFF;
80
81	movl	16(strm), %ecx			// strm->avail_out
82	movl	%esi, %eax				// out
83	subl	12(%ebp), %eax			// out - start
84	addl	%ecx, %eax				// out - (start - strm->avail_out);
85	movl	%eax, beg				// beg = out - (start - strm->avail_out);
86
87	leal	-257(%esi,%ecx), %ecx	// out + (strm->avail_out - 257);
88	movl	%ecx, end				// end = out + (strm->avail_out - 257);
89
90	movl	state, %edx				// %edx = state pointer for the field loads below
91
92#ifdef	INFLATE_STRICT
93	movl	20(%edx), %ecx			// state->dmax
94	movl	%ecx, dmax				// dmax = state->dmax;
95#endif
96
97	movl	40(%edx), %ecx			// state->wsize
98	movl	%ecx, wsize				// wsize = state->wsize;
99
100	movl	44(%edx), %ecx			// state->whave
101	movl	%ecx, whave				// whave = state->whave;
102
103	movl	48(%edx), %esi			// state->write
104	movl	%esi, write				// write = state->write;
105
106	movl	52(%edx), %eax			// state->window
107	movl	%eax, window			// window = state->window;
108
109
110	movl	56(%edx), %ecx			// state->hold
111	movl	%ecx, hold				// hold = state->hold
112
113	movl	60(%edx), bits			// bits = state->bits;
114
115	movl	76(%edx), %esi			// state->lencode
116	movl	%esi, lcode				// lcode = state->lencode;
117
118	movl	80(%edx), %eax			// state->distcode
119	movl	%eax, dcode				// dcode = state->distcode;
120
121	movl	84(%edx), %ecx			// state->lenbits
122	movl	$1, %eax
123	movl	%eax, %esi				// a copy of 1
124	sall	%cl, %esi				// 1 << state->lenbits
125	decl	%esi					// (1U << state->lenbits) - 1;
126	movl	%esi, lmask				// lmask = (1U << state->lenbits) - 1;
127
128	movl	88(%edx), %ecx			// state->distbits
129	sall	%cl, %eax				// 1 << state->distbits
130	decl	%eax					// (1U << state->distbits) - 1;
131	movl	%eax, dmask				// dmask = (1U << state->distbits) - 1;
132
133
134	// these 2 might be used often, precomputed and saved in stack
	// (of the two, only write_wsize is read later, in Lwrap_around_window)
135	movl	write, %eax
136	addl	wsize, %eax
137	movl	%eax, write_wsize		// write+wsize
138
139	movl	write, %edx
140	decl	%edx
141	movl	%edx, write_1			// write-1
142
143
144L_do_while_loop:						// do {
	// Top of the main decode loop: when fewer than 15 bits are buffered, two more input
	// bytes are added to hold; then the length/literal table is indexed with hold & lmask.
145
146	cmpl	$15, bits
147	jae		bits_ge_15					//		if (bits < 15) {
148#if 0
	/* disabled variant: refill using two separate byte loads */
149	leal	8(bits), %esi				// esi = bits+8
150	movl	in, %eax					// eax = in
151	movzbl	(%eax), %edx				// edx = *in++
152	movl	bits, %ecx					// cl = bits
153	sall	%cl, %edx					// 1st *in << bits
154	addl	hold, %edx					// hold += 1st *in << bits
155	movzbl	1(%eax), %eax				// 2nd *in
156	movl	%esi, %ecx					// cl = bits+8
157	sall	%cl, %eax					// 2nd *in << (bits+8)
158	addl	%eax, %edx					// hold += 2nd *in << (bits+8)
159	movl	%edx, hold					// update hold
160	addl	$2, in						// in += 2
161	addl	$16, bits					// bits += 16;
162#else
163	/* from simulation, this code segment performs better than the other case
164		possibly, we are more often hit with aligned memory access */
	/* active variant: refill using a single 16-bit load */
165	movl	in, %ecx					//			unsigned short *inp = (unsigned short *) (in+OFF);
166	movzwl	(%ecx), %eax				// 			*((unsigned short *) in);
167	movl	bits, %ecx					//			bits
168	sall	%cl, %eax					// 			*((unsigned short *) in) << bits
169	addl	%eax, hold					// 			hold += (unsigned long) *((unsigned short *) in) << bits;
170	addl	$2, in						// 			in += 2;
171	addl	$16, bits					// 			bits += 16;
172#endif
173
174bits_ge_15:								// 		}	/* bits < 15 */
175
176	movl	hold, %eax					// 		hold
177	andl	lmask, %eax					// 		hold & lmask;
178	movl	lcode, %esi					// 		lcode[] : 4-byte aligned
179	movl	(%esi,%eax,4), %eax			// 		this = lcode[hold&lmask];
180	jmp		dolen
181	.align 4,0x90
	// Reached from dolen when this.op is non-zero: al = this.op, esi = this.val.
182op_nonzero:
183	movzbl	%al, %ecx					// a copy of op to cl
184	testb	$16, %cl					// if op&16
185	jne		Llength_base				// 		branch to length_base
186
187	testb	$64, %cl					// elif op&64
188	jne		length_2nd_level_else		//		branch to 2nd level length code else conditions
189
190	// 2nd level length code
	// (neither bit 16 nor bit 64 is set, so cl holds the index width for the second lookup)
191
192	movl	$1, %eax
193	sall	%cl, %eax					// 1 << op
194	decl	%eax						// ((1<<op) - 1)
195	andl	hold, %eax					// hold & ((1U << op) - 1)
196	movzwl	%si, %ecx					// this.val
197	addl	%ecx, %eax					// this.val + (hold & ((1U << op) - 1))
198
199	movl	lcode, %ecx					// lcode[] : 4-byte aligned
200	movl	(%ecx,%eax,4), %eax			// this = lcode[this.val + (hold & ((1U << op) - 1))];
201										// goto dolen (compiler rearranged the order of code)
	// dolen: eax holds a 4-byte table entry; al = op, ah = bits, upper 16 bits = val.
	// The bits consumed by the entry are dropped from hold before op is examined.
202dolen:
203	movl	%eax, %esi					// make a copy of this (val 16-bit, bits 8-bit, op 8-bit)
204	shrl	$16, %esi					// %esi = this.val;
205	movzbl	%ah, %ecx					// op = (unsigned)(this.bits);
206	shrl	%cl, hold					// hold >>= op;
207	subl	%ecx, bits					// bits -= op;
208	testb	%al, %al					// op = (unsigned)(this.op);
209	jne		op_nonzero					// if op!=0, branch to op_nonzero
210
	// op == 0: the entry is a literal; its low byte is written to the output
211	movl	%esi, %ecx					// this.val;
212	movl	out, %eax					// out
213	movb	%cl, (%eax)					// PUP(out) = (unsigned char)(this.val);
214	incl	%eax						// out++;
215	movl	%eax, out					// save out
216
217L_tst_do_while_loop_end:
	// Loop condition: keep decoding while in < last and out < end (unsigned compares).
218	movl	last, %eax					// last
219	cmpl	%eax, in					// in vs last
220	jae		return_unused_bytes 		// branch to return_unused_bytes if in >= last
221	movl	end, %edx					// end
222	cmpl	%edx, out					// out vs end
223	jb		L_do_while_loop				// branch to do loop if out < end
224
	// Exit path, also reached from the end-of-block and error paths.
	// Whole bytes still buffered in hold are handed back to the input, then the stream
	// and state fields are written back and the frame is torn down.
225return_unused_bytes:
226
227	movl	bits, %eax					// bits
228	shrl	$3, %eax					// len = bits >> 3
229	movl	in, %edx					// in
230	subl	%eax, %edx					// in -= len
231	sall	$3, %eax					// len << 3
232	movl	bits, %ecx					// bits
233	subl	%eax, %ecx					// bits -= len << 3
	// from here on: %edx = adjusted in, %ecx = adjusted bits (cl is the shift count used below)
234
235	movl	%edx, (strm)				// strm->next_in = in + OFF;
236	movl	out, %eax
237	movl	%eax, 12(strm)				// strm->next_out = out + OFF;
238
239	cmpl	%edx, last					// last vs in
240	jbe		L67							// if (last <= in) branch to L67 and return to L69
241	movl	last, %eax					// last
242	addl	$5, %eax					// 5 + last
243	subl	%edx, %eax					// 5 + last - in
244L69:									// %eax = new avail_in (from either path)
245	movl	%eax, 4(strm)				// update strm->avail_in
246
247	movl	end, %eax
248	cmpl	%eax, out					// out vs end
249	jae		L70							// if (out>=end) branch to L70, and return to L72
250	addl	$257, %eax					// 257 + end
251	subl	out, %eax					// 257 + end - out
252L72:									// %eax = new avail_out (from either path)
253	movl	%eax, 16(strm)				// update strm->avail_out
254
255	movl	$1, %eax
256	sall	%cl, %eax					// 1 << bits
257	decl	%eax						// (1 << bits) -1
258	andl	hold, %eax					// hold &= (1U << bits) - 1;
259	movl	state, %esi
260	movl	%eax, 56(%esi)				// state->hold = hold;
261	movl	%ecx, 60(%esi)				// state->bits = bits;
262
263	addl	$92, %esp					// pop out local from stack
264
265	// restore saved registers and return
	// (popped in the reverse of the push order used at entry)
266	popl	%ebx
267	popl	%esi
268	popl	%edi
269	leave
270	ret
271
272	// this code segment is branched in from op_nonzero, with op in cl and this.value in esi
273Llength_base:
	// Length code with bit 16 set in op: len starts as this.val, then op & 15 extra bits
	// are taken from hold (pulling in one more input byte first if needed). Afterwards the
	// distance table is indexed with hold & dmask.
274	movzwl	%si, %esi			// this instruction might not be needed, pad here to give better performance
275	movl	%esi, len			// len = (unsigned)(this.val);
276
277	movl	%ecx, %esi			// leave a copy of op at ecx
278	andl	$15, %esi			// op&=15;
279	je		Lop_is_zero			// if (op) {
280	cmpl	bits, %esi			//		op vs bits
281	jbe		Lop_be_bits			//		if (bits < op) {
282	movl	in, %edx			//			in
283	movzbl	(%edx), %eax		//			*in
284	movl	bits, %ecx			//			bits
285	sall	%cl, %eax			//			*in << bits
286	addl	%eax, hold			// 			hold += (unsigned long)(PUP(in)) << bits;
287	incl	%edx				//			in++
288	movl	%edx, in			//			update in
289	addl	$8, bits			//			bits += 8
290Lop_be_bits:					//		}
291	movl	$1, %eax			//		1
292	movl	%esi, %ecx			//		op
293	sall	%cl, %eax			//		1 << op
294	decl	%eax				// 		(1<<op)-1
295	andl	hold, %eax			//		hold & ((1U << op) - 1)
296	addl	%eax, len			//		len += (unsigned)hold & ((1U << op) - 1);
297	shrl	%cl, hold			//		hold >>= op;
298	subl	%esi, bits			//		bits -= op;
299Lop_is_zero:					// }
300	cmpl	$14, bits			// if (bits < 15) {
301	jbe		bits_le_14			//		branch to refill 16-bit into hold, and branch back to next
302L19:							// }
	// bits_le_14 jumps back here after its refill
303	movl	hold, %eax			// hold
304	andl	dmask, %eax			// hold&dmask
305	movl	dcode, %esi			// dcode[] : 4-byte aligned
306	movl	(%esi,%eax,4), %eax	// this = dcode[hold & dmask];
307	jmp		dodist
308
309Lop_16_zero:
	// Reached from dodist when bit 16 of op is clear: cl = op, edx = this.val.
	// Bit 64 set means an invalid distance code; otherwise a second-level lookup is done
	// and control falls through into dodist again.
310	testb	$64, %cl					// op&64
311	jne		Linvalid_distance_code		// if (op&64)!=0, branch to invalid distance code
312	movl	$1, %eax					// 1
313	sall	%cl, %eax					// (1<<op)
314	decl	%eax						// (1<<op)-1
315	andl	hold, %eax					// (hold & ((1U << op) - 1))
316	movzwl	%dx, %edx					// this.val
317	addl	%edx, %eax					// this.val + (hold & ((1U << op) - 1))
318	movl	dcode, %edx					// dcode[] : 4 byte aligned
319	movl	(%edx,%eax,4), %eax			// this = dcode[this.val + (hold & ((1U << op) - 1))];
	// dodist: eax holds a distance table entry (al = op, ah = bits, upper 16 bits = val).
320dodist:
321	movl	%eax, %edx					// this : (val 16-bit, bits 8-bit, op 8-bit)
322	shrl	$16, %edx					// edx = this.val
323	movzbl	%ah, %ecx					// op = (unsigned)(this.bits);
324	shrl	%cl, hold					// hold >>= op;
325	subl	%ecx, bits					// bits -= op;
326	movzbl	%al, %ecx					// op = (unsigned)(this.op);
327	testb	$16, %cl					// op & 16
328	je		Lop_16_zero					// if (op&16)==0 goto test op&64
329
330Ldistance_base:							// if (op&16) {		/* distance base */
	// Entered by fall-through from dodist with cl = op and edx = this.val.
	// Up to two input bytes are pulled in until bits >= op, then dist = val + extra bits.
	// Matches that start inside the already-written output go to Lcopy_direct_from_output;
	// otherwise the source lies in the sliding window and this block handles write == 0.
331	andl	$15, %ecx					//	  op &= 15; edx = dist = this.val;
332	movl	%ecx, op					// 		save a copy of op
333	cmpl	bits, %ecx					//		op vs bits
334	jbe		0f							//		if (bits < op) {
335	movl	in, %ecx					//			in
336	movzbl	(%ecx), %eax				//			*in
337	movl	bits, %ecx					//			bits
338	sall	%cl, %eax					//			*in << bits
339	addl	%eax, hold					//			hold += (unsigned long)(PUP(in)) << bits;
340	incl	in							//			in++
341	addl	$8, bits					//			bits += 8
342	cmpl	bits, op					//			op vs bits
343	jbe		0f							//			if (bits < op) {
344	movl	in, %esi					//				in
345	movzbl	(%esi), %eax				// 				*in
346	movl	bits, %ecx					//				cl = bits
347	sall	%cl, %eax					//				*in << bits
348	addl	%eax, hold					//				hold += (unsigned long)(PUP(in)) << bits;
349	incl	%esi						//				in++
350	movl	%esi, in					//				update in
351	addl	$8, bits					//				bits += 8
3520:										// }		}
353
354	movzwl	%dx, %edx					// dist = (unsigned)(this.val);
355	movl	$1, %eax					// 1
356	movzbl	op, %ecx					// cl = op
357	sall	%cl, %eax					// 1 << op
358	decl	%eax						// ((1U << op) - 1)
359	andl	hold, %eax					// (unsigned)hold & ((1U << op) - 1)
360	addl	%edx, %eax					// dist += (unsigned)hold & ((1U << op) - 1);
361
362#ifdef INFLATE_STRICT
363
	// checked before the extra bits are dropped from hold, so the error exit leaves them buffered
364	cmpl	dmax, %eax						// dist vs dmax
365	ja		Linvalid_distance_too_far_back	// if (dist > dmax) break for invalid distance too far back
366
367#endif
368
369	movl	%eax, dist						// save a copy of dist in stack
370	shrl	%cl, hold						// hold >>= op;
371	subl	%ecx, bits						// bits -= op;
372
373	movl	out, %eax
374	subl	beg, %eax						// eax = op = out - beg
375	cmpl	%eax, dist						// dist vs op
376	jbe		Lcopy_direct_from_output		// if (dist <= op) branch to copy direct from output
377
378											// if (dist > op) {
379	movl	dist, %ecx						//	dist
380	subl	%eax, %ecx						//	ecx = op = dist - op;
381	cmpl	%ecx, whave						//  whave vs op
382	jb		Linvalid_distance_too_far_back	//  if (op > whave) break for error;
383
384	movl	write, %edx
385	testl	%edx, %edx
386	jne		Lwrite_non_zero					// if (write==0) {
387	movl	wsize, %eax						//		wsize
388	subl	%ecx, %eax						//		wsize-op
389	movl	window, %esi					//		from=window-OFF
390	addl	%eax, %esi						//		from += wsize-op
391	movl	out, %edx						//		out
392	cmpl	%ecx, len						//		len vs op
393	jbe		L38								// 		if !(op < len) skip
394    subl    %ecx, len						// len -= op
3950:											// do {
396	movzbl  (%esi), %eax					//		load *from
397    movb    %al, (%edx)						//		store to *out
398    incl    %edx							//		out++
399    incl    %esi							//  	PUP(out) = PUP(from);
400    decl    %ecx							//		--op;
401    jne     0b								// } while (op);
402
403    movl    %edx, out						// update out
404    movl    %edx, %esi						// out
405    subl    dist, %esi						// esi = from = out - dist;
406
407L38:			/* copy from output */
	// Common tail of every window-copy path. On entry esi = from, and the stack slots
	// len and out hold the remaining length and the current output pointer.
	// Copies three bytes per iteration, then the 0, 1 or 2 leftover bytes.
408
409			//		while (len > 2) {
410            //            PUP(out) = PUP(from);
411            //            PUP(out) = PUP(from);
412            //            PUP(out) = PUP(from);
413            //            len -= 3;
414            //        }
415            //        if (len) {
416            //            PUP(out) = PUP(from);
417            //            if (len > 1)
418            //                PUP(out) = PUP(from);
419            //       }
420
421	movl	len, %ecx						// len
422	movl	out, %edx						// out
423	subl	$3, %ecx						// pre-decrement len by 3
424	jl		1f								// if len < 3, branch to 1f for remaining processing
4250:											// while (len>2) {
426	movzbl	(%esi), %eax
427	movb	%al, (%edx)						// 		PUP(out) = PUP(from);
428	movzbl	1(%esi), %eax
429	movb	%al, 1(%edx)					//		PUP(out) = PUP(from);
430	movzbl	2(%esi), %eax
431	movb	%al, 2(%edx)					//		PUP(out) = PUP(from);
432	addl	$3, %esi						//		from += 3;
433	addl	$3, %edx						//		out += 3;
434	subl	$3, %ecx						//		len -= 3;
435	jge		0b								// }
436	movl	%edx, out						// update out, in case len == 0
4371:
438	addl	$3, %ecx						// post-increment len by 3
439	je		L_tst_do_while_loop_end			// if (len) {
440	movzbl	(%esi), %eax					//
441	movb	%al, (%edx)						//		PUP(out) = PUP(from);
442	incl	%edx							//		out++
443	movl	%edx, out						//		update out, in case len == 1
444	cmpl	$2, %ecx						//
445	jne		L_tst_do_while_loop_end			//		if len==1, break
	// esi was not advanced after the first leftover byte, hence the 1(%esi) source below
446	movzbl	1(%esi), %eax
447	movb	%al, (%edx)						//		PUP(out) = PUP(from);
448	incl	%edx							//		out++
449	movl	%edx, out						//		update out
450	jmp		L_tst_do_while_loop_end			//	}
451
452	.align 4,0x90
453length_2nd_level_else:
	// Reached from op_nonzero when bit 64 of op is set (cl = op).
	// Bit 32 set means end-of-block; otherwise the length code is invalid.
454	andl	$32, %ecx						// test end-of-block
455	je		invalid_literal_length_code		// if (op&32)==0, branch for invalid literal/length code break
456	movl	state, %edx						// if (op&32), end-of-block is detected
457	movl	$11, (%edx)						// state->mode = TYPE
458	jmp		return_unused_bytes
459
	// Out-of-line half of the avail_out computation in return_unused_bytes (out >= end).
	// Overwrites the end slot and %esi, neither of which is read again before returning.
460L70:
461	movl	out, %edx						// out
462	subl	%edx, end						// (end-out)
463	movl	end, %esi						// %esi = (end-out) = -(out - end);
464	leal	257(%esi), %eax					// %eax = 257 + %esi = 257 - (out -end)
465	jmp		L72								// return to update state and return
466
	// Out-of-line half of the avail_in computation in return_unused_bytes (last <= in).
	// Overwrites the last slot and %edx; in has already been stored to strm->next_in.
467L67:										// %edx = in, to return 5 - (in - last) in %eax
468	subl	%edx, last						// last - in
469	movl	last, %edx						// %edx = last - in = - (in - last);
470	leal	5(%edx), %eax					// %eax = 5 + %edx = 5 - (in - last);
471	jmp		L69								// return to update state and return
472
473bits_le_14:
	// Out-of-line refill used by Llength_base when bits < 15: two input bytes are added
	// to hold, then control returns to L19 for the distance lookup.
	// Unlike the refill at the top of the main loop, the variant enabled here uses two
	// byte loads rather than one 16-bit load.
474#if 1
475	leal	8(bits), %esi				// esi = bits+8
476	movl	in, %eax					// eax = in
477	movzbl	(%eax), %edx				// edx = *in++
478	movl	bits, %ecx					// cl = bits
479	sall	%cl, %edx					// 1st *in << bits
480	addl	hold, %edx					// hold += 1st *in << bits
481	movzbl	1(%eax), %eax				// 2nd *in
482	movl	%esi, %ecx					// cl = bits+8
483	sall	%cl, %eax					// 2nd *in << (bits+8)
484	addl	%eax, %edx					// hold += 2nd *in << (bits+8)
485	movl	%edx, hold					// update hold
486	addl	$2, in						// in += 2
487	addl	$16, bits					// bits += 16;
488	jmp	L19
489#else
490	/* this code segment does not run as fast as the other original code segment, possibly the processor
491		need extra time to handle unaligned short access */
492	movl    in, %edx                    //          unsigned short *inp = (unsigned short *) (in+OFF);
493    movzwl  (%edx), %eax                //          *((unsigned short *) in);
494    movl    bits, %ecx                  //          bits
495    sall    %cl, %eax                   //          *((unsigned short *) in) << bits
496    addl    %eax, hold                  //          hold += (unsigned long) *((unsigned short *) in) << bits;
497    addl    $2, %edx                    //          in += 2;
498    addl    $16, %ecx                   //          bits += 16;
499	movl	%edx, in
500	movl	%ecx, bits
501	jmp	L19
502#endif
503invalid_literal_length_code:
	// Error exit: the call/pop pair leaves the address of local label 0 in %eax, and the
	// leal adds the assemble-time difference LC2-0b to form the address of the LC2 string.
	// NOTE(review): 24(strm) is presumably strm->msg and 27 presumably the BAD mode value
	// (the end-of-block path documents (state) as state->mode) -- confirm against zlib headers.
504    call    0f
5050:	popl    %eax
506	leal	LC2-0b(%eax), %eax			// %eax = address of "invalid literal/length code"
507	movl	%eax, 24(strm)				// store message pointer in the stream
508	movl	state, %esi
509	movl	$27, (%esi)					// store 27 in the first field of state
510	jmp		return_unused_bytes
511Linvalid_distance_code:
	// Same sequence as above, using the LC1 string.
512    call    0f
5130:	popl    %eax
514	leal	LC1-0b(%eax), %eax			// %eax = address of "invalid distance code"
515	movl	%eax, 24(strm)				// store message pointer in the stream
516	movl	state, %eax
517	movl	$27, (%eax)					// store 27 in the first field of state
518	jmp		return_unused_bytes
519
520#ifdef	INFLATE_STRICT
	// INFLATE_STRICT builds align here and then emit nine zero bytes ahead of the label.
	// NOTE(review): presumably placement tuning for the copy loop below -- confirm.
521	.align	4,0x90
522	.byte	0
523	.byte	0
524	.byte	0
525	.byte	0
526	.byte	0
527	.byte	0
528	.byte	0
529	.byte	0
530	.byte	0
531#endif
	// Match source lies entirely in the output already produced (dist <= out - beg).
	// The three-byte loop runs at least once without testing len first, so a len below 3
	// on entry would over-copy. NOTE(review): presumably len >= 3 is guaranteed by the
	// length tables -- confirm.
532Lcopy_direct_from_output:
533	movl	out, %edx							// out
534	subl	dist, %edx							// from = out - dist
535	movl	out, %ecx							// out
536	movl	len, %esi							// len
537	subl	$3, %esi							// pre-decrement len by 3
5380:												// do {
539	movzbl	(%edx), %eax
540	movb	%al, (%ecx)							// 	PUP(out) = PUP(from);
541	movzbl	1(%edx), %eax
542	movb	%al, 1(%ecx)						// 	PUP(out) = PUP(from);
543	movzbl	2(%edx), %eax
544	movb	%al, 2(%ecx)						// 	PUP(out) = PUP(from);
545	addl	$3, %edx							// 	from += 3
546	addl	$3, %ecx							// 	out += 3
547	subl	$3, %esi							// 	len -= 3
548	jge		0b									// } while (len > 2);
549	movl	%ecx, out							// update out in case len == 0
550	addl	$3, %esi							// post-increment len by 3
551	je		L_tst_do_while_loop_end				// if (len) {
552	movzbl	(%edx), %eax
553	movb	%al, (%ecx)							//		PUP(out) = PUP(from);
554	incl	%ecx
555	movl	%ecx, out							//		out++
556	cmpl	$2, %esi							//		len vs 2
557	jne		L_tst_do_while_loop_end				//		if len==1, done
558	movzbl	1(%edx), %eax						//		if (len > 1)
559	movb	%al, (%ecx)							//			PUP(out) = PUP(from);
560	incl	%ecx
561	movl	%ecx, out							//			out++
562	jmp		L_tst_do_while_loop_end				// }
563
564	.align 4,0x90
565Lwrite_non_zero:								// %edx = write, %ecx = op
	// Window copy when write != 0. If write >= op the source is one contiguous run ending
	// at window+write; otherwise it starts near the end of the window and wraps to its start.
	// Every path finishes at L38 with esi = from and the len/out slots up to date.
566	movl	window, %esi						// from = window - OFF;
567	cmp		%ecx, %edx							// write vs op, test for wrap around window or contiguous in window
568	jae		Lcontiguous_in_window				// if (write >= op) branch to contiguous in window
569
570Lwrap_around_window: 							// wrap around window
571	addl	write_wsize, %esi					// from += write+wsize
572	subl	%ecx, %esi							// from += wsize + write - op;
573	subl	%edx, %ecx							// op -= write
574	cmpl	%ecx, len							// len vs op
575	jbe		L38									// if (len <= op) break to copy from output
576	subl	%ecx, len							// len -= op;
577	movl	out, %edx							// out
5780:												// do {
579	movzbl	(%esi), %eax						// 	*from
580	movb	%al, (%edx)							// 	*out
581	incl	%esi								// 	from++
582	incl	%edx								// 	out++
583	decl	%ecx								// 	--op
584	jne		0b									// } while (op);
585
	// end of window reached: continue from the start of the window
586	movl	%edx, out							// save out in case we need to break to L38
587	movl	window, %esi						// from = window - OFF;
588	movl	len, %eax							// len
589	cmpl	%eax, write							// write vs len
590	jae		L38									// if (write >= len) break to L38
591
592	movl	write, %ecx							// op = write
593	subl	%ecx, len							// len -= op;
5940:												// do {
595	movzbl	(%esi), %eax						//	*from
596	movb	%al, (%edx)							//  *out
597	incl	%esi								//  from++
598	incl	%edx								//	out++
599	decl	%ecx								//  --op
600	jne		0b									// } while (op);
601
	// window portion exhausted: the rest of the match comes from the output itself
602	movl	%edx, %esi							// from = out
603	movl	%edx, out							// save a copy of out
604	subl	dist, %esi							// from = out - dist;
605	jmp		L38									// break to copy from output
606
607Lcontiguous_in_window:								// contiguous in window, edx = write, %ecx = op
608	subl	%ecx, %edx								// write - op
609	addl	%edx, %esi								// from += write - op;
610	cmpl	%ecx, len								// len vs op
611	jbe		L38										// if (len <= op) break to copy from output
612	movl	out, %edx								// out
613	subl	%ecx, len								// len -= op;
614
6150:													// do {
616	movzbl	(%esi), %eax							// 	*from
617	movb	%al, (%edx)								// 	*out
618	incl	%esi									// 	from++
619	incl	%edx									// 	out++
620	decl	%ecx									// 	op--
621	jne		0b										// } while (op);
622
623	movl	%edx, out								// update out
624	movl	%edx, %esi								// from = out
625	subl	dist, %esi								// from = out - dist;
626	jmp		L38
627
628Linvalid_distance_too_far_back:
	// Error exit used when op > whave, and when dist > dmax in INFLATE_STRICT builds.
	// The call/pop pair leaves the address of local label 0 in %eax; the leal then adds
	// LC0-0b to form the address of the LC0 string.
	// NOTE(review): 24(strm) is presumably strm->msg and 27 presumably the BAD mode value
	// -- confirm against zlib headers.
629    call    0f
6300:	popl    %eax
631	leal	LC0-0b(%eax), %eax			// %eax = address of "invalid distance too far back"
632	movl	%eax, 24(strm)				// store message pointer in the stream
633	movl	state, %ecx
634	movl	$27, (%ecx)					// store 27 in the first field of state
635	jmp		return_unused_bytes
636
637#endif
638
639#if (defined __x86_64__)
640	.cstring
641LC0:
642	.ascii "invalid distance too far back\0"
643LC1:
644	.ascii "invalid distance code\0"
645LC2:
646	.ascii "invalid literal/length code\0"
647	.text
648	.align 4,0x90
649
650#ifdef  INFLATE_STRICT
651	.byte 0
652	.byte 0
653	.byte 0
654	.byte 0
655	.byte 0
656	.byte 0
657	.byte 0
658	.byte 0
659	.byte 0
660	.byte 0
661	.byte 0
662	.byte 0
663#endif
664
665.globl _inflate_fast
666_inflate_fast:
667
668	// set up rbp
669	pushq	%rbp
670	movq	%rsp, %rbp
671
672	// save registers in stack
673	pushq	%r15
674	pushq	%r14
675	pushq	%r13
676	pushq	%r12
677	pushq	%rbx
678
679	#define	strm		%r13
680	#define	state		%rdi
681	#define	in			%r12
682	#define	in_d		%r12d
683	#define	out			%r10
684	#define	out_d		%r10d
685	#define	write		%r15d
686	#define hold		%r9
687	#define holdd		%r9d
688	#define	bits		%r8d
689	#define	lcode		%r14
690	#define	len			%ebx
691	#define from		%rcx
692	#define	dmax		%r11d
693
694	#define	last		-104(%rbp)
695	#define	beg			-96(%rbp)
696	#define	end			-88(%rbp)
697	#define	wsize		-80(%rbp)
698	#define	whave		-76(%rbp)
699	#define	window		-72(%rbp)
700	#define	dcode		-64(%rbp)
701	#define	lmask		-56(%rbp)
702	#define	dmask		-112(%rbp)
703	#define	wsize_write	-116(%rbp)
704	#define	write_1		-128(%rbp)
705	#define	dist		-44(%rbp)
706
707	// reserve stack memory for local variables 128-40=88
708	subq	$88, %rsp
709
710	movq	%rdi, strm
711	movq	56(%rdi), state						// state = (struct inflate_state FAR *)strm->state;
712	movq	(strm), in							// in = strm->next_in - OFF;
713	movl	8(strm), %eax						// strm->avail_in
714	subl	$5, %eax							// (strm->avail_in - 5)
715	addq	in, %rax							// in + (strm->avail_in - 5)
716	movq	%rax, last							// last = in + (strm->avail_in - 5)
717	movq	24(strm), out						// out = strm->next_out
718	movl	32(strm), %eax						// strm->avail_out
719	subl	%eax, %esi							// (start - strm->avail_out);
720	movq	out, %rdx							// strm->next_out
721	subq	%rsi, %rdx							// out - (start - strm->avail_out);
722	movq	%rdx, beg							// beg = out - (start - strm->avail_out);
723	subl	$257, %eax							// (strm->avail_out - 257)
724	addq	out, %rax							// out + (strm->avail_out - 257);
725	movq	%rax, end							// end = out + (strm->avail_out - 257);
726
727#ifdef INFLATE_STRICT
728	movl	20(state), dmax						// dmax = state->dmax;
729#endif
730
731	movl	52(state), %ecx						// state->wsize
732	movl	%ecx, wsize							// wsize = state->wsize;
733	movl	56(state), %ebx						// state->whave;
734	movl	%ebx, whave							// whave = state->whave;
735	movl	60(state), write					// write = state->write;
736	movq	64(state), %rax						// state->window
737	movq	%rax, window						// window = state->window;
738	movq	72(state), hold						// hold = state->hold;
739	movl	80(state), bits						// bits = state->bits;
740
741	movq	96(state), lcode					// lcode = state->lencode;
742	movq	104(state), %rdx					// state->distcode;
743	movq	%rdx, dcode							// dcode = state->distcode;
744
745	movl	116(state), %ecx					// state->distbits
746	movl	$1, %eax
747	movl	%eax, %edx							// 1
748	sall	%cl, %edx							// (1U << state->distbits)
749	movl	112(state), %ecx					// state->lenbits
750	sall	%cl, %eax							// (1U << state->lenbits)
751	decl	%eax								// (1U << state->lenbits) - 1
752	movq	%rax, lmask							// lmask = (1U << state->lenbits) - 1
753	decl	%edx								// (1U << state->distbits) - 1
754	movq	%rdx, dmask							// dmask = (1U << state->distbits) - 1
755
756	movl	wsize, %ecx							// wsize
757	addl	write, %ecx							// wsize + write
758	movl	%ecx, wsize_write					// wsize_write = wsize + write
759
760	leal	-1(%r15), %ebx						// write - 1
761	movq	%rbx, write_1						// write_1 = write - 1
762
763L_do_while_loop:
764	cmpl	$14, bits							// bits vs 14
765	ja		0f									// if (bits < 15) {
766	movzwl	(in), %eax							//		read 2 bytes from in
767	movl	bits, %ecx							//		set up cl = bits
768	salq	%cl, %rax							//		(*in) << bits
769	addq	%rax, hold							// 		hold += (*in) << bits
770	addq	$2, in								//		in += 2
771	addl	$16, bits							//		bits += 16
7720:												// }
773	movq	lmask, %rax							//	lmask
774	andq	hold, %rax							//	hold & lmask
775	jmp		1f
776	.align 4,0x90
777Lop_nonzero:
778	movzbl	%al, %ecx							// op in al and cl
779	testb	$16, %cl							// check for length base processing (op&16)
780	jne		L_length_base						// if (op&16) branch to length base processing
781	testb	$64, %cl							// check for 2nd level length code (op&64==0)
782	jne		L_end_of_block						// if (op&64)!=0, branch for end-of-block processing
783
784	/* 2nd level length code : (op&64) == 0*/
785L_2nd_level_length_code:
786	movl	$1, %eax							// 1
787	sall	%cl, %eax							// 1 << op
788	decl	%eax								// ((1U << op) - 1)
789	andq	hold, %rax							// (hold & ((1U << op) - 1))
790	movzwl	%dx, %edx
791	addq	%rdx, %rax							// this = lcode[this.val + (hold & ((1U << op) - 1))];
7921:
793	movl	(lcode,%rax,4), %eax				// this = lcode[hold & lmask];
794Ldolen:
795	movl	%eax, %edx							// a copy of this
796	shrl	$16, %edx							// edx = this.val;
797	movzbl	%ah, %ecx							// op = this.bits
798	shrq	%cl, hold							// hold >>= op;
799	subl	%ecx, bits							// bits -= op;
800	testb	%al, %al							// op = (unsigned)(this.op);
801	jne		Lop_nonzero							// if (op!-0) branch for copy operation
802L_literal:
803	movb	%dl, (out)							// *out = this.val
804	incq	out									// out ++
805L_do_while_loop_check:
806	cmpq	last, in							// in vs last
807	jae		L_return_unused_byte				// if in >= last, break to return unused byte processing
808	cmpq	end, out							// out vs end
809	jb		L_do_while_loop						// back to do_while_loop if out < end
810
811	/* return unused bytes (on entry, bits < 8, so in won't go too far back) */
812
813L_return_unused_byte:
814	movl	out_d, %esi
815	jmp		L34
816
817L_length_base:				/* al = cl = op, edx = this.val, op&16 = 16 */
818	movzwl	%dx, len							// len = (unsigned)(this.val);
819	movl	%ecx, %edx							// op
820	andl	$15, %edx							// op &= 15;
821	je		1f									// if (op) {
822	cmpl	bits, %edx							//		op vs bits
823	jbe		0f									//		if (bits < op) {
824	movzbl	(in), %eax							//			*in
825	movl	bits, %ecx							//			cl = bits
826	salq	%cl, %rax							//			*in << bits
827	addq	%rax, hold							//			hold += (unsigned long)(PUP(in)) << bits;
828	incq	in									//			in++
829	addl	$8, bits							//			bits += 8
8300:												//		}
831	movl	$1, %eax							//		1
832	movl	%edx, %ecx							//		cl = op
833	sall	%cl, %eax							//		1 << op
834	decl	%eax								//		(1 << op) - 1
835	andl	holdd, %eax							//		 (unsigned)hold & ((1U << op) - 1);
836	addl	%eax, len							//		len += (unsigned)hold & ((1U << op) - 1);
837	shrq	%cl, hold							//		hold >>= op;
838	subl	%edx, bits							//		bits -= op;
8391:												// }
840	cmpl	$14, bits							// bits vs 14
841	jbe		L99									// if (bits < 15) go to loading to hold and return to L19
842L19:												// }
843	movq	dmask, %rax							// dmask
844	andq	hold, %rax							// hold & dmask
845	movq	dcode, %rdx							// dcode[]
846	movl	(%rdx,%rax,4), %eax					// this = dcode[hold & dmask];
847	jmp		L_dodist
848	.align 4,0x90
8490:												// op&16 == 0, test (op&64)==0 for 2nd level distance code
850	testb	$64, %cl							// op&64
851	jne		L_invalid_distance_code				// if ((op&64)==0) { /* 2nd level distance code */
852	movl	$1, %eax							//	1
853	sall	%cl, %eax							//  1 << op
854	decl	%eax								// (1 << op) - 1
855	andq	hold, %rax							// (hold & ((1U << op) - 1))
856	movzwl	%dx, %edx							// this.val
857	addq	%rdx, %rax							// this.val + (hold & ((1U << op) - 1))
858	movq	dcode, %rcx							// dcode[]
859	movl	(%rcx,%rax,4), %eax					// this = dcode[this.val + (hold & ((1U << op) - 1))];
860L_dodist:
861	movl	%eax, %edx							// this
862	shrl	$16, %edx							// dist = (unsigned)(this.val);
863	movzbl	%ah, %ecx							// cl = op = this.bits
864	shrq	%cl, hold							// hold >>= op;
865	subl	%ecx, bits							// bits -= op;
866	movzbl	%al, %ecx							// op = (unsigned)(this.op);
867	testb	$16, %cl							// (op & 16)	test for distance base
868	je		0b									// if (op&16) == 0, branch to check for 2nd level distance code
869
870L_distance_base:								/* distance base: add (op & 15) extra bits taken from hold to the base value in %edx */
871	// first make sure hold contains at least op bits, reading at most two more input bytes
872	movl	%ecx, %esi							// op
873	andl	$15, %esi							// op&=15  (count of extra bits, kept in %esi)
874	cmpl	bits, %esi							// op vs bits
875	jbe		1f									// if (bits < op) {			(branch skips the refill when op <= bits)
876	movzbl	(in), %eax							//		*in
877	movl	bits, %ecx							//		cl = bits
878	salq	%cl, %rax							//		*in << bits
879	addq	%rax, hold							//		hold += (unsigned long)(PUP(in)) << bits;
880	incq	in									//		in++
881	addl	$8, bits							//		bits += 8
882	cmpl	bits, %esi							//		op vs bits
883	jbe		1f									//		if (bits < op) {
884	movzbl	(in), %eax							//			*in
885	movl	bits, %ecx							//			cl = bits
886	salq	%cl, %rax							//			*in << bits
887	addq	%rax, hold							//			hold += (unsigned long)(PUP(in)) << bits;
888	incq	in									//			in++
889	addl	$8, bits							//			bits += 8
8901:												// }	}
891
892	movzwl	%dx, %edx							// dist (zero-extend the 16-bit base value)
893	movl	$1, %eax							// 1
894	movl	%esi, %ecx							// cl = op
895	sall	%cl, %eax							// (1 << op)
896	decl	%eax								// (1 << op) - 1
897	andl	holdd, %eax							// (unsigned)hold & ((1U << op) - 1)
898	addl	%edx, %eax							// dist += (unsigned)hold & ((1U << op) - 1);
899	movl	%eax, dist							// save a copy of dist (eax is reused below)
900
901#ifdef INFLATE_STRICT
902	cmp		%eax, dmax							// dmax vs dist
903	jb		L_invalid_distance_too_far_back		// if (dmax < dist) break for invalid distance too far back; NOTE: %esi holds op, not out, on this branch
904#endif
905
906	shrq	%cl, hold							// hold >>= op;  (cl still holds op from above)
907	subl	%esi, bits							// bits -= op;
908	movl	out_d, %esi							// out  (copy kept in %esi)
909	movl	out_d, %eax							// out
910	subl	beg, %eax							// op = out - beg  (bytes of output available to copy from)
911	cmpl	%eax, dist							// dist vs op,  /* see if copy from window */
912	jbe		L_copy_direct_from_output			// if (dist <= op) branch to copy direct from output; otherwise fall through with op in %eax
913
914L_distance_back_in_window:						// reached by fall-through when dist > out - beg; %eax = out - beg
915
916	movl	dist, %edx							// dist
917	subl	%eax, %edx							// op = dist - op;	/* distance back in window */
918
919	cmpl	%edx, whave							// whave vs op
920	jb		L_invalid_distance_too_far_back		// if (op > whave), break for invalid distance too far back
921
922	testl	write, write						// if (write!=0)
923	jne		L_wrap_around_window				//		branch to wrap around window; write == 0 falls through to the very common case
924
925L_very_common_case:							// write == 0: source starts at window + wsize - op; %edx = op
926
927	movl	wsize, %eax							//	wsize
928	subl	%edx, %eax							//	wsize - op
929	movq	window, from						//	from = window - OFF;
930	addq	%rax, from							//	from += wsize - op;
931
932	movl	%edx, %esi							//  esi = op, used as the loop counter in L_some_from_window (edx keeps op)
933	cmpl	%edx, len							//  len vs op
934	ja		L_some_from_window					//  if (len > op), branch for aligned code block L_some_from_window; otherwise fall through to L38
935L38:											// copy len bytes from "from" to out: 3 bytes per pass at local label 0, then a 0..2 byte tail at local label 1
936	subl	$3, len								// pre-decrement len by 3
937	jge		0f									// if len >= 3, branch to the aligned code block
9381:	addl	$3, len								// post-increment len by 3 (undoes the last decrement; the remainder is copied below)
939	je		L_do_while_loop_check				// if (len==0) break to L_do_while_loop_check
940	movzbl	(from), %eax						// from[0]
941	movb	%al, (out)							// *out
942	incq	out									// out++
943	cmpl	$2, len								// len vs 2
944	jne		L_do_while_loop_check				// if len!=2 break to L_do_while_loop_check
945	movzbl	1(from), %eax						// from[1]  (from was not advanced after the first byte)
946	movb	%al, (out)							// *out
947	incq	out									// out++
948	jmp		L_do_while_loop_check				// break to L_do_while_loop_check
949
950	.align 4,0x90
9510:												// do {
952	movzbl	(from), %eax						//		from[0]
953	movb	%al, (out)							//		out[0]
954	movzbl	1(from), %eax						//		from[1]
955	movb	%al, 1(out)							//		out[1]
956	movzbl	2(from), %eax						//		from[2]
957	movb	%al, 2(out)							//		out[2]
958	addq	$3, out								//		out += 3
959	addq	$3, from							//		from += 3
960	subl	$3, len								//		len -= 3
961	jge		0b									// } while (len>=0);
962	jmp		1b									// branch back to the possibly unaligned code (tail copy at local label 1)
963
964	.align 4,0x90
965L_end_of_block:								// op in %ecx: (op & 32) takes the end-of-block exit, anything else is an error exit with message LC2
966	andl	$32, %ecx							// op & 32
967	jne		L101								// if (op&32) branch to end-of-block break
968	leaq	LC2(%rip), from						// from = address of the LC2 message string
969	movq	from, 48(strm)						// 48(strm) = message pointer (presumably strm->msg); the mode is set on the next line
970	movl	$27, (state)						// state->mode = BAD;
971	movl	out_d, %esi							// esi = out, which the exit code after L34 subtracts when computing avail_out
972
973L34:											// common exit: give back unused whole bytes, store next_in/next_out/avail_in/avail_out/hold/bits, then return
974	movl	bits, %eax							// bits
975	shrl	$3, %eax							// len = bits >> 3;  (whole unused bytes still in hold)
976	mov		%eax, %edx							// len
977	subq	%rdx, in							// in -= len
978	sall	$3, %eax							// len << 3
979	movl	bits, %ecx							// bits
980	subl	%eax, %ecx							// bits -= len << 3  (result kept in %ecx)
981	movq	in, (strm)							// strm->next_in = in + OFF;
982	movq	out, 24(strm)						// strm->next_out = out + OFF;
983	cmpq	in, last							// last vs in
984	jbe		L67									// if (last <= in) branch to L67 and return to L69
985	movl	last, %eax							// last
986	addl	$5, %eax							// last + 5
987	subl	in_d, %eax							// 5 + last - in
988L69:
989	movl	%eax, 8(strm)						// update strm->avail_in
990
991	cmpq	end, out							// out vs end
992	jae		L70									// if (out >= end) branch to L70 and return to L72
993	movl	end, %eax							// end
994	addl	$257, %eax							// 257 + end
995	subl	%esi, %eax							// 257 + end - out;  (%esi is expected to hold out on entry to L34)
996L72:
997	movl	%eax, 32(strm)						// update strm->avail_out
998
999	movl	$1, %eax							// 1
1000	sall	%cl, %eax							// 1 << bits  (cl = the reduced bits computed above)
1001	decl	%eax								// (1U << bits) - 1
1002	andq	hold, %rax							// hold &= (1U << bits) - 1;
1003	movq	%rax, 72(state)						// state->hold = hold;
1004	movl	%ecx, 80(state)						// state->bits = bits;
1005
1006	// release the 88 bytes of stack used for local variables
1007	addq	$88, %rsp
1008
1009	// restore registers from stack
1010	popq	%rbx
1011	popq	%r12
1012	popq	%r13
1013	popq	%r14
1014	popq	%r15
1015
1016	// return to caller
1017	leave
1018	ret
1019
1020	.align 4,0x90
1021L99:										// append the next two input bytes to hold (bits += 16), then jump to L19
1022	leal	8(bits), %esi						//		esi = bits+8  (shift count for the 2nd byte)
1023	movzbl	(in), %edx							//		1st *in
1024	movl	bits, %ecx							//		cl = bits
1025	salq	%cl, %rdx							//		1st *in << bits
1026	addq	%rdx, hold							// 		1st hold += (unsigned long)(PUP(in)) << bits;
1027	movzbl	1(in), %eax							//		2nd *in
1028	movl	%esi, %ecx							//		cl = bits + 8
1029	salq	%cl, %rax							//		2nd *in << bits+8
1030	addq	%rax, hold							// 		2nd hold += (unsigned long)(PUP(in)) << bits;
1031	addq	$2, in								//		in += 2
1032	addl	$16, bits							//		bits += 16
1033	jmp		L19									//		continue at L19
1034
1035L101:										// end-of-block exit, reached from L_end_of_block when (op & 32) is set
1036	movl	$11, (state)						// state->mode = 11
1037	movl	out_d, %esi							// esi = out, which the exit code after L34 subtracts when computing avail_out
1038	jmp	L34									// leave through the common exit
1039	.align 4,0x90
1040L70:										// avail_out computation for the out >= end case (branched to from L34)
1041	movl	end, %eax							// end
1042	subl	%esi, %eax							// end - out  (%esi is expected to hold out)
1043	addl	$257, %eax							// 257 + end - out
1044	jmp		L72									// back to L72 to store the result in 32(strm)
1045	.align 4,0x90
1046L67:										// avail_in computation for the last <= in case (branched to from L34)
1047	movl	last, %eax							// last
1048	subl	in_d, %eax							// last - in
1049	addl	$5, %eax							// 5 + last - in
1050	jmp		L69									// back to L69 to store the result in 8(strm)
1051
1052
1053	.align 4,0x90
1054
1055	// pad with the following 4 zero bytes so that the major loop is aligned to a 16-byte boundary, for better performance
1056	.byte 0
1057	.byte 0
1058	.byte 0
1059	.byte 0
1060L_copy_direct_from_output:				// dist <= out - beg: copy len bytes from out - dist; the first 3-byte pass runs unconditionally
1061	mov		dist, %eax						// dist
1062	movq	out, %rdx						// out
1063	subq	%rax, %rdx						// from = out - dist;  (from lives in %rdx in this block)
1064	subl	$3, len							// pre-decrement len by 3
1065											// do {
10660:	movzbl	(%rdx), %eax					// 	from[0]
1067	movb	%al, (out)						//	out[0]
1068	movzbl	1(%rdx), %eax					//	from[1]
1069	movb	%al, 1(out)						//	out[1]
1070	movzbl	2(%rdx), %eax					//	from[2]
1071	movb	%al, 2(out)						//	out[2]
1072	addq	$3, out							//	out+=3
1073	addq	$3, %rdx						//  from+=3
1074	subl	$3, len							//  len-=3
1075	jge		0b								// } while (len>=0);
10761:	addl	$3, len							// post-increment len by 3 (undoes the last decrement; the remainder is copied below)
1077	je		L_do_while_loop_check			// if len==0, branch to do_while_loop_check
1078
1079	movzbl	(%rdx), %eax					// from[0]
1080	movb	%al, (out)						// *out
1081	incq	out								// out++
1082	cmpl	$2, len							// len == 2 ?
1083	jne		L_do_while_loop_check			// if len!=2, branch to do_while_loop_check
1084
1085	movzbl	1(%rdx), %eax					// from[1]  (%rdx was not advanced after the first tail byte)
1086	movb	%al, (out)						// *out
1087	incq	out								// out++
1088	jmp	L_do_while_loop_check				// branch to do_while_loop_check
1089
1090	.align 4,0x90
1091L_some_from_window:		// in: from = source in window, out = destination, %esi = %edx = op; copies op bytes, then the rest comes from output
1092									// do {
1093	movzbl	(from), %eax			// 	*from
1094	movb	%al, (out)				// 	*out
1095	incq	from					// 	from++
1096	incq	out						// 	out++
1097	decl	%esi					// 	--op
1098	jne		L_some_from_window		// } while (op);
1099	subl	%edx, len				// len -= op;  (edx still holds the original op)
1100	mov		dist, %eax				// dist
1101	movq	out, from				// out
1102	subq	%rax, from				// from = out - dist;
1103	jmp		L38						// copy from output
1104
1105	.align 4,0x90
1106L_wrap_around_window:				// write != 0, %edx = op: the source may span the end of the window and continue at its start
1107	cmpl	%edx, write					// write vs op
1108	jae		L_contiguous_in_window		// if (write >= op) branch to contiguous in window
1109	movl	wsize_write, %eax			// wsize+write
1110	subl	%edx, %eax					// wsize+write-op
1111	movq	window, from				// from = window - OFF
1112	addq	%rax, from					// from += wsize+write-op
1113	subl	write, %edx					// op -= write  (bytes to take from the end of the window)
1114	cmpl	%edx, len					// len vs op
1115	jbe		L38							// if (len<=op) branch to L38, which copies all len bytes from "from"
1116
1117	subl	%edx, len					// len -= op;
11180:										// do {				/* some from end of window */
1119	movzbl	(from), %eax				//		*from
1120	movb	%al, (out)					//		*out
1121	incq	from						//		from++
1122	incq	out							//		out++
1123	decl	%edx						//		op--
1124	jne		0b							// } while (op);
1125	movq	window, from				// from = window - OFF;  continue at the start of the window
1126
1127	cmpl	len, write					// write vs len
1128	jae		L38							// if (write >= len) branch to L38, which copies the rest from "from" (start of window)
1129	movl	write, %esi					// op = write
1130	subl	write, len					// len -= op
11311:										// do {				/* some from start of window */
1132	movzbl	(from), %eax				//		*from
1133	movb	%al, (out)					//		*out
1134	incq	from						//		from++
1135	incq	out							//		out++
1136	decl	%esi						//		op--
1137	jne		1b							// } while (op);
1138	mov		dist, %eax					// dist
1139	movq	out, from					// out
1140	subq	%rax, from					// from = out - dist;
1141	jmp		L38							// rest from output
1142
1143	.align 4,0x90
1144L_contiguous_in_window:				// write >= op, %edx = op: the source is one contiguous run starting at window + write - op
1145	movl	write, %eax					// write
1146	subl	%edx, %eax					// write - op
1147	movq	window, from				// from = window - OFF
1148	addq	%rax, from					// from += write - op
1149	cmpl	%edx, len					// len vs op
1150	jbe		L38							// if (len <= op) branch to L38, which copies all len bytes from "from"
1151	subl    %edx, len					// len -= op;
11522:										// do {				/* some from window */
1153	movzbl	(from), %eax				// 	*from
1154	movb	%al, (out)					// 	*out
1155	incq	from						// 	from++
1156	incq	out							// 	out++
1157	decl	%edx						// 	op--
1158	jne		2b							// } while (op);
1159
1160	mov		dist, %eax					// dist
1161	movq	out, from					// out
1162	subq	%rax, from					// from = out - dist;
1163	jmp		L38							// copy from output
1164
1165	.align 4,0x90
1166L_invalid_distance_code:				// error exit: record the LC1 message and mode 27, then leave through L34
1167	leaq	LC1(%rip), %rdx				// rdx = address of the LC1 message string
1168	movq	%rdx, 48(strm)				// 48(strm) = message pointer (presumably strm->msg)
1169	movl	$27, (state)				// state->mode = BAD
1170	movl	out_d, %esi					// esi = out, which the exit code after L34 subtracts when computing avail_out
1171	jmp		L34							// leave through the common exit
1172
1173L_invalid_distance_too_far_back:		// error exit: record the LC0 message and mode 27, then leave through L34
1174	leaq	LC0(%rip), %rdx				// rdx = address of the LC0 message string (scratch register, as in L_invalid_distance_code)
1175	movq	%rdx, 48(strm)				// error message
1176	movl	$27, (state)				// state->mode = BAD
1177	movl	out_d, %esi					// esi = out for the avail_out computation after L34; the INFLATE_STRICT branch arrives with %esi = op
1178	jmp		L34							// leave through the common exit
1179#endif
1180