x86-mont.s revision 285830
1204076Spjd	# $FreeBSD: releng/10.2/secure/lib/libcrypto/i386/x86-mont.s 238405 2012-07-12 19:30:53Z jkim $
2204076Spjd.file	"x86-mont.s"
3211885Spjd.text
4204076Spjd.globl	bn_mul_mont
5204076Spjd.type	bn_mul_mont,@function
6204076Spjd.align	16
7204076Spjdbn_mul_mont:
8204076Spjd.L_bn_mul_mont_begin:
9204076Spjd	pushl	%ebp
10204076Spjd	pushl	%ebx
11204076Spjd	pushl	%esi
12204076Spjd	pushl	%edi
13204076Spjd	xorl	%eax,%eax
14204076Spjd	movl	40(%esp),%edi
15204076Spjd	cmpl	$4,%edi
16204076Spjd	jl	.L000just_leave
17204076Spjd	leal	20(%esp),%esi
18204076Spjd	leal	24(%esp),%edx
19204076Spjd	movl	%esp,%ebp
20204076Spjd	addl	$2,%edi
21204076Spjd	negl	%edi
22204076Spjd	leal	-32(%esp,%edi,4),%esp
23204076Spjd	negl	%edi
24204076Spjd	movl	%esp,%eax
25204076Spjd	subl	%edx,%eax
26204076Spjd	andl	$2047,%eax
27204076Spjd	subl	%eax,%esp
28204076Spjd	xorl	%esp,%edx
29204076Spjd	andl	$2048,%edx
30204076Spjd	xorl	$2048,%edx
31204076Spjd	subl	%edx,%esp
32204076Spjd	andl	$-64,%esp
33204076Spjd	movl	(%esi),%eax
34204076Spjd	movl	4(%esi),%ebx
35211885Spjd	movl	8(%esi),%ecx
36204076Spjd	movl	12(%esi),%edx
37204076Spjd	movl	16(%esi),%esi
38204076Spjd	movl	(%esi),%esi
39211885Spjd	movl	%eax,4(%esp)
40204076Spjd	movl	%ebx,8(%esp)
41211885Spjd	movl	%ecx,12(%esp)
42211885Spjd	movl	%edx,16(%esp)
43211885Spjd	movl	%esi,20(%esp)
44211885Spjd	leal	-3(%edi),%ebx
45211885Spjd	movl	%ebp,24(%esp)
46204076Spjd	leal	OPENSSL_ia32cap_P,%eax
47204076Spjd	btl	$26,(%eax)
48204076Spjd	jnc	.L001non_sse2
49204076Spjd	movl	$-1,%eax
50211885Spjd	movd	%eax,%mm7
51204076Spjd	movl	8(%esp),%esi
52204076Spjd	movl	12(%esp),%edi
53204076Spjd	movl	16(%esp),%ebp
54204076Spjd	xorl	%edx,%edx
55211885Spjd	xorl	%ecx,%ecx
56204076Spjd	movd	(%edi),%mm4
57211885Spjd	movd	(%esi),%mm5
58211885Spjd	movd	(%ebp),%mm3
59211885Spjd	pmuludq	%mm4,%mm5
60211885Spjd	movq	%mm5,%mm2
61211885Spjd	movq	%mm5,%mm0
62211885Spjd	pand	%mm7,%mm0
63211885Spjd	pmuludq	20(%esp),%mm5
64211885Spjd	pmuludq	%mm5,%mm3
65211885Spjd	paddq	%mm0,%mm3
66211885Spjd	movd	4(%ebp),%mm1
67211885Spjd	movd	4(%esi),%mm0
68211885Spjd	psrlq	$32,%mm2
69211885Spjd	psrlq	$32,%mm3
70211885Spjd	incl	%ecx
71211885Spjd.align	16
72211885Spjd.L0021st:
73211885Spjd	pmuludq	%mm4,%mm0
74211885Spjd	pmuludq	%mm5,%mm1
75211885Spjd	paddq	%mm0,%mm2
76211885Spjd	paddq	%mm1,%mm3
77211885Spjd	movq	%mm2,%mm0
78211885Spjd	pand	%mm7,%mm0
79211885Spjd	movd	4(%ebp,%ecx,4),%mm1
80211885Spjd	paddq	%mm0,%mm3
81211885Spjd	movd	4(%esi,%ecx,4),%mm0
82211885Spjd	psrlq	$32,%mm2
83211885Spjd	movd	%mm3,28(%esp,%ecx,4)
84211885Spjd	psrlq	$32,%mm3
85211976Spjd	leal	1(%ecx),%ecx
86211976Spjd	cmpl	%ebx,%ecx
87211976Spjd	jl	.L0021st
88204076Spjd	pmuludq	%mm4,%mm0
89204076Spjd	pmuludq	%mm5,%mm1
90204076Spjd	paddq	%mm0,%mm2
91204076Spjd	paddq	%mm1,%mm3
92204076Spjd	movq	%mm2,%mm0
93204076Spjd	pand	%mm7,%mm0
94204076Spjd	paddq	%mm0,%mm3
95204076Spjd	movd	%mm3,28(%esp,%ecx,4)
96204076Spjd	psrlq	$32,%mm2
97204076Spjd	psrlq	$32,%mm3
98204076Spjd	paddq	%mm2,%mm3
99204076Spjd	movq	%mm3,32(%esp,%ebx,4)
100204076Spjd	incl	%edx
101204076Spjd.L003outer:
102211884Spjd	xorl	%ecx,%ecx
103211884Spjd	movd	(%edi,%edx,4),%mm4
104211884Spjd	movd	(%esi),%mm5
105211884Spjd	movd	32(%esp),%mm6
106211884Spjd	movd	(%ebp),%mm3
107211884Spjd	pmuludq	%mm4,%mm5
108211884Spjd	paddq	%mm6,%mm5
109211884Spjd	movq	%mm5,%mm0
110211884Spjd	movq	%mm5,%mm2
111211884Spjd	pand	%mm7,%mm0
112211884Spjd	pmuludq	20(%esp),%mm5
113211884Spjd	pmuludq	%mm5,%mm3
114211884Spjd	paddq	%mm0,%mm3
115211884Spjd	movd	36(%esp),%mm6
116211884Spjd	movd	4(%ebp),%mm1
117204076Spjd	movd	4(%esi),%mm0
118204076Spjd	psrlq	$32,%mm2
119204076Spjd	psrlq	$32,%mm3
120204076Spjd	paddq	%mm6,%mm2
121204076Spjd	incl	%ecx
122204076Spjd	decl	%ebx
123204076Spjd.L004inner:
124204076Spjd	pmuludq	%mm4,%mm0
125204076Spjd	pmuludq	%mm5,%mm1
126204076Spjd	paddq	%mm0,%mm2
127204076Spjd	paddq	%mm1,%mm3
128204076Spjd	movq	%mm2,%mm0
129204076Spjd	movd	36(%esp,%ecx,4),%mm6
130204076Spjd	pand	%mm7,%mm0
131204076Spjd	movd	4(%ebp,%ecx,4),%mm1
132204076Spjd	paddq	%mm0,%mm3
133204076Spjd	movd	4(%esi,%ecx,4),%mm0
134204076Spjd	psrlq	$32,%mm2
135204076Spjd	movd	%mm3,28(%esp,%ecx,4)
136204076Spjd	psrlq	$32,%mm3
137204076Spjd	paddq	%mm6,%mm2
138204076Spjd	decl	%ebx
139204076Spjd	leal	1(%ecx),%ecx
140204076Spjd	jnz	.L004inner
141204076Spjd	movl	%ecx,%ebx
142204076Spjd	pmuludq	%mm4,%mm0
143204076Spjd	pmuludq	%mm5,%mm1
144204076Spjd	paddq	%mm0,%mm2
145204076Spjd	paddq	%mm1,%mm3
146204076Spjd	movq	%mm2,%mm0
147204076Spjd	pand	%mm7,%mm0
148204076Spjd	paddq	%mm0,%mm3
149211885Spjd	movd	%mm3,28(%esp,%ecx,4)
150211885Spjd	psrlq	$32,%mm2
151211885Spjd	psrlq	$32,%mm3
152211885Spjd	movd	36(%esp,%ebx,4),%mm6
153211976Spjd	paddq	%mm2,%mm3
154211976Spjd	paddq	%mm6,%mm3
155211885Spjd	movq	%mm3,32(%esp,%ebx,4)
156211885Spjd	leal	1(%edx),%edx
157211885Spjd	cmpl	%ebx,%edx
158211885Spjd	jle	.L003outer
159211885Spjd	emms
160211976Spjd	jmp	.L005common_tail
161211976Spjd.align	16
162211976Spjd.L001non_sse2:
163211976Spjd	movl	8(%esp),%esi
164211976Spjd	leal	1(%ebx),%ebp
165211976Spjd	movl	12(%esp),%edi
166211976Spjd	xorl	%ecx,%ecx
167211976Spjd	movl	%esi,%edx
168211976Spjd	andl	$1,%ebp
169211976Spjd	subl	%edi,%edx
170211976Spjd	leal	4(%edi,%ebx,4),%eax
171211976Spjd	orl	%edx,%ebp
172211976Spjd	movl	(%edi),%edi
173211976Spjd	jz	.L006bn_sqr_mont
174211976Spjd	movl	%eax,28(%esp)
175211976Spjd	movl	(%esi),%eax
176211976Spjd	xorl	%edx,%edx
177211976Spjd.align	16
178211976Spjd.L007mull:
179211976Spjd	movl	%edx,%ebp
180211976Spjd	mull	%edi
181211976Spjd	addl	%eax,%ebp
182211885Spjd	leal	1(%ecx),%ecx
183211885Spjd	adcl	$0,%edx
184211885Spjd	movl	(%esi,%ecx,4),%eax
185211885Spjd	cmpl	%ebx,%ecx
186211885Spjd	movl	%ebp,28(%esp,%ecx,4)
187211885Spjd	jl	.L007mull
188211885Spjd	movl	%edx,%ebp
189211885Spjd	mull	%edi
190211885Spjd	movl	20(%esp),%edi
191211885Spjd	addl	%ebp,%eax
192211885Spjd	movl	16(%esp),%esi
193211885Spjd	adcl	$0,%edx
194211885Spjd	imull	32(%esp),%edi
195211885Spjd	movl	%eax,32(%esp,%ebx,4)
196211885Spjd	xorl	%ecx,%ecx
197211885Spjd	movl	%edx,36(%esp,%ebx,4)
198211885Spjd	movl	%ecx,40(%esp,%ebx,4)
199211885Spjd	movl	(%esi),%eax
200211885Spjd	mull	%edi
201211885Spjd	addl	32(%esp),%eax
202211885Spjd	movl	4(%esi),%eax
203211885Spjd	adcl	$0,%edx
204211885Spjd	incl	%ecx
205211885Spjd	jmp	.L0082ndmadd
206211885Spjd.align	16
207211885Spjd.L0091stmadd:
208211885Spjd	movl	%edx,%ebp
209211885Spjd	mull	%edi
210211885Spjd	addl	32(%esp,%ecx,4),%ebp
211211885Spjd	leal	1(%ecx),%ecx
212211885Spjd	adcl	$0,%edx
213211885Spjd	addl	%eax,%ebp
214211885Spjd	movl	(%esi,%ecx,4),%eax
215211885Spjd	adcl	$0,%edx
216211885Spjd	cmpl	%ebx,%ecx
217211885Spjd	movl	%ebp,28(%esp,%ecx,4)
218211885Spjd	jl	.L0091stmadd
219211885Spjd	movl	%edx,%ebp
220211885Spjd	mull	%edi
221211885Spjd	addl	32(%esp,%ebx,4),%eax
222211885Spjd	movl	20(%esp),%edi
223211885Spjd	adcl	$0,%edx
224211885Spjd	movl	16(%esp),%esi
225211885Spjd	addl	%eax,%ebp
226211885Spjd	adcl	$0,%edx
227211885Spjd	imull	32(%esp),%edi
228211885Spjd	xorl	%ecx,%ecx
229211885Spjd	addl	36(%esp,%ebx,4),%edx
230211885Spjd	movl	%ebp,32(%esp,%ebx,4)
231211885Spjd	adcl	$0,%ecx
232211885Spjd	movl	(%esi),%eax
233211885Spjd	movl	%edx,36(%esp,%ebx,4)
234211885Spjd	movl	%ecx,40(%esp,%ebx,4)
235211885Spjd	mull	%edi
236211885Spjd	addl	32(%esp),%eax
237211885Spjd	movl	4(%esi),%eax
238211885Spjd	adcl	$0,%edx
239211885Spjd	movl	$1,%ecx
240211885Spjd.align	16
241211885Spjd.L0082ndmadd:
242211885Spjd	movl	%edx,%ebp
243211885Spjd	mull	%edi
244211885Spjd	addl	32(%esp,%ecx,4),%ebp
245211885Spjd	leal	1(%ecx),%ecx
246211885Spjd	adcl	$0,%edx
247211885Spjd	addl	%eax,%ebp
248211885Spjd	movl	(%esi,%ecx,4),%eax
249211885Spjd	adcl	$0,%edx
250211885Spjd	cmpl	%ebx,%ecx
251211885Spjd	movl	%ebp,24(%esp,%ecx,4)
252211885Spjd	jl	.L0082ndmadd
253211885Spjd	movl	%edx,%ebp
254211885Spjd	mull	%edi
255211885Spjd	addl	32(%esp,%ebx,4),%ebp
256211885Spjd	adcl	$0,%edx
257211885Spjd	addl	%eax,%ebp
258211885Spjd	adcl	$0,%edx
259211885Spjd	movl	%ebp,28(%esp,%ebx,4)
260211885Spjd	xorl	%eax,%eax
261211885Spjd	movl	12(%esp),%ecx
262211885Spjd	addl	36(%esp,%ebx,4),%edx
263211885Spjd	adcl	40(%esp,%ebx,4),%eax
264211885Spjd	leal	4(%ecx),%ecx
265211885Spjd	movl	%edx,32(%esp,%ebx,4)
266211885Spjd	cmpl	28(%esp),%ecx
267211885Spjd	movl	%eax,36(%esp,%ebx,4)
268211976Spjd	je	.L005common_tail
269211976Spjd	movl	(%ecx),%edi
270211976Spjd	movl	8(%esp),%esi
271211976Spjd	movl	%ecx,12(%esp)
272211976Spjd	xorl	%ecx,%ecx
273211976Spjd	xorl	%edx,%edx
274211976Spjd	movl	(%esi),%eax
275211976Spjd	jmp	.L0091stmadd
276211976Spjd.align	16
277211976Spjd.L006bn_sqr_mont:
278211976Spjd	movl	%ebx,(%esp)
279211976Spjd	movl	%ecx,12(%esp)
280211976Spjd	movl	%edi,%eax
281211976Spjd	mull	%edi
282211976Spjd	movl	%eax,32(%esp)
283211976Spjd	movl	%edx,%ebx
284211976Spjd	shrl	$1,%edx
285211976Spjd	andl	$1,%ebx
286211976Spjd	incl	%ecx
287211976Spjd.align	16
288211976Spjd.L010sqr:
289211976Spjd	movl	(%esi,%ecx,4),%eax
290211976Spjd	movl	%edx,%ebp
291211976Spjd	mull	%edi
292211976Spjd	addl	%ebp,%eax
293211976Spjd	leal	1(%ecx),%ecx
294211976Spjd	adcl	$0,%edx
295211976Spjd	leal	(%ebx,%eax,2),%ebp
296213429Spjd	shrl	$31,%eax
297211885Spjd	cmpl	(%esp),%ecx
298211885Spjd	movl	%eax,%ebx
299211885Spjd	movl	%ebp,28(%esp,%ecx,4)
300211885Spjd	jl	.L010sqr
301211885Spjd	movl	(%esi,%ecx,4),%eax
302211885Spjd	movl	%edx,%ebp
303211885Spjd	mull	%edi
304211885Spjd	addl	%ebp,%eax
305211885Spjd	movl	20(%esp),%edi
306211885Spjd	adcl	$0,%edx
307211885Spjd	movl	16(%esp),%esi
308211885Spjd	leal	(%ebx,%eax,2),%ebp
309211885Spjd	imull	32(%esp),%edi
310211885Spjd	shrl	$31,%eax
311211885Spjd	movl	%ebp,32(%esp,%ecx,4)
312211885Spjd	leal	(%eax,%edx,2),%ebp
313211885Spjd	movl	(%esi),%eax
314211885Spjd	shrl	$31,%edx
315211885Spjd	movl	%ebp,36(%esp,%ecx,4)
316211885Spjd	movl	%edx,40(%esp,%ecx,4)
317211885Spjd	mull	%edi
318211885Spjd	addl	32(%esp),%eax
319211885Spjd	movl	%ecx,%ebx
320211885Spjd	adcl	$0,%edx
321211885Spjd	movl	4(%esi),%eax
322211885Spjd	movl	$1,%ecx
323211885Spjd.align	16
324211885Spjd.L0113rdmadd:
325211885Spjd	movl	%edx,%ebp
326211885Spjd	mull	%edi
327211885Spjd	addl	32(%esp,%ecx,4),%ebp
328211885Spjd	adcl	$0,%edx
329211885Spjd	addl	%eax,%ebp
330211885Spjd	movl	4(%esi,%ecx,4),%eax
331211885Spjd	adcl	$0,%edx
332211885Spjd	movl	%ebp,28(%esp,%ecx,4)
333211885Spjd	movl	%edx,%ebp
334211885Spjd	mull	%edi
335211885Spjd	addl	36(%esp,%ecx,4),%ebp
336211885Spjd	leal	2(%ecx),%ecx
337211885Spjd	adcl	$0,%edx
338211885Spjd	addl	%eax,%ebp
339211885Spjd	movl	(%esi,%ecx,4),%eax
340211885Spjd	adcl	$0,%edx
341211885Spjd	cmpl	%ebx,%ecx
342204076Spjd	movl	%ebp,24(%esp,%ecx,4)
343204076Spjd	jl	.L0113rdmadd
344204076Spjd	movl	%edx,%ebp
345204076Spjd	mull	%edi
346204076Spjd	addl	32(%esp,%ebx,4),%ebp
347211885Spjd	adcl	$0,%edx
348204076Spjd	addl	%eax,%ebp
349204076Spjd	adcl	$0,%edx
350204076Spjd	movl	%ebp,28(%esp,%ebx,4)
351211885Spjd	movl	12(%esp),%ecx
352204076Spjd	xorl	%eax,%eax
353204076Spjd	movl	8(%esp),%esi
354211885Spjd	addl	36(%esp,%ebx,4),%edx
355204076Spjd	adcl	40(%esp,%ebx,4),%eax
356204076Spjd	movl	%edx,32(%esp,%ebx,4)
357211885Spjd	cmpl	%ebx,%ecx
358204076Spjd	movl	%eax,36(%esp,%ebx,4)
359211885Spjd	je	.L005common_tail
360211885Spjd	movl	4(%esi,%ecx,4),%edi
361204076Spjd	leal	1(%ecx),%ecx
362211885Spjd	movl	%edi,%eax
363204076Spjd	movl	%ecx,12(%esp)
364204076Spjd	mull	%edi
365204076Spjd	addl	32(%esp,%ecx,4),%eax
366204076Spjd	adcl	$0,%edx
367204076Spjd	movl	%eax,32(%esp,%ecx,4)
368204076Spjd	xorl	%ebp,%ebp
369204076Spjd	cmpl	%ebx,%ecx
370204076Spjd	leal	1(%ecx),%ecx
371204076Spjd	je	.L012sqrlast
372204076Spjd	movl	%edx,%ebx
373211885Spjd	shrl	$1,%edx
374211885Spjd	andl	$1,%ebx
375211885Spjd.align	16
376211885Spjd.L013sqradd:
377204076Spjd	movl	(%esi,%ecx,4),%eax
378204076Spjd	movl	%edx,%ebp
379204076Spjd	mull	%edi
380211885Spjd	addl	%ebp,%eax
381213183Spjd	leal	(%eax,%eax,1),%ebp
382211885Spjd	adcl	$0,%edx
383204076Spjd	shrl	$31,%eax
384204076Spjd	addl	32(%esp,%ecx,4),%ebp
385204076Spjd	leal	1(%ecx),%ecx
386204076Spjd	adcl	$0,%eax
387204076Spjd	addl	%ebx,%ebp
388204076Spjd	adcl	$0,%eax
389211885Spjd	cmpl	(%esp),%ecx
390204076Spjd	movl	%ebp,28(%esp,%ecx,4)
391204076Spjd	movl	%eax,%ebx
392204076Spjd	jle	.L013sqradd
393	movl	%edx,%ebp
394	addl	%edx,%edx
395	shrl	$31,%ebp
396	addl	%ebx,%edx
397	adcl	$0,%ebp
398.L012sqrlast:
399	movl	20(%esp),%edi
400	movl	16(%esp),%esi
401	imull	32(%esp),%edi
402	addl	32(%esp,%ecx,4),%edx
403	movl	(%esi),%eax
404	adcl	$0,%ebp
405	movl	%edx,32(%esp,%ecx,4)
406	movl	%ebp,36(%esp,%ecx,4)
407	mull	%edi
408	addl	32(%esp),%eax
409	leal	-1(%ecx),%ebx
410	adcl	$0,%edx
411	movl	$1,%ecx
412	movl	4(%esi),%eax
413	jmp	.L0113rdmadd
414.align	16
415.L005common_tail:
416	movl	16(%esp),%ebp
417	movl	4(%esp),%edi
418	leal	32(%esp),%esi
419	movl	(%esi),%eax
420	movl	%ebx,%ecx
421	xorl	%edx,%edx
422.align	16
423.L014sub:
424	sbbl	(%ebp,%edx,4),%eax
425	movl	%eax,(%edi,%edx,4)
426	decl	%ecx
427	movl	4(%esi,%edx,4),%eax
428	leal	1(%edx),%edx
429	jge	.L014sub
430	sbbl	$0,%eax
431	andl	%eax,%esi
432	notl	%eax
433	movl	%edi,%ebp
434	andl	%eax,%ebp
435	orl	%ebp,%esi
436.align	16
437.L015copy:
438	movl	(%esi,%ebx,4),%eax
439	movl	%eax,(%edi,%ebx,4)
440	movl	%ecx,32(%esp,%ebx,4)
441	decl	%ebx
442	jge	.L015copy
443	movl	24(%esp),%esp
444	movl	$1,%eax
445.L000just_leave:
446	popl	%edi
447	popl	%esi
448	popl	%ebx
449	popl	%ebp
450	ret
451.size	bn_mul_mont,.-.L_bn_mul_mont_begin
452.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105
453.byte	112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56
454.byte	54,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121
455.byte	32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46
456.byte	111,114,103,62,0
457.comm	OPENSSL_ia32cap_P,8,4
458