1	# $FreeBSD: releng/10.3/secure/lib/libcrypto/i386/x86-mont.s 238405 2012-07-12 19:30:53Z jkim $
2.file	"x86-mont.s"
3.text
#
# bn_mul_mont -- Montgomery multiplication for 32-bit x86 (AT&T syntax); the
# identification string stored after the function names the algorithm.
#
# Interface as the code uses it: six 32-bit arguments on the stack; %ebp,
# %ebx, %esi and %edi are pushed on entry and popped on exit.
#   arg0  destination vector; receives num words in the common tail
#   arg1  first multiplicand vector, called a[] in the comments below
#   arg2  second multiplicand vector, called b[] below
#   arg3  modulus vector, called n[] below; it is multiplied in during every
#         reduction round and subtracted once in the common tail
#   arg4  pointer to one word, called n0 below; each round's reduction
#         multiplier is m = t[0] * n0 (low 32 bits)
#   arg5  num, the number of 32-bit words in each vector (compared signed)
# NOTE(review): presumably this is OpenSSL's
#   int bn_mul_mont(rp, ap, bp, np, n0, num) -- confirm against the C header.
#
# Return value in %eax: 0 when num < 4 (no memory other than the register
# save area is written in that case), 1 after the result has been stored.
#
# Scratch frame carved out below the caller's stack (offsets from new %esp):
#    esp+0    num-1                 (written by the squaring path only)
#    esp+4    arg0, destination pointer
#    esp+8    arg1, pointer to a[]
#    esp+12   arg2, pointer to b[]; the generic integer path advances it one
#             word per round, the squaring path reuses the slot as its
#             outer index
#    esp+16   arg3, pointer to n[]
#    esp+20   n0, the word loaded through arg4
#    esp+24   %esp as it was right after the four pushes (restored on exit)
#    esp+28   &b[num], end-of-b sentinel (generic integer path only)
#    esp+32   t[0 .. num+1], the temporary accumulator vector
#
# Three compute paths feed one shared tail:
#   * SSE2 path (pmuludq on MMX registers) when bit 26 of the first word of
#     OPENSSL_ia32cap_P is set;
#   * otherwise a dedicated squaring path when a == b and num is even;
#   * otherwise the generic integer mull path.
# Throughout, increments placed between an add/adc (or sub/sbb) pair are done
# with leal, incl or decl, none of which modify the carry flag.
#
4.globl	bn_mul_mont
5.type	bn_mul_mont,@function
6.align	16
7bn_mul_mont:
8.L_bn_mul_mont_begin:
	# Save the four callee-saved registers; the six stack arguments are then
	# found at 20(%esp) .. 40(%esp).
9	pushl	%ebp
10	pushl	%ebx
11	pushl	%esi
12	pushl	%edi
	# %eax = 0 is the return value of the early exit taken when num < 4.
13	xorl	%eax,%eax
14	movl	40(%esp),%edi
15	cmpl	$4,%edi
16	jl	.L000just_leave
	# %esi = &arg0, %edx = &arg1, %ebp = current %esp (kept for the epilogue).
17	leal	20(%esp),%esi
18	leal	24(%esp),%edx
19	movl	%esp,%ebp
	# Reserve 32 + 4*(num+2) bytes; %edi holds num+2 afterwards.
20	addl	$2,%edi
21	negl	%edi
22	leal	-32(%esp,%edi,4),%esp
23	negl	%edi
	# Frame placement: move %esp down until its low 11 bits equal those of
	# &arg1, move it down by a further 2048 bytes if bit 11 also equals that
	# of &arg1, then round down to a 64-byte boundary.
	# NOTE(review): presumably chosen to limit cache aliasing between the
	# frame and the argument area -- confirm against upstream x86-mont.pl.
24	movl	%esp,%eax
25	subl	%edx,%eax
26	andl	$2047,%eax
27	subl	%eax,%esp
28	xorl	%esp,%edx
29	andl	$2048,%edx
30	xorl	$2048,%edx
31	subl	%edx,%esp
32	andl	$-64,%esp
	# Copy the arguments into the frame: arg0..arg3 as pointers, arg4
	# dereferenced once so that esp+20 holds the word n0 itself.
33	movl	(%esi),%eax
34	movl	4(%esi),%ebx
35	movl	8(%esi),%ecx
36	movl	12(%esi),%edx
37	movl	16(%esi),%esi
38	movl	(%esi),%esi
39	movl	%eax,4(%esp)
40	movl	%ebx,8(%esp)
41	movl	%ecx,12(%esp)
42	movl	%edx,16(%esp)
43	movl	%esi,20(%esp)
	# %ebx = (num+2) - 3 = num-1, the index of the last word.
44	leal	-3(%edi),%ebx
45	movl	%ebp,24(%esp)
	# Capability dispatch: bit 26 clear -> integer-only code.
46	leal	OPENSSL_ia32cap_P,%eax
47	btl	$26,(%eax)
48	jnc	.L001non_sse2
#
# ---- SSE2 path ----------------------------------------------------------
# %mm7 = 0x00000000ffffffff (movd zero-extends), used to isolate low words.
# %esi = a, %edi = b, %ebp = n, %edx = outer index i, %ecx = inner index j.
# %mm2 accumulates the a[j]*b[i] chain, %mm3 the n[j]*m chain.
#
49	movl	$-1,%eax
50	movd	%eax,%mm7
51	movl	8(%esp),%esi
52	movl	12(%esp),%edi
53	movl	16(%esp),%ebp
54	xorl	%edx,%edx
55	xorl	%ecx,%ecx
	# First round (i = 0), word 0: %mm5 = a[0]*b[0]; m is the low word of
	# low32(a[0]*b[0]) * n0 (pmuludq multiplies the low dwords only);
	# %mm3 = n[0]*m + low32(a[0]*b[0]), whose low word is dropped by the
	# shift. a[1] and n[1] are preloaded for the loop.
56	movd	(%edi),%mm4
57	movd	(%esi),%mm5
58	movd	(%ebp),%mm3
59	pmuludq	%mm4,%mm5
60	movq	%mm5,%mm2
61	movq	%mm5,%mm0
62	pand	%mm7,%mm0
63	pmuludq	20(%esp),%mm5
64	pmuludq	%mm5,%mm3
65	paddq	%mm0,%mm3
66	movd	4(%ebp),%mm1
67	movd	4(%esi),%mm0
68	psrlq	$32,%mm2
69	psrlq	$32,%mm3
70	incl	%ecx
71.align	16
	# First-round inner loop, j = 1 .. num-2: add a[j]*b[0] and n[j]*m to the
	# running carries, store the low word as t[j-1] (28(%esp,%ecx,4)), and
	# preload a[j+1] / n[j+1].
72.L0021st:
73	pmuludq	%mm4,%mm0
74	pmuludq	%mm5,%mm1
75	paddq	%mm0,%mm2
76	paddq	%mm1,%mm3
77	movq	%mm2,%mm0
78	pand	%mm7,%mm0
79	movd	4(%ebp,%ecx,4),%mm1
80	paddq	%mm0,%mm3
81	movd	4(%esi,%ecx,4),%mm0
82	psrlq	$32,%mm2
83	movd	%mm3,28(%esp,%ecx,4)
84	psrlq	$32,%mm3
85	leal	1(%ecx),%ecx
86	cmpl	%ebx,%ecx
87	jl	.L0021st
	# Last word of the first round (j = num-1): store t[num-2], then write
	# the 64-bit sum of both carries over t[num-1] and t[num].
88	pmuludq	%mm4,%mm0
89	pmuludq	%mm5,%mm1
90	paddq	%mm0,%mm2
91	paddq	%mm1,%mm3
92	movq	%mm2,%mm0
93	pand	%mm7,%mm0
94	paddq	%mm0,%mm3
95	movd	%mm3,28(%esp,%ecx,4)
96	psrlq	$32,%mm2
97	psrlq	$32,%mm3
98	paddq	%mm2,%mm3
99	movq	%mm3,32(%esp,%ebx,4)
100	incl	%edx
	# Remaining rounds, i = 1 .. num-1. Same shape as the first round, but
	# the previous t[] words (loaded through %mm6) are added in as well.
101.L003outer:
102	xorl	%ecx,%ecx
103	movd	(%edi,%edx,4),%mm4
104	movd	(%esi),%mm5
105	movd	32(%esp),%mm6
106	movd	(%ebp),%mm3
107	pmuludq	%mm4,%mm5
108	paddq	%mm6,%mm5
109	movq	%mm5,%mm0
110	movq	%mm5,%mm2
111	pand	%mm7,%mm0
112	pmuludq	20(%esp),%mm5
113	pmuludq	%mm5,%mm3
114	paddq	%mm0,%mm3
115	movd	36(%esp),%mm6
116	movd	4(%ebp),%mm1
117	movd	4(%esi),%mm0
118	psrlq	$32,%mm2
119	psrlq	$32,%mm3
120	paddq	%mm6,%mm2
121	incl	%ecx
	# %ebx becomes a down-counter (num-2) for the inner loop; it is restored
	# from %ecx (= num-1) once the loop has finished.
122	decl	%ebx
123.L004inner:
124	pmuludq	%mm4,%mm0
125	pmuludq	%mm5,%mm1
126	paddq	%mm0,%mm2
127	paddq	%mm1,%mm3
128	movq	%mm2,%mm0
129	movd	36(%esp,%ecx,4),%mm6
130	pand	%mm7,%mm0
131	movd	4(%ebp,%ecx,4),%mm1
132	paddq	%mm0,%mm3
133	movd	4(%esi,%ecx,4),%mm0
134	psrlq	$32,%mm2
135	movd	%mm3,28(%esp,%ecx,4)
136	psrlq	$32,%mm3
137	paddq	%mm6,%mm2
	# jnz below tests the result of this decl; the leal in between leaves
	# the flags untouched.
138	decl	%ebx
139	leal	1(%ecx),%ecx
140	jnz	.L004inner
141	movl	%ecx,%ebx
	# Last word of this round: store t[num-2], add the old t[num] to the
	# combined carries and write the 64-bit sum over t[num-1] and t[num].
142	pmuludq	%mm4,%mm0
143	pmuludq	%mm5,%mm1
144	paddq	%mm0,%mm2
145	paddq	%mm1,%mm3
146	movq	%mm2,%mm0
147	pand	%mm7,%mm0
148	paddq	%mm0,%mm3
149	movd	%mm3,28(%esp,%ecx,4)
150	psrlq	$32,%mm2
151	psrlq	$32,%mm3
152	movd	36(%esp,%ebx,4),%mm6
153	paddq	%mm2,%mm3
154	paddq	%mm6,%mm3
155	movq	%mm3,32(%esp,%ebx,4)
156	leal	1(%edx),%edx
157	cmpl	%ebx,%edx
158	jle	.L003outer
	# Leave MMX state before returning to integer code.
159	emms
160	jmp	.L005common_tail
161.align	16
#
# ---- Integer paths ------------------------------------------------------
# Decide between squaring and generic multiplication. %ebp = (num & 1) |
# (a - b) is zero only when num is even and a and b are the same pointer;
# the jz consumes the flags of the orl (the movl before it changes none).
# Also computed here: %eax = &b[num] and %edi = b[0].
#
162.L001non_sse2:
163	movl	8(%esp),%esi
164	leal	1(%ebx),%ebp
165	movl	12(%esp),%edi
166	xorl	%ecx,%ecx
167	movl	%esi,%edx
168	andl	$1,%ebp
169	subl	%edi,%edx
170	leal	4(%edi,%ebx,4),%eax
171	orl	%edx,%ebp
172	movl	(%edi),%edi
173	jz	.L006bn_sqr_mont
	# Generic path: remember the end of b[] at esp+28, start with a[0].
174	movl	%eax,28(%esp)
175	movl	(%esi),%eax
176	xorl	%edx,%edx
177.align	16
	# First round, multiply only: t[j] = low(a[j]*b[0] + carry) for
	# j = 0 .. num-2, with the carry kept in %edx and moved to %ebp.
178.L007mull:
179	movl	%edx,%ebp
180	mull	%edi
181	addl	%eax,%ebp
182	leal	1(%ecx),%ecx
183	adcl	$0,%edx
184	movl	(%esi,%ecx,4),%eax
185	cmpl	%ebx,%ecx
186	movl	%ebp,28(%esp,%ecx,4)
187	jl	.L007mull
	# Last product word: t[num-1] and t[num] get the final sum, t[num+1] is
	# cleared. In between, %edi becomes m = n0 * t[0] and %esi becomes n.
188	movl	%edx,%ebp
189	mull	%edi
190	movl	20(%esp),%edi
191	addl	%ebp,%eax
192	movl	16(%esp),%esi
193	adcl	$0,%edx
194	imull	32(%esp),%edi
195	movl	%eax,32(%esp,%ebx,4)
196	xorl	%ecx,%ecx
197	movl	%edx,36(%esp,%ebx,4)
198	movl	%ecx,40(%esp,%ebx,4)
	# Word 0 of the reduction: n[0]*m + t[0]. Only the carry is kept; the
	# sum's low word in %eax is overwritten by the load of n[1] (movl does
	# not disturb the carry consumed by adcl).
199	movl	(%esi),%eax
200	mull	%edi
201	addl	32(%esp),%eax
202	movl	4(%esi),%eax
203	adcl	$0,%edx
204	incl	%ecx
205	jmp	.L0082ndmadd
206.align	16
	# Multiply-accumulate for rounds i >= 1: t[j] += a[j]*b[i] + carry for
	# j = 0 .. num-2 (%edi = b[i], %esi = a).
207.L0091stmadd:
208	movl	%edx,%ebp
209	mull	%edi
210	addl	32(%esp,%ecx,4),%ebp
211	leal	1(%ecx),%ecx
212	adcl	$0,%edx
213	addl	%eax,%ebp
214	movl	(%esi,%ecx,4),%eax
215	adcl	$0,%edx
216	cmpl	%ebx,%ecx
217	movl	%ebp,28(%esp,%ecx,4)
218	jl	.L0091stmadd
	# Last word of the multiply-accumulate: update t[num-1], propagate the
	# carry into t[num] and t[num+1], and set up the reduction
	# (%edi = n0 * t[0], %esi = n, %eax = n[0]).
219	movl	%edx,%ebp
220	mull	%edi
221	addl	32(%esp,%ebx,4),%eax
222	movl	20(%esp),%edi
223	adcl	$0,%edx
224	movl	16(%esp),%esi
225	addl	%eax,%ebp
226	adcl	$0,%edx
227	imull	32(%esp),%edi
228	xorl	%ecx,%ecx
229	addl	36(%esp,%ebx,4),%edx
230	movl	%ebp,32(%esp,%ebx,4)
231	adcl	$0,%ecx
232	movl	(%esi),%eax
233	movl	%edx,36(%esp,%ebx,4)
234	movl	%ecx,40(%esp,%ebx,4)
	# Word 0 of the reduction, carry only (same pattern as in the first
	# round above).
235	mull	%edi
236	addl	32(%esp),%eax
237	movl	4(%esi),%eax
238	adcl	$0,%edx
239	movl	$1,%ecx
240.align	16
	# Reduction loop: add n[j]*m to t[j] for j = 1 .. num-2 and store the
	# result one word lower, as t[j-1] (24(%esp,%ecx,4) after the increment).
241.L0082ndmadd:
242	movl	%edx,%ebp
243	mull	%edi
244	addl	32(%esp,%ecx,4),%ebp
245	leal	1(%ecx),%ecx
246	adcl	$0,%edx
247	addl	%eax,%ebp
248	movl	(%esi,%ecx,4),%eax
249	adcl	$0,%edx
250	cmpl	%ebx,%ecx
251	movl	%ebp,24(%esp,%ecx,4)
252	jl	.L0082ndmadd
	# Last reduction word goes to t[num-2]; the carry plus the old t[num] and
	# t[num+1] become the new t[num-1] and t[num].
253	movl	%edx,%ebp
254	mull	%edi
255	addl	32(%esp,%ebx,4),%ebp
256	adcl	$0,%edx
257	addl	%eax,%ebp
258	adcl	$0,%edx
259	movl	%ebp,28(%esp,%ebx,4)
260	xorl	%eax,%eax
	# Advance the b pointer kept at esp+12 and stop once it reaches the
	# sentinel at esp+28. The movl/leal between addl and adcl, and between
	# cmpl and je, do not modify flags.
261	movl	12(%esp),%ecx
262	addl	36(%esp,%ebx,4),%edx
263	adcl	40(%esp,%ebx,4),%eax
264	leal	4(%ecx),%ecx
265	movl	%edx,32(%esp,%ebx,4)
266	cmpl	28(%esp),%ecx
267	movl	%eax,36(%esp,%ebx,4)
268	je	.L005common_tail
	# Next round: %edi = next word of b, %esi = a, %eax = a[0], j = 0.
269	movl	(%ecx),%edi
270	movl	8(%esp),%esi
271	movl	%ecx,12(%esp)
272	xorl	%ecx,%ecx
273	xorl	%edx,%edx
274	movl	(%esi),%eax
275	jmp	.L0091stmadd
276.align	16
#
# ---- Squaring path (a == b, num even) -----------------------------------
# esp+0 holds num-1 and esp+12 is reused as the outer index i (starting at
# 0). On entry %edi = a[0] and %esi = a. The diagonal product a[i]*a[i] is
# added once; every cross product a[j]*a[i] with j > i is doubled, with
# %ebx carrying the bit that the doubling shifts from one word to the next.
#
277.L006bn_sqr_mont:
278	movl	%ebx,(%esp)
279	movl	%ecx,12(%esp)
	# t[0] = low(a[0]^2); the high word is split into %edx = high >> 1 and
	# %ebx = high & 1 so that it can take part in the doubling below.
280	movl	%edi,%eax
281	mull	%edi
282	movl	%eax,32(%esp)
283	movl	%edx,%ebx
284	shrl	$1,%edx
285	andl	$1,%ebx
286	incl	%ecx
287.align	16
	# First squaring round, j = 1 .. num-2:
	# t[j] = %ebx + 2*(a[j]*a[0] + carry); shrl $31 extracts the bit that the
	# doubling pushes out of the word.
288.L010sqr:
289	movl	(%esi,%ecx,4),%eax
290	movl	%edx,%ebp
291	mull	%edi
292	addl	%ebp,%eax
293	leal	1(%ecx),%ecx
294	adcl	$0,%edx
295	leal	(%ebx,%eax,2),%ebp
296	shrl	$31,%eax
297	cmpl	(%esp),%ecx
298	movl	%eax,%ebx
299	movl	%ebp,28(%esp,%ecx,4)
300	jl	.L010sqr
	# Last word (j = num-1): fill t[num-1], t[num], t[num+1] with the doubled
	# top of the product, and set up the reduction (%edi = n0 * t[0],
	# %esi = n, %ebx = num-1, %eax = n[1], carry of n[0]*m + t[0] in %edx).
301	movl	(%esi,%ecx,4),%eax
302	movl	%edx,%ebp
303	mull	%edi
304	addl	%ebp,%eax
305	movl	20(%esp),%edi
306	adcl	$0,%edx
307	movl	16(%esp),%esi
308	leal	(%ebx,%eax,2),%ebp
309	imull	32(%esp),%edi
310	shrl	$31,%eax
311	movl	%ebp,32(%esp,%ecx,4)
312	leal	(%eax,%edx,2),%ebp
313	movl	(%esi),%eax
314	shrl	$31,%edx
315	movl	%ebp,36(%esp,%ecx,4)
316	movl	%edx,40(%esp,%ecx,4)
317	mull	%edi
318	addl	32(%esp),%eax
319	movl	%ecx,%ebx
320	adcl	$0,%edx
321	movl	4(%esi),%eax
322	movl	$1,%ecx
323.align	16
	# Reduction loop of the squaring path, two words per iteration: adds
	# n[j]*m and n[j+1]*m to t[j] and t[j+1] and stores them one word lower.
	# %ecx takes the values 1, 3, 5, ... and the loop ends when it reaches
	# %ebx = num-1, which is odd because this path is entered only for
	# even num.
324.L0113rdmadd:
325	movl	%edx,%ebp
326	mull	%edi
327	addl	32(%esp,%ecx,4),%ebp
328	adcl	$0,%edx
329	addl	%eax,%ebp
330	movl	4(%esi,%ecx,4),%eax
331	adcl	$0,%edx
332	movl	%ebp,28(%esp,%ecx,4)
333	movl	%edx,%ebp
334	mull	%edi
335	addl	36(%esp,%ecx,4),%ebp
336	leal	2(%ecx),%ecx
337	adcl	$0,%edx
338	addl	%eax,%ebp
339	movl	(%esi,%ecx,4),%eax
340	adcl	$0,%edx
341	cmpl	%ebx,%ecx
342	movl	%ebp,24(%esp,%ecx,4)
343	jl	.L0113rdmadd
	# Last reduction word goes to t[num-2]; fold the carry with the old
	# t[num] and t[num+1] into the new t[num-1] and t[num]. The round just
	# finished was the last one when the index i at esp+12 equals num-1.
344	movl	%edx,%ebp
345	mull	%edi
346	addl	32(%esp,%ebx,4),%ebp
347	adcl	$0,%edx
348	addl	%eax,%ebp
349	adcl	$0,%edx
350	movl	%ebp,28(%esp,%ebx,4)
351	movl	12(%esp),%ecx
352	xorl	%eax,%eax
353	movl	8(%esp),%esi
354	addl	36(%esp,%ebx,4),%edx
355	adcl	40(%esp,%ebx,4),%eax
356	movl	%edx,32(%esp,%ebx,4)
357	cmpl	%ebx,%ecx
358	movl	%eax,36(%esp,%ebx,4)
359	je	.L005common_tail
	# Next squaring round: i += 1, %edi = a[i]; add the diagonal product
	# a[i]^2 to t[i]. When the new i is num-1 there are no cross products
	# left, so go straight to .L012sqrlast with %ebp = 0 (je tests the cmpl;
	# the leal after it leaves the flags alone).
360	movl	4(%esi,%ecx,4),%edi
361	leal	1(%ecx),%ecx
362	movl	%edi,%eax
363	movl	%ecx,12(%esp)
364	mull	%edi
365	addl	32(%esp,%ecx,4),%eax
366	adcl	$0,%edx
367	movl	%eax,32(%esp,%ecx,4)
368	xorl	%ebp,%ebp
369	cmpl	%ebx,%ecx
370	leal	1(%ecx),%ecx
371	je	.L012sqrlast
	# Split the high word of the diagonal product the same way as in the
	# first round: %edx = high >> 1, %ebx = high & 1.
372	movl	%edx,%ebx
373	shrl	$1,%edx
374	andl	$1,%ebx
375.align	16
	# Cross products for j = i+1 .. num-1:
	# t[j] += 2*(a[j]*a[i] + carry) + %ebx, with the outgoing bit and carries
	# collected in %eax and handed to the next word through %ebx.
376.L013sqradd:
377	movl	(%esi,%ecx,4),%eax
378	movl	%edx,%ebp
379	mull	%edi
380	addl	%ebp,%eax
381	leal	(%eax,%eax,1),%ebp
382	adcl	$0,%edx
383	shrl	$31,%eax
384	addl	32(%esp,%ecx,4),%ebp
385	leal	1(%ecx),%ecx
386	adcl	$0,%eax
387	addl	%ebx,%ebp
388	adcl	$0,%eax
389	cmpl	(%esp),%ecx
390	movl	%ebp,28(%esp,%ecx,4)
391	movl	%eax,%ebx
392	jle	.L013sqradd
	# Double the final carry word: %edx = 2*%edx + %ebx, with the overflow
	# collected in %ebp.
393	movl	%edx,%ebp
394	addl	%edx,%edx
395	shrl	$31,%ebp
396	addl	%ebx,%edx
397	adcl	$0,%ebp
	# Here %ecx = num. Add the top carry into t[num] and t[num+1], then set
	# up the reduction for this round and re-enter the two-word loop.
398.L012sqrlast:
399	movl	20(%esp),%edi
400	movl	16(%esp),%esi
401	imull	32(%esp),%edi
402	addl	32(%esp,%ecx,4),%edx
403	movl	(%esi),%eax
404	adcl	$0,%ebp
405	movl	%edx,32(%esp,%ecx,4)
406	movl	%ebp,36(%esp,%ecx,4)
407	mull	%edi
408	addl	32(%esp),%eax
409	leal	-1(%ecx),%ebx
410	adcl	$0,%edx
411	movl	$1,%ecx
412	movl	4(%esi),%eax
413	jmp	.L0113rdmadd
414.align	16
#
# ---- Common tail --------------------------------------------------------
# All paths arrive with %ebx = num-1 and the accumulated value in
# t[0 .. num]. The destination first receives t - n; if that subtraction
# borrowed, t itself is copied over it.
#
415.L005common_tail:
416	movl	16(%esp),%ebp
417	movl	4(%esp),%edi
418	leal	32(%esp),%esi
419	movl	(%esi),%eax
420	movl	%ebx,%ecx
	# xorl also clears the carry flag, so the first sbbl starts without a
	# borrow.
421	xorl	%edx,%edx
422.align	16
	# dst[j] = t[j] - n[j] - borrow for j = 0 .. num-1. decl, movl and leal
	# leave the carry flag alone, so the borrow survives from one sbbl to the
	# next; jge tests the decl and ends the loop once %ecx reaches -1. The
	# final iteration leaves t[num] in %eax.
423.L014sub:
424	sbbl	(%ebp,%edx,4),%eax
425	movl	%eax,(%edi,%edx,4)
426	decl	%ecx
427	movl	4(%esi,%edx,4),%eax
428	leal	1(%edx),%edx
429	jge	.L014sub
	# %eax = t[num] - borrow, then used as a bit mask (the code relies on it
	# being either 0 or all-ones here): %esi = (t & mask) | (dst & ~mask),
	# i.e. t when the subtraction went negative, dst otherwise.
430	sbbl	$0,%eax
431	andl	%eax,%esi
432	notl	%eax
433	movl	%edi,%ebp
434	andl	%eax,%ebp
435	orl	%ebp,%esi
436.align	16
	# Copy num words from the selected source to dst, from index num-1 down
	# to 0, and overwrite t[0 .. num-1] with %ecx (which is -1 after the loop
	# above).
	# NOTE(review): the overwrite presumably exists to wipe the intermediate
	# value from the stack -- confirm; t[num], t[num+1] and the saved
	# argument slots are not overwritten.
437.L015copy:
438	movl	(%esi,%ebx,4),%eax
439	movl	%eax,(%edi,%ebx,4)
440	movl	%ecx,32(%esp,%ebx,4)
441	decl	%ebx
442	jge	.L015copy
	# Release the frame by reloading the %esp saved at entry; return 1.
443	movl	24(%esp),%esp
444	movl	$1,%eax
	# Shared epilogue; the num < 4 early exit arrives here with %eax = 0.
445.L000just_leave:
446	popl	%edi
447	popl	%esi
448	popl	%ebx
449	popl	%ebp
450	ret
451.size	bn_mul_mont,.-.L_bn_mul_mont_begin
# NUL-terminated ASCII identification string, emitted right after the
# function in the current section:
#   "Montgomery Multiplication for x86, CRYPTOGAMS by <appro@openssl.org>"
452.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105
453.byte	112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56
454.byte	54,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121
455.byte	32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46
456.byte	111,114,103,62,0
# Common-symbol declaration for OPENSSL_ia32cap_P: 8 bytes, 4-byte aligned.
# bn_mul_mont tests bit 26 of its first 32-bit word to choose between the
# SSE2 and the integer-only code.
457.comm	OPENSSL_ia32cap_P,8,4
458