x86-mont.S revision 298998
1	# $FreeBSD: head/secure/lib/libcrypto/i386/x86-mont.s 298998 2016-05-03 18:50:10Z jkim $
# ---------------------------------------------------------------------
# bn_mul_mont -- Montgomery multiplication for 32-bit x86 (AT&T syntax).
#
# NOTE(review): each statement in this listing is prefixed with a decimal
# number that looks like a viewer line number fused onto the text; the
# assembler will not accept the file until those prefixes are stripped.
# Confirm against the upstream source.
#
# Stack arguments as seen after the four register pushes below:
#   arg1 at 20(%esp): result vector, written in the common tail ("rp")
#   arg2 at 24(%esp): first operand vector ("ap")
#   arg3 at 28(%esp): second operand vector ("bp")
#   arg4 at 32(%esp): vector that is multiplied by the per-round factor
#                     and subtracted in the tail; presumably the modulus
#                     ("np")
#   arg5 at 36(%esp): pointer to a single word, loaded once ("n0")
#   arg6 at 40(%esp): signed word count ("num")
# The short names are presumably those of the OpenSSL C prototype;
# verify against the caller.
#
# Returns %eax = 0 without computing anything when num < 4, otherwise
# %eax = 1. %ebp/%ebx/%esi/%edi are saved and restored; %ecx, %edx and,
# on the SSE2 path, the MMX registers are clobbered.
#
# Scratch frame carved out below the caller stack (relative to the new
# %esp):
#   esp+0   num-1 (squaring path only)
#   esp+4   rp
#   esp+8   ap
#   esp+12  bp cursor (integer multiply path) / outer index (squaring)
#   esp+16  np
#   esp+20  n0 value
#   esp+24  original %esp, reloaded just before returning
#   esp+28  end-of-bp pointer (integer multiply path)
#   esp+32  T[0 .. num+1], the running accumulator
# ---------------------------------------------------------------------
2.file	"x86-mont.s"
3.text
4.globl	bn_mul_mont
5.type	bn_mul_mont,@function
6.align	16
7bn_mul_mont:
8.L_bn_mul_mont_begin:
9	pushl	%ebp
10	pushl	%ebx
11	pushl	%esi
12	pushl	%edi
# %eax = 0 doubles as the return value of the early exit (num < 4).
13	xorl	%eax,%eax
14	movl	40(%esp),%edi
15	cmpl	$4,%edi
16	jl	.L000just_leave
# %esi -> argument block, %edx = address of the arg2 slot (used only as
# an alignment reference), %ebp = %esp before the frame is carved out.
# Reserve 32 bytes of bookkeeping plus (num+2) words for T[]; the double
# negation leaves %edi = num+2.
17	leal	20(%esp),%esi
18	leal	24(%esp),%edx
19	movl	%esp,%ebp
20	addl	$2,%edi
21	negl	%edi
22	leal	-32(%esp,%edi,4),%esp
23	negl	%edi
# Slide %esp down until it equals the reference address modulo 2048,
# then a further 2048 bytes when bit 11 of both addresses agrees, and
# finally round down to a 64-byte boundary. Presumably this keeps the
# scratch frame from aliasing the caller data in the cache -- confirm.
24	movl	%esp,%eax
25	subl	%edx,%eax
26	andl	$2047,%eax
27	subl	%eax,%esp
28	xorl	%esp,%edx
29	andl	$2048,%edx
30	xorl	$2048,%edx
31	subl	%edx,%esp
32	andl	$-64,%esp
# Page walk: %eax = distance from the new %esp up to the old one, rounded
# down to a multiple of 4096. Read one word per 4096-byte step, from the
# highest offset down to offset 0, stopping when the subtraction borrows.
# Presumably this touches every page of the new frame in order so that
# stack guard pages are hit one at a time -- confirm.
33	movl	%ebp,%eax
34	subl	%esp,%eax
35	andl	$-4096,%eax
36.L001page_walk:
37	movl	(%esp,%eax,1),%edx
38	subl	$4096,%eax
# Raw 0x2E byte placed directly in front of the jnc: a CS segment
# override prefix, which some processors treat as a branch hint.
39.byte	46
40	jnc	.L001page_walk
# Copy the arguments into the frame; arg5 is dereferenced so that the
# frame holds the n0 value itself.
41	movl	(%esi),%eax
42	movl	4(%esi),%ebx
43	movl	8(%esi),%ecx
44	movl	12(%esi),%edx
45	movl	16(%esi),%esi
46	movl	(%esi),%esi
47	movl	%eax,4(%esp)
48	movl	%ebx,8(%esp)
49	movl	%ecx,12(%esp)
50	movl	%edx,16(%esp)
51	movl	%esi,20(%esp)
# %edi still holds num+2, so %ebx = num-1 (index of the last word).
52	leal	-3(%edi),%ebx
53	movl	%ebp,24(%esp)
# Bit 26 of the first word of OPENSSL_ia32cap_P selects the code path;
# when it is clear the integer-only code at .L002non_sse2 is used.
54	leal	OPENSSL_ia32cap_P,%eax
55	btl	$26,(%eax)
56	jnc	.L002non_sse2
# ---------------- SSE2 path: 64-bit sums in MMX registers -------------
# %mm7 = 0x00000000ffffffff, the mask that isolates a low word.
# %esi = ap, %edi = bp, %ebp = np, %edx = outer index i, %ecx = inner j.
57	movl	$-1,%eax
58	movd	%eax,%mm7
59	movl	8(%esp),%esi
60	movl	12(%esp),%edi
61	movl	16(%esp),%ebp
62	xorl	%edx,%edx
63	xorl	%ecx,%ecx
# Round 0 head: %mm4 = bp[0]; %mm5 = ap[0]*bp[0], then %mm5 becomes the
# reduction factor m (low word of that product times n0; pmuludq only
# reads the low 32 bits of each operand). %mm2 and %mm3 hold the running
# ap*b and np*m sums, shifted right by one word after each step.
64	movd	(%edi),%mm4
65	movd	(%esi),%mm5
66	movd	(%ebp),%mm3
67	pmuludq	%mm4,%mm5
68	movq	%mm5,%mm2
69	movq	%mm5,%mm0
70	pand	%mm7,%mm0
71	pmuludq	20(%esp),%mm5
72	pmuludq	%mm5,%mm3
73	paddq	%mm0,%mm3
74	movd	4(%ebp),%mm1
75	movd	4(%esi),%mm0
76	psrlq	$32,%mm2
77	psrlq	$32,%mm3
78	incl	%ecx
79.align	16
# Round 0 body, j = 1 .. num-2: add ap[j]*bp[0] and np[j]*m to the two
# running sums, store the low word of the reduced sum to T[j-1], and
# preload ap[j+1] / np[j+1] for the next step.
80.L0031st:
81	pmuludq	%mm4,%mm0
82	pmuludq	%mm5,%mm1
83	paddq	%mm0,%mm2
84	paddq	%mm1,%mm3
85	movq	%mm2,%mm0
86	pand	%mm7,%mm0
87	movd	4(%ebp,%ecx,4),%mm1
88	paddq	%mm0,%mm3
89	movd	4(%esi,%ecx,4),%mm0
90	psrlq	$32,%mm2
91	movd	%mm3,28(%esp,%ecx,4)
92	psrlq	$32,%mm3
93	leal	1(%ecx),%ecx
94	cmpl	%ebx,%ecx
95	jl	.L0031st
# Round 0 tail (j = num-1): store T[num-2], then write the sum of the two
# remaining carries as one 64-bit value covering T[num-1] and T[num].
96	pmuludq	%mm4,%mm0
97	pmuludq	%mm5,%mm1
98	paddq	%mm0,%mm2
99	paddq	%mm1,%mm3
100	movq	%mm2,%mm0
101	pand	%mm7,%mm0
102	paddq	%mm0,%mm3
103	movd	%mm3,28(%esp,%ecx,4)
104	psrlq	$32,%mm2
105	psrlq	$32,%mm3
106	paddq	%mm2,%mm3
107	movq	%mm3,32(%esp,%ebx,4)
108	incl	%edx
# Rounds i = 1 .. num-1: same scheme with bp[i], but the previous T[]
# contents (read through %mm6) are added into the ap*b running sum.
109.L004outer:
110	xorl	%ecx,%ecx
111	movd	(%edi,%edx,4),%mm4
112	movd	(%esi),%mm5
113	movd	32(%esp),%mm6
114	movd	(%ebp),%mm3
115	pmuludq	%mm4,%mm5
116	paddq	%mm6,%mm5
117	movq	%mm5,%mm0
118	movq	%mm5,%mm2
119	pand	%mm7,%mm0
120	pmuludq	20(%esp),%mm5
121	pmuludq	%mm5,%mm3
122	paddq	%mm0,%mm3
123	movd	36(%esp),%mm6
124	movd	4(%ebp),%mm1
125	movd	4(%esi),%mm0
126	psrlq	$32,%mm2
127	psrlq	$32,%mm3
128	paddq	%mm6,%mm2
129	incl	%ecx
# %ebx is borrowed as a down-counter (num-2 iterations) for the inner
# loop; the leal ahead of the jnz leaves the decl flags intact.
130	decl	%ebx
131.L005inner:
132	pmuludq	%mm4,%mm0
133	pmuludq	%mm5,%mm1
134	paddq	%mm0,%mm2
135	paddq	%mm1,%mm3
136	movq	%mm2,%mm0
137	movd	36(%esp,%ecx,4),%mm6
138	pand	%mm7,%mm0
139	movd	4(%ebp,%ecx,4),%mm1
140	paddq	%mm0,%mm3
141	movd	4(%esi,%ecx,4),%mm0
142	psrlq	$32,%mm2
143	movd	%mm3,28(%esp,%ecx,4)
144	psrlq	$32,%mm3
145	paddq	%mm6,%mm2
146	decl	%ebx
147	leal	1(%ecx),%ecx
148	jnz	.L005inner
# %ecx is now num-1; copy it back so %ebx is the last-word index again.
149	movl	%ecx,%ebx
150	pmuludq	%mm4,%mm0
151	pmuludq	%mm5,%mm1
152	paddq	%mm0,%mm2
153	paddq	%mm1,%mm3
154	movq	%mm2,%mm0
155	pand	%mm7,%mm0
156	paddq	%mm0,%mm3
157	movd	%mm3,28(%esp,%ecx,4)
158	psrlq	$32,%mm2
159	psrlq	$32,%mm3
# Fold the old top word T[num] into the carries and store the new
# T[num-1] / T[num] pair as one 64-bit value.
160	movd	36(%esp,%ebx,4),%mm6
161	paddq	%mm2,%mm3
162	paddq	%mm6,%mm3
163	movq	%mm3,32(%esp,%ebx,4)
164	leal	1(%edx),%edx
165	cmpl	%ebx,%edx
166	jle	.L004outer
# Leave MMX state before returning to integer/x87 code.
167	emms
168	jmp	.L006common_tail
169.align	16
# ---------------- integer-only path -----------------------------------
# %esi = ap, %edi = bp, %ecx = 0. %ebp = (num & 1) | (ap - bp) is zero
# only when ap == bp and num is even; that case takes the dedicated
# squaring code. %eax = bp + 4*num, one past the last bp word. The orl
# sets ZF and the movl that follows (%edi = bp[0]) leaves flags alone.
170.L002non_sse2:
171	movl	8(%esp),%esi
172	leal	1(%ebx),%ebp
173	movl	12(%esp),%edi
174	xorl	%ecx,%ecx
175	movl	%esi,%edx
176	andl	$1,%ebp
177	subl	%edi,%edx
178	leal	4(%edi,%ebx,4),%eax
179	orl	%edx,%ebp
180	movl	(%edi),%edi
181	jz	.L007bn_sqr_mont
# General multiply. Keep the end-of-bp pointer at 28(%esp); %eax = ap[0],
# %edx = carry = 0.
182	movl	%eax,28(%esp)
183	movl	(%esi),%eax
184	xorl	%edx,%edx
185.align	16
# First pass, j = 0 .. num-2: T[j] = low word of ap[j]*bp[0] + carry,
# with the high word carried forward in %edx.
186.L008mull:
187	movl	%edx,%ebp
188	mull	%edi
189	addl	%eax,%ebp
190	leal	1(%ecx),%ecx
191	adcl	$0,%edx
192	movl	(%esi,%ecx,4),%eax
193	cmpl	%ebx,%ecx
194	movl	%ebp,28(%esp,%ecx,4)
195	jl	.L008mull
# Last word of the first pass: T[num-1] = low, T[num] = high,
# T[num+1] = 0. Then switch to the reduction: %esi = np,
# %edi = m = n0 * T[0]. Of np[0]*m + T[0] only the carry (in %edx) is
# kept, since %eax is reloaded with np[1] right after the addl.
196	movl	%edx,%ebp
197	mull	%edi
198	movl	20(%esp),%edi
199	addl	%ebp,%eax
200	movl	16(%esp),%esi
201	adcl	$0,%edx
202	imull	32(%esp),%edi
203	movl	%eax,32(%esp,%ebx,4)
204	xorl	%ecx,%ecx
205	movl	%edx,36(%esp,%ebx,4)
206	movl	%ecx,40(%esp,%ebx,4)
207	movl	(%esi),%eax
208	mull	%edi
209	addl	32(%esp),%eax
210	movl	4(%esi),%eax
211	adcl	$0,%edx
212	incl	%ecx
213	jmp	.L0092ndmadd
214.align	16
# Multiply-accumulate for rounds after the first, j = 0 .. num-2:
# T[j] = low word of T[j] + ap[j]*b + carry, b being the current bp word
# held in %edi.
215.L0101stmadd:
216	movl	%edx,%ebp
217	mull	%edi
218	addl	32(%esp,%ecx,4),%ebp
219	leal	1(%ecx),%ecx
220	adcl	$0,%edx
221	addl	%eax,%ebp
222	movl	(%esi,%ecx,4),%eax
223	adcl	$0,%edx
224	cmpl	%ebx,%ecx
225	movl	%ebp,28(%esp,%ecx,4)
226	jl	.L0101stmadd
# Last word of the multiply-accumulate: update T[num-1], add the carry
# into T[num] and record the carry out of that in T[num+1]. Then set up
# the reduction exactly as after the first pass (%esi = np,
# %edi = n0 * T[0], carry of np[0]*m + T[0] in %edx, %eax = np[1]).
227	movl	%edx,%ebp
228	mull	%edi
229	addl	32(%esp,%ebx,4),%eax
230	movl	20(%esp),%edi
231	adcl	$0,%edx
232	movl	16(%esp),%esi
233	addl	%eax,%ebp
234	adcl	$0,%edx
235	imull	32(%esp),%edi
236	xorl	%ecx,%ecx
237	addl	36(%esp,%ebx,4),%edx
238	movl	%ebp,32(%esp,%ebx,4)
239	adcl	$0,%ecx
240	movl	(%esi),%eax
241	movl	%edx,36(%esp,%ebx,4)
242	movl	%ecx,40(%esp,%ebx,4)
243	mull	%edi
244	addl	32(%esp),%eax
245	movl	4(%esi),%eax
246	adcl	$0,%edx
247	movl	$1,%ecx
248.align	16
# Reduction, j = 1 .. num-2: T[j-1] = low word of T[j] + np[j]*m + carry,
# which also shifts the accumulator down by one word.
249.L0092ndmadd:
250	movl	%edx,%ebp
251	mull	%edi
252	addl	32(%esp,%ecx,4),%ebp
253	leal	1(%ecx),%ecx
254	adcl	$0,%edx
255	addl	%eax,%ebp
256	movl	(%esi,%ecx,4),%eax
257	adcl	$0,%edx
258	cmpl	%ebx,%ecx
259	movl	%ebp,24(%esp,%ecx,4)
260	jl	.L0092ndmadd
# Reduction tail (j = num-1): store T[num-2], then fold the carry into
# the two top words (T[num-1] = carry + T[num], T[num] = T[num+1] +
# carry out). Advance the bp cursor held at 12(%esp); the loop ends
# when it reaches the end pointer saved at 28(%esp).
261	movl	%edx,%ebp
262	mull	%edi
263	addl	32(%esp,%ebx,4),%ebp
264	adcl	$0,%edx
265	addl	%eax,%ebp
266	adcl	$0,%edx
267	movl	%ebp,28(%esp,%ebx,4)
268	xorl	%eax,%eax
269	movl	12(%esp),%ecx
270	addl	36(%esp,%ebx,4),%edx
271	adcl	40(%esp,%ebx,4),%eax
272	leal	4(%ecx),%ecx
273	movl	%edx,32(%esp,%ebx,4)
274	cmpl	28(%esp),%ecx
275	movl	%eax,36(%esp,%ebx,4)
276	je	.L006common_tail
# Next round: %edi = next bp word, %esi = ap, j = 0, carry = 0.
277	movl	(%ecx),%edi
278	movl	8(%esp),%esi
279	movl	%ecx,12(%esp)
280	xorl	%ecx,%ecx
281	xorl	%edx,%edx
282	movl	(%esi),%eax
283	jmp	.L0101stmadd
284.align	16
# ---------------- squaring path (ap == bp, num even) -----------------
# Save num-1 at (%esp) and reuse 12(%esp) as the outer index i (0 here,
# since %ecx is still zero). %edi = ap[0]; T[0] = low word of ap[0]^2.
# Every later word is stored as twice the cross-product sum: the high
# word of the square is halved to serve as the first carry in %edx, and
# %ebx carries the single bit shifted out of the previous word.
285.L007bn_sqr_mont:
286	movl	%ebx,(%esp)
287	movl	%ecx,12(%esp)
288	movl	%edi,%eax
289	mull	%edi
290	movl	%eax,32(%esp)
291	movl	%edx,%ebx
292	shrl	$1,%edx
293	andl	$1,%ebx
294	incl	%ecx
295.align	16
# j = 1 .. num-2: T[j] = 2*(ap[j]*ap[0] + carry) + bit from previous word.
296.L011sqr:
297	movl	(%esi,%ecx,4),%eax
298	movl	%edx,%ebp
299	mull	%edi
300	addl	%ebp,%eax
301	leal	1(%ecx),%ecx
302	adcl	$0,%edx
303	leal	(%ebx,%eax,2),%ebp
304	shrl	$31,%eax
305	cmpl	(%esp),%ecx
306	movl	%eax,%ebx
307	movl	%ebp,28(%esp,%ecx,4)
308	jl	.L011sqr
# Last cross product (j = num-1): store the doubled result into
# T[num-1], T[num] and T[num+1]. Then start the reduction: %esi = np,
# %edi = m = n0 * T[0], carry of np[0]*m + T[0] in %edx, %eax = np[1],
# %ebx = num-1, %ecx = 1.
309	movl	(%esi,%ecx,4),%eax
310	movl	%edx,%ebp
311	mull	%edi
312	addl	%ebp,%eax
313	movl	20(%esp),%edi
314	adcl	$0,%edx
315	movl	16(%esp),%esi
316	leal	(%ebx,%eax,2),%ebp
317	imull	32(%esp),%edi
318	shrl	$31,%eax
319	movl	%ebp,32(%esp,%ecx,4)
320	leal	(%eax,%edx,2),%ebp
321	movl	(%esi),%eax
322	shrl	$31,%edx
323	movl	%ebp,36(%esp,%ecx,4)
324	movl	%edx,40(%esp,%ecx,4)
325	mull	%edi
326	addl	32(%esp),%eax
327	movl	%ecx,%ebx
328	adcl	$0,%edx
329	movl	4(%esi),%eax
330	movl	$1,%ecx
331.align	16
# Reduction loop for the squaring path, two words per iteration:
# T[j-1] = low word of T[j] + np[j]*m + carry, for j and then j+1.
332.L0123rdmadd:
333	movl	%edx,%ebp
334	mull	%edi
335	addl	32(%esp,%ecx,4),%ebp
336	adcl	$0,%edx
337	addl	%eax,%ebp
338	movl	4(%esi,%ecx,4),%eax
339	adcl	$0,%edx
340	movl	%ebp,28(%esp,%ecx,4)
341	movl	%edx,%ebp
342	mull	%edi
343	addl	36(%esp,%ecx,4),%ebp
344	leal	2(%ecx),%ecx
345	adcl	$0,%edx
346	addl	%eax,%ebp
347	movl	(%esi,%ecx,4),%eax
348	adcl	$0,%edx
349	cmpl	%ebx,%ecx
350	movl	%ebp,24(%esp,%ecx,4)
351	jl	.L0123rdmadd
# Reduction tail: store T[num-2], fold the carry into the two top words,
# and finish once the outer index i (12(%esp)) has reached num-1.
352	movl	%edx,%ebp
353	mull	%edi
354	addl	32(%esp,%ebx,4),%ebp
355	adcl	$0,%edx
356	addl	%eax,%ebp
357	adcl	$0,%edx
358	movl	%ebp,28(%esp,%ebx,4)
359	movl	12(%esp),%ecx
360	xorl	%eax,%eax
361	movl	8(%esp),%esi
362	addl	36(%esp,%ebx,4),%edx
363	adcl	40(%esp,%ebx,4),%eax
364	movl	%edx,32(%esp,%ebx,4)
365	cmpl	%ebx,%ecx
366	movl	%eax,36(%esp,%ebx,4)
367	je	.L006common_tail
# Next squaring round: i += 1, %edi = ap[i], T[i] += low word of ap[i]^2.
# When i == num-1 no cross products remain, so skip to .L013sqrlast
# (with %ebp = 0 and the high word of the square still in %edx).
# Otherwise split the high word into a halved carry (%edx) and a spare
# bit (%ebx), as in the first round.
368	movl	4(%esi,%ecx,4),%edi
369	leal	1(%ecx),%ecx
370	movl	%edi,%eax
371	movl	%ecx,12(%esp)
372	mull	%edi
373	addl	32(%esp,%ecx,4),%eax
374	adcl	$0,%edx
375	movl	%eax,32(%esp,%ecx,4)
376	xorl	%ebp,%ebp
377	cmpl	%ebx,%ecx
378	leal	1(%ecx),%ecx
379	je	.L013sqrlast
380	movl	%edx,%ebx
381	shrl	$1,%edx
382	andl	$1,%ebx
383.align	16
# j = i+1 .. num-1: T[j] += 2*(ap[j]*ap[i] + carry) + bit from previous
# word. The leal that doubles %eax does not disturb the carry consumed
# by the adcl right after it; %ebx collects the bits to pass on.
384.L014sqradd:
385	movl	(%esi,%ecx,4),%eax
386	movl	%edx,%ebp
387	mull	%edi
388	addl	%ebp,%eax
389	leal	(%eax,%eax,1),%ebp
390	adcl	$0,%edx
391	shrl	$31,%eax
392	addl	32(%esp,%ecx,4),%ebp
393	leal	1(%ecx),%ecx
394	adcl	$0,%eax
395	addl	%ebx,%ebp
396	adcl	$0,%eax
397	cmpl	(%esp),%ecx
398	movl	%ebp,28(%esp,%ecx,4)
399	movl	%eax,%ebx
400	jle	.L014sqradd
# Double the final carry: %edx = 2*carry + pending bits, %ebp = overflow.
401	movl	%edx,%ebp
402	addl	%edx,%edx
403	shrl	$31,%ebp
404	addl	%ebx,%edx
405	adcl	$0,%ebp
# Add %ebp:%edx into the two top words (%ecx == num here), then set up
# the next reduction and re-enter the shared loop with %ebx = num-1.
406.L013sqrlast:
407	movl	20(%esp),%edi
408	movl	16(%esp),%esi
409	imull	32(%esp),%edi
410	addl	32(%esp,%ecx,4),%edx
411	movl	(%esi),%eax
412	adcl	$0,%ebp
413	movl	%edx,32(%esp,%ecx,4)
414	movl	%ebp,36(%esp,%ecx,4)
415	mull	%edi
416	addl	32(%esp),%eax
417	leal	-1(%ecx),%ebx
418	adcl	$0,%edx
419	movl	$1,%ecx
420	movl	4(%esi),%eax
421	jmp	.L0123rdmadd
422.align	16
# ---------------- common tail: final subtraction and copy-out ---------
# %ebp = np, %edi = rp, %esi = T, %ecx = num-1 (countdown), %edx = j.
# The xorl also clears CF ahead of the first sbbl.
423.L006common_tail:
424	movl	16(%esp),%ebp
425	movl	4(%esp),%edi
426	leal	32(%esp),%esi
427	movl	(%esi),%eax
428	movl	%ebx,%ecx
429	xorl	%edx,%edx
430.align	16
# rp[j] = T[j] - np[j] - borrow for j = 0 .. num-1. decl, movl and leal
# all leave CF untouched, so the borrow chains through the loop; the jge
# tests the sign produced by decl.
431.L015sub:
432	sbbl	(%ebp,%edx,4),%eax
433	movl	%eax,(%edi,%edx,4)
434	decl	%ecx
435	movl	4(%esi,%edx,4),%eax
436	leal	1(%edx),%edx
437	jge	.L015sub
# %eax = T[num] - borrow, used as a mask. Branch-free select:
# %esi = (T & mask) | (rp & ~mask), i.e. T when the mask is all ones
# (the subtraction went negative) and rp (the difference just stored)
# when it is zero.
438	sbbl	$0,%eax
439	andl	%eax,%esi
440	notl	%eax
441	movl	%edi,%ebp
442	andl	%eax,%ebp
443	orl	%ebp,%esi
444.align	16
# Copy the selected vector into rp from the top word down and overwrite
# each T[] word with %ecx (left at -1 by the loop above), presumably so
# that no intermediate values remain on the stack.
445.L016copy:
446	movl	(%esi,%ebx,4),%eax
447	movl	%eax,(%edi,%ebx,4)
448	movl	%ecx,32(%esp,%ebx,4)
449	decl	%ebx
450	jge	.L016copy
# Drop the scratch frame by reloading the saved %esp; return 1.
451	movl	24(%esp),%esp
452	movl	$1,%eax
# Shared epilogue; the early exit arrives here with %eax = 0.
453.L000just_leave:
454	popl	%edi
455	popl	%esi
456	popl	%ebx
457	popl	%ebp
458	ret
459.size	bn_mul_mont,.-.L_bn_mul_mont_begin
# NUL-terminated ASCII identification string placed after the code:
# "Montgomery Multiplication for x86, CRYPTOGAMS by <appro@openssl.org>"
460.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105
461.byte	112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56
462.byte	54,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121
463.byte	32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46
464.byte	111,114,103,62,0
# Common symbol, 16 bytes with 4-byte alignment. bn_mul_mont tests bit 26
# of its first word to choose between the SSE2 and integer-only paths;
# the contents are presumably filled in elsewhere at start-up -- confirm.
465.comm	OPENSSL_ia32cap_P,16,4
466