dnl  AMD64 mpn_redc_1 -- Montgomery reduction with a one-limb modular inverse.

dnl  Copyright 2004, 2008 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C K8,K9:	 2.5
C K10:		 2.5
C P4:		 ?
C P6-15 (Core2): 5.3
C P6-28 (Atom):	 ?

C TODO
C  * Handle certain sizes, e.g., 1, 2, 3, 4, 8, with single-loop code.
C    The code for 1, 2, 3, 4 should perhaps be completely register based.
C  * Perhaps align outer loops.
C  * The sub_n at the end leaks side-channel data.  How do we fix that?
C  * Write mpn_add_n_sub_n computing R = A + B - C.  It should run at 2 c/l.
C  * We could software pipeline the IMUL stuff, by putting it before the
C    outer loops and before the end of the outer loops.  The last outer
C    loop iteration would then compute an unneeded product, but it is at
C    least not a stray read from up[], since it is at up[n].
C  * Can we combine both the add_n and sub_n into the loops, somehow?
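
C  For reference, the operation implemented below corresponds roughly to
C  the following C-level sketch (a sketch only; it assumes invm is the
C  one-limb modular inverse -1/mp[0] mod 2^64 named in the header, and
C  the generic C code may differ in details):
C
C	for (j = n; j > 0; j--)
C	  {
C	    q = up[0] * invm;			/* quotient limb, mod B */
C	    cy = mpn_addmul_1 (up, mp, n, q);	/* zeros up[0] */
C	    up[0] = cy;				/* store carry limb */
C	    up++;
C	  }
C	cy = mpn_add_n (rp, up, up - n, n);
C	if (cy != 0)
C	  mpn_sub_n (rp, rp, mp, n);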

C INPUT PARAMETERS
define(`rp',	  `%rdi')
define(`up',	  `%rsi')
define(`param_mp',`%rdx')
define(`n',	  `%rcx')
define(`invm',	  `%r8')

define(`mp',	  `%r13')	C copy of mp, moved out of rdx which mul clobbers
define(`i',	  `%r11')	C inner loop index, negative, counting up to zero
define(`nneg',	  `%r12')	C -n; later reused to preserve rp across mpn_add_n

ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_redc_1)
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	n
	sub	$8, %rsp		C maintain ABI required rsp alignment

	lea	(param_mp,n,8), mp	C mp += n
	lea	(up,n,8), up		C up += n

	mov	n, nneg
	neg	nneg

	mov	R32(n), R32(%rax)
	and	$3, R32(%rax)
	jz	L(b0)
	cmp	$2, R32(%rax)
	jz	L(b2)
	jg	L(b3)

L(b1):	C lea	(mp), mp
	lea	-16(up), up
L(o1):	mov	nneg, i
	mov	16(up,nneg,8), %rbp	C up[0]
	imul	invm, %rbp

	mov	(mp,i,8), %rax
	xor	%ebx, %ebx
	mul	%rbp
	add	$1, i
	jnz	1f
	add	%rax, 8(up,i,8)
	adc	$0, %rdx
	mov	%rdx, %r14
	jmp	L(n1)

1:	mov	%rax, %r9
	mov	(mp,i,8), %rax
	mov	%rdx, %r14
	jmp	L(mi1)

	ALIGN(16)
L(lo1):	add	%r10, (up,i,8)
	adc	%rax, %r9
	mov	(mp,i,8), %rax
	adc	%rdx, %r14
L(mi1):	xor	%r10d, %r10d
	mul	%rbp
	add	%r9, 8(up,i,8)
	adc	%rax, %r14
	adc	%rdx, %rbx
	mov	8(mp,i,8), %rax
	mul	%rbp
	add	%r14, 16(up,i,8)
	adc	%rax, %rbx
	adc	%rdx, %r10
	mov	16(mp,i,8), %rax
	mul	%rbp
	xor	%r9d, %r9d
	xor	%r14d, %r14d
	add	%rbx, 24(up,i,8)
	adc	%rax, %r10
	mov	24(mp,i,8), %rax
	adc	%rdx, %r9
	xor	%ebx, %ebx
	mul	%rbp
	add	$4, i
	js	L(lo1)
L(ed1):	add	%r10, (up)
	adc	%rax, %r9
	adc	%rdx, %r14
	xor	%r10d, %r10d
	add	%r9, 8(up)
	adc	$0, %r14
L(n1):	mov	%r14, 16(up,nneg,8)	C up[0]
	add	$8, up
	dec	n
	jnz	L(o1)
C	lea	(mp), mp
	lea	16(up), up
	jmp	L(common)

L(b0):	C lea	(mp), mp
	lea	-16(up), up
L(o0):	mov	nneg, i
	mov	16(up,nneg,8), %rbp	C up[0]
	imul	invm, %rbp

	mov	(mp,i,8), %rax
	xor	%r10d, %r10d
	mul	%rbp
	mov	%rax, %r14
	mov	%rdx, %rbx
	jmp	L(mi0)

	ALIGN(16)
L(lo0):	add	%r10, (up,i,8)
	adc	%rax, %r9
	mov	(mp,i,8), %rax
	adc	%rdx, %r14
	xor	%r10d, %r10d
	mul	%rbp
	add	%r9, 8(up,i,8)
	adc	%rax, %r14
	adc	%rdx, %rbx
L(mi0):	mov	8(mp,i,8), %rax
	mul	%rbp
	add	%r14, 16(up,i,8)
	adc	%rax, %rbx
	adc	%rdx, %r10
	mov	16(mp,i,8), %rax
	mul	%rbp
	xor	%r9d, %r9d
	xor	%r14d, %r14d
	add	%rbx, 24(up,i,8)
	adc	%rax, %r10
	mov	24(mp,i,8), %rax
	adc	%rdx, %r9
	xor	%ebx, %ebx
	mul	%rbp
	add	$4, i
	js	L(lo0)
L(ed0):	add	%r10, (up)
	adc	%rax, %r9
	adc	%rdx, %r14
	xor	%r10d, %r10d
	add	%r9, 8(up)
	adc	$0, %r14
	mov	%r14, 16(up,nneg,8)	C up[0]
	add	$8, up
	dec	n
	jnz	L(o0)
C	lea	(mp), mp
	lea	16(up), up
	jmp	L(common)


L(b3):	lea	-8(mp), mp
	lea	-24(up), up
L(o3):	mov	nneg, i
	mov	24(up,nneg,8), %rbp	C up[0]
	imul	invm, %rbp

	mov	8(mp,i,8), %rax
	mul	%rbp
	mov	%rax, %rbx
	mov	%rdx, %r10
	jmp	L(mi3)

	ALIGN(16)
L(lo3):	add	%r10, (up,i,8)
	adc	%rax, %r9
	mov	(mp,i,8), %rax
	adc	%rdx, %r14
	xor	%r10d, %r10d
	mul	%rbp
	add	%r9, 8(up,i,8)
	adc	%rax, %r14
	adc	%rdx, %rbx
	mov	8(mp,i,8), %rax
	mul	%rbp
	add	%r14, 16(up,i,8)
	adc	%rax, %rbx
	adc	%rdx, %r10
L(mi3):	mov	16(mp,i,8), %rax
	mul	%rbp
	xor	%r9d, %r9d
	xor	%r14d, %r14d
	add	%rbx, 24(up,i,8)
	adc	%rax, %r10
	mov	24(mp,i,8), %rax
	adc	%rdx, %r9
	xor	%ebx, %ebx
	mul	%rbp
	add	$4, i
	js	L(lo3)
L(ed3):	add	%r10, 8(up)
	adc	%rax, %r9
	adc	%rdx, %r14
	xor	%r10d, %r10d
	add	%r9, 16(up)
	adc	$0, %r14
	mov	%r14, 24(up,nneg,8)	C up[0]
	add	$8, up
	dec	n
	jnz	L(o3)
	lea	8(mp), mp
	lea	24(up), up
	jmp	L(common)

L(b2):	lea	-16(mp), mp
	lea	-32(up), up
L(o2):	mov	nneg, i
	mov	32(up,nneg,8), %rbp	C up[0]
	imul	invm, %rbp

	mov	16(mp,i,8), %rax
	mul	%rbp
	xor	%r14d, %r14d
	mov	%rax, %r10
	mov	24(mp,i,8), %rax
	mov	%rdx, %r9
	jmp	L(mi2)

	ALIGN(16)
L(lo2):	add	%r10, (up,i,8)
	adc	%rax, %r9
	mov	(mp,i,8), %rax
	adc	%rdx, %r14
	xor	%r10d, %r10d
	mul	%rbp
	add	%r9, 8(up,i,8)
	adc	%rax, %r14
	adc	%rdx, %rbx
	mov	8(mp,i,8), %rax
	mul	%rbp
	add	%r14, 16(up,i,8)
	adc	%rax, %rbx
	adc	%rdx, %r10
	mov	16(mp,i,8), %rax
	mul	%rbp
	xor	%r9d, %r9d
	xor	%r14d, %r14d
	add	%rbx, 24(up,i,8)
	adc	%rax, %r10
	mov	24(mp,i,8), %rax
	adc	%rdx, %r9
L(mi2):	xor	%ebx, %ebx
	mul	%rbp
	add	$4, i
	js	L(lo2)
L(ed2):	add	%r10, 16(up)
	adc	%rax, %r9
	adc	%rdx, %r14
	xor	%r10d, %r10d
	add	%r9, 24(up)
	adc	$0, %r14
	mov	%r14, 32(up,nneg,8)	C up[0]
	add	$8, up
	dec	n
	jnz	L(o2)
	lea	16(mp), mp
	lea	32(up), up


L(common):
	lea	(mp,nneg,8), mp		C restore entry mp

C   cy = mpn_add_n (rp, up, up - n, n);
C		    rdi rsi  rdx    rcx
	lea	(up,nneg,8), up		C up -= n
	lea	(up,nneg,8), %rdx	C rdx = up - n [up entry value]
	mov	rp, nneg		C preserve rp over first call
	mov	8(%rsp), %rcx		C pass entry n
C	mov	rp, %rdi
	CALL(	mpn_add_n)
	test	R32(%rax), R32(%rax)
	jz	L(ret)

C     mpn_sub_n (rp, rp, mp, n);
C		 rdi rsi rdx rcx
	mov	nneg, %rdi
	mov	nneg, %rsi
	mov	mp, %rdx
	mov	8(%rsp), %rcx		C pass entry n
	CALL(	mpn_sub_n)

L(ret):
	add	$8, %rsp
	pop	n			C just increment rsp
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbx
	pop	%rbp
	ret
EPILOGUE()
