dnl  X86-64 mpn_redc_1 optimised for Intel Conroe and Wolfdale.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C AMD K8,K9	 ?
C AMD K10	 ?
C AMD bull	 ?
C AMD pile	 ?
C AMD steam	 ?
C AMD bobcat	 ?
C AMD jaguar	 ?
C Intel P4	 ?
C Intel core	 4.5  (fluctuating)
C Intel NHM	 ?
C Intel SBR	 ?
C Intel IBR	 ?
C Intel HWL	 ?
C Intel BWL	 ?
C Intel atom	 ?
C VIA nano	 ?

C The inner loops of this code are the result of running a code generation and
C optimisation tool suite written by David Harvey and Torbjörn Granlund.

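C This implements one-limb Montgomery reduction (REDC).  u0inv must satisfy
C u0inv * mp[0] == -1 (mod B), B = 2^64, so that each pass zeroes the low
C limb of up[].  A rough C sketch of the operation (cf. the generic
C mpn/generic/redc_1.c; asserts and nails handling omitted):
C
C	mp_limb_t
C	mpn_redc_1 (mp_ptr rp, mp_ptr up, mp_srcptr mp, mp_size_t n,
C		    mp_limb_t u0inv)
C	{
C	  for (mp_size_t j = n; j > 0; j--)
C	    {
C	      mp_limb_t q0 = up[0] * u0inv;	C quotient limb, mod B
C	      mp_limb_t cy = mpn_addmul_1 (up, mp, n, q0);  C up[0] -> 0
C	      up[0] = cy;			C save carry limb
C	      up++;
C	    }
C	  return mpn_add_n (rp, up, up - n, n);	C add saved carry limbs
C	}
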
C TODO
C  * Micro-optimise, none performed thus far.
C  * Consider inlining mpn_add_n.
C  * Single basecases out before the pushes.
C  * Keep up[i] in registers for basecases (might require pushes).

C When playing with pointers, set this to $2 to fall back to conservative
C indexing in wind-down code.
define(`I',`$1')

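C I(fast,safe) expands to its first argument by default (`$1' above), i.e.
C the operand form that assumes i == 0 on loop exit.  Redefining I as `$2'
C selects the fully indexed second form in the wind-down code at
C L(ed1)/L(ed0), which stays correct while experimenting with the up
C pointer updates.
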
define(`rp',          `%rdi')   C rcx
define(`up',          `%rsi')   C rdx
define(`mp_param',    `%rdx')   C r8
define(`n',           `%rcx')   C r9
define(`u0inv',       `%r8')    C stack

define(`i',           `%r14')
define(`j',           `%r15')
define(`mp',          `%r12')
define(`q0',          `%r13')

C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15
C  X  q0'  n   X  rp  up      u0i           mp   q0 i   j

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

define(`ALIGNx', `ALIGN(16)')

ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_redc_1)
	FUNC_ENTRY(4)
IFDOS(`	mov	56(%rsp), %r8	')
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	mov	(up), q0
	mov	n, j			C outer loop induction var
	lea	(mp_param,n,8), mp
	lea	-16(up,n,8), up
	neg	n
	imul	u0inv, q0		C first iteration q0

	test	$1, R8(n)
	jz	L(b0)

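C Dispatch on the parity of n: odd sizes fall through to L(b1), with n = 1
C and n = 3 handled by dedicated basecases; even sizes go to L(b0), with
C n = 2 and n = 4 special-cased.  All larger sizes use the unrolled outer
C loops L(otp1) and L(otp0).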
L(b1):	cmp	$-1, R32(n)
	jz	L(n1)
	cmp	$-3, R32(n)
	jz	L(n3)

	push	rp

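C Outer loop for odd n >= 5.  Each iteration adds q0 * mp[] into up[] (an
C addmul_1 pass, two limbs per inner-loop iteration), stores the pass's
C carry at up[0], and computes the next quotient limb early (imul below)
C so its latency overlaps the remainder of the pass.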
L(otp1):lea	3(n), i
	mov	(mp,n,8), %rax
	mul	q0
	lea	(%rax), %rbp
	mov	8(mp,n,8), %rax
	lea	(%rdx), %r9
	mul	q0
	lea	(%rax), %r11
	mov	16(mp,n,8), %rax
	mov	16(up,n,8), %r10
	lea	(%rdx), %rdi
	mul	q0
	add	%rbp, %r10
	lea	(%rax), %rbp
	mov	24(mp,n,8), %rax
	adc	%r9, %r11
	mov	24(up,n,8), %rbx
	lea	(%rdx), %r9
	adc	$0, %rdi
	mul	q0
	add	%r11, %rbx
	lea	(%rax), %r11
	mov	32(mp,n,8), %rax
	adc	%rdi, %rbp
	mov	%rbx, 24(up,n,8)
	mov	32(up,n,8), %r10
	lea	(%rdx), %rdi
	adc	$0, %r9
	imul	u0inv, %rbx		C next q limb
	add	$2, i
	jns	L(ed1)

	ALIGNx
L(tp1):	mul	q0
	add	%rbp, %r10
	lea	(%rax), %rbp
	mov	(mp,i,8), %rax
	adc	%r9, %r11
	mov	%r10, -8(up,i,8)
	mov	(up,i,8), %r10
	lea	(%rdx), %r9
	adc	$0, %rdi
	mul	q0
	add	%r11, %r10
	lea	(%rax), %r11
	mov	8(mp,i,8), %rax
	adc	%rdi, %rbp
	mov	%r10, (up,i,8)
	mov	8(up,i,8), %r10
	lea	(%rdx), %rdi
	adc	$0, %r9
	add	$2, i
	js	L(tp1)

L(ed1):	mul	q0
	add	%rbp, %r10
	adc	%r9, %r11
	mov	%r10, I(-8(up),-8(up,i,8))
	mov	I((up),(up,i,8)), %r10
	adc	$0, %rdi
	add	%r11, %r10
	adc	%rdi, %rax
	mov	%r10, I((up),(up,i,8))
	mov	I(8(up),8(up,i,8)), %r10
	adc	$0, %rdx
	add	%rax, %r10
	mov	%r10, I(8(up),8(up,i,8))
	adc	$0, %rdx
	mov	%rdx, 16(up,n,8)	C up[0]
	mov	%rbx, q0		C previously computed q limb -> q0
	lea	8(up), up		C up++
	dec	j
	jnz	L(otp1)
	jmp	L(cj)

L(b0):	cmp	$-2, R32(n)
	jz	L(n2)
	cmp	$-4, R32(n)
	jz	L(n4)

	push	rp

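C Outer loop for even n >= 6; same structure as L(otp1), but entered in the
C middle of the software pipeline via the jmp to L(e0).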
L(otp0):lea	4(n), i
	mov	(mp,n,8), %rax
	mul	q0
	lea	(%rax), %r11
	mov	8(mp,n,8), %rax
	lea	(%rdx), %rdi
	mul	q0
	lea	(%rax), %rbp
	mov	16(mp,n,8), %rax
	mov	16(up,n,8), %r10
	lea	(%rdx), %r9
	mul	q0
	add	%r11, %r10
	lea	(%rax), %r11
	mov	24(mp,n,8), %rax
	adc	%rdi, %rbp
	mov	24(up,n,8), %rbx
	lea	(%rdx), %rdi
	adc	$0, %r9
	mul	q0
	add	%rbp, %rbx
	lea	(%rax), %rbp
	mov	32(mp,n,8), %rax
	adc	%r9, %r11
	mov	%rbx, 24(up,n,8)
	mov	32(up,n,8), %r10
	lea	(%rdx), %r9
	adc	$0, %rdi
	imul	u0inv, %rbx		C next q limb
	jmp	L(e0)

	ALIGNx
L(tp0):	mul	q0
	add	%rbp, %r10
	lea	(%rax), %rbp
	mov	(mp,i,8), %rax
	adc	%r9, %r11
	mov	%r10, -8(up,i,8)
	mov	(up,i,8), %r10
	lea	(%rdx), %r9
	adc	$0, %rdi
L(e0):	mul	q0
	add	%r11, %r10
	lea	(%rax), %r11
	mov	8(mp,i,8), %rax
	adc	%rdi, %rbp
	mov	%r10, (up,i,8)
	mov	8(up,i,8), %r10
	lea	(%rdx), %rdi
	adc	$0, %r9
	add	$2, i
	js	L(tp0)

L(ed0):	mul	q0
	add	%rbp, %r10
	adc	%r9, %r11
	mov	%r10, I(-8(up),-8(up,i,8))
	mov	I((up),(up,i,8)), %r10
	adc	$0, %rdi
	add	%r11, %r10
	adc	%rdi, %rax
	mov	%r10, I((up),(up,i,8))
	mov	I(8(up),8(up,i,8)), %r10
	adc	$0, %rdx
	add	%rax, %r10
	mov	%r10, I(8(up),8(up,i,8))
	adc	$0, %rdx
	mov	%rdx, 16(up,n,8)	C up[0]
	mov	%rbx, q0		C previously computed q limb -> q0
	lea	8(up), up		C up++
	dec	j
	jnz	L(otp0)

L(cj):	lea	16(up), up		C FIXME
	pop	rp
L(add_n):
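C Final addition: rp[] = up[] + (up - n)[], where up now points at the high
C half of the product area and up - n at the n saved carry limbs.  The
C carry-out from mpn_add_n, in %rax, becomes mpn_redc_1's return value.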
IFSTD(`	lea	(up,n,8), up		C param 2: up
	lea	(up,n,8), %rdx		C param 3: up - n
	neg	R32(n)		')	C param 4: n

IFDOS(`	lea	(up,n,8), %rdx		C param 2: up
	lea	(%rdx,n,8), %r8		C param 3: up - n
	neg	R32(n)
	mov	n, %r9			C param 4: n
	mov	rp, %rcx	')	C param 1: rp

IFSTD(`	sub	$8, %rsp	')
IFDOS(`	sub	$40, %rsp	')
	ASSERT(nz, `test $15, %rsp')
	CALL(	mpn_add_n)
IFSTD(`	add	$8, %rsp	')
IFDOS(`	add	$40, %rsp	')

L(ret):	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	FUNC_EXIT()
	ret

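C Basecases for n = 1, 2, 3, 4.  n = 1 and n = 2 are fully unrolled; n = 3
C and n = 4 loop per quotient limb but skip the generic outer-loop setup.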
L(n1):	mov	(mp_param), %rax
	mul	q0
	add	8(up), %rax
	adc	16(up), %rdx
	mov	%rdx, (rp)
	mov	$0, R32(%rax)
	adc	R32(%rax), R32(%rax)
	jmp	L(ret)

L(n2):	mov	(mp_param), %rax
	mov	(up), %rbp
	mul	q0
	add	%rax, %rbp
	mov	%rdx, %r9
	adc	$0, %r9
	mov	-8(mp), %rax
	mov	8(up), %r10
	mul	q0
	add	%rax, %r10
	mov	%rdx, %r11
	adc	$0, %r11
	add	%r9, %r10
	adc	$0, %r11
	mov	%r10, q0
	imul	u0inv, q0		C next q0
	mov	-16(mp), %rax
	mul	q0
	add	%rax, %r10
	mov	%rdx, %r9
	adc	$0, %r9
	mov	-8(mp), %rax
	mov	16(up), %r14
	mul	q0
	add	%rax, %r14
	adc	$0, %rdx
	add	%r9, %r14
	adc	$0, %rdx
	xor	R32(%rax), R32(%rax)
	add	%r11, %r14
	adc	24(up), %rdx
	mov	%r14, (rp)
	mov	%rdx, 8(rp)
	adc	R32(%rax), R32(%rax)
	jmp	L(ret)

	ALIGNx
L(n3):	mov	-24(mp), %rax
	mov	-8(up), %r10
	mul	q0
	add	%rax, %r10
	mov	-16(mp), %rax
	mov	%rdx, %r11
	adc	$0, %r11
	mov	(up), %rbp
	mul	q0
	add	%rax, %rbp
	mov	%rdx, %r9
	adc	$0, %r9
	mov	-8(mp), %rax
	add	%r11, %rbp
	mov	8(up), %r10
	adc	$0, %r9
	mul	q0
	mov	%rbp, q0
	imul	u0inv, q0		C next q0
	add	%rax, %r10
	mov	%rdx, %r11
	adc	$0, %r11
	mov	%rbp, (up)
	add	%r9, %r10
	adc	$0, %r11
	mov	%r10, 8(up)
	mov	%r11, -8(up)		C up[0]
	lea	8(up), up		C up++
	dec	j
	jnz	L(n3)

	mov	-32(up), %rdx
	mov	-24(up), %rbx
	xor	R32(%rax), R32(%rax)
	add	%rbp, %rdx
	adc	%r10, %rbx
	adc	8(up), %r11
	mov	%rdx, (rp)
	mov	%rbx, 8(rp)
	mov	%r11, 16(rp)
	adc	R32(%rax), R32(%rax)
	jmp	L(ret)

	ALIGNx
L(n4):	mov	-32(mp), %rax
	mul	q0
	lea	(%rax), %r11
	mov	-24(mp), %rax
	lea	(%rdx), %r14
	mul	q0
	lea	(%rax), %rbp
	mov	-16(mp), %rax
	mov	-16(up), %r10
	lea	(%rdx), %r9
	mul	q0
	add	%r11, %r10
	lea	(%rax), %r11
	mov	-8(mp), %rax
	adc	%r14, %rbp
	mov	-8(up), %rbx
	lea	(%rdx), %r14
	adc	$0, %r9
	mul	q0
	add	%rbp, %rbx
	adc	%r9, %r11
	mov	%rbx, -8(up)
	mov	(up), %r10
	adc	$0, %r14
	imul	u0inv, %rbx		C next q limb
	add	%r11, %r10
	adc	%r14, %rax
	mov	%r10, (up)
	mov	8(up), %r10
	adc	$0, %rdx
	add	%rax, %r10
	mov	%r10, 8(up)
	adc	$0, %rdx
	mov	%rdx, -16(up)		C up[0]
	mov	%rbx, q0		C previously computed q limb -> q0
	lea	8(up), up		C up++
	dec	j
	jnz	L(n4)
	lea	16(up), up
	jmp	L(add_n)
EPILOGUE()
ASM_END()