1dnl  AMD64 mpn_redc_1 optimised for Intel Haswell.
2
3dnl  Contributed to the GNU project by Torbjörn Granlund.
4
5dnl  Copyright 2013 Free Software Foundation, Inc.
6
7dnl  This file is part of the GNU MP Library.
8dnl
9dnl  The GNU MP Library is free software; you can redistribute it and/or modify
10dnl  it under the terms of either:
11dnl
12dnl    * the GNU Lesser General Public License as published by the Free
13dnl      Software Foundation; either version 3 of the License, or (at your
14dnl      option) any later version.
15dnl
16dnl  or
17dnl
18dnl    * the GNU General Public License as published by the Free Software
19dnl      Foundation; either version 2 of the License, or (at your option) any
20dnl      later version.
21dnl
22dnl  or both in parallel, as here.
23dnl
24dnl  The GNU MP Library is distributed in the hope that it will be useful, but
25dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
27dnl  for more details.
28dnl
29dnl  You should have received copies of the GNU General Public License and the
30dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
31dnl  see https://www.gnu.org/licenses/.
32
33include(`../config.m4')
34
35C	     cycles/limb
36C AMD K8,K9	n/a
37C AMD K10	n/a
38C AMD bull	n/a
39C AMD pile	n/a
40C AMD steam	 ?
41C AMD bobcat	n/a
42C AMD jaguar	 ?
43C Intel P4	n/a
44C Intel core	n/a
45C Intel NHM	n/a
46C Intel SBR	n/a
47C Intel IBR	n/a
48C Intel HWL	 2.32
49C Intel BWL	 ?
50C Intel atom	n/a
51C VIA nano	n/a
52
53C The inner loops of this code are the result of running a code generation and
54C optimisation tool suite written by David Harvey and Torbjörn Granlund.
55
56C TODO
57C  * Micro-optimise.
58C  * Consider inlining mpn_add_n.  Tests indicate that this saves just 1-2
59C    cycles, though.
60
C Incoming arguments.  Each define names the register the function body uses
C for that argument.  The trailing comment on each line names another
C location for the same argument; these match the Windows x64 argument order
C (rcx, rdx, r8, r9, then stack), so they are presumably the DOS64 locations
C that FUNC_ENTRY remaps -- confirm against the x86_64 defs file.
61define(`rp',          `%rdi')   C rcx
62define(`up',          `%rsi')   C rdx
63define(`mp_param',    `%rdx')   C r8
64define(`n',           `%rcx')   C r9
65define(`u0inv_param', `%r8')    C stack
66
C Names used inside the function body.
67define(`i',           `%r14')   C inner loop induction variable
68define(`j',           `%r15')   C outer loop induction variable
69define(`mp',          `%rdi')   C same register as rp; rp is pushed before mp is loaded
70define(`u0inv',       `(%rsp)')  C stack; the function pushes u0inv_param last, so it is the top slot
71
72ABI_SUPPORT(DOS64)    C FIXME: needs verification
73ABI_SUPPORT(STD64)
74
75ASM_START()
76	TEXT
77	ALIGN(16)
C mpn_redc_1
C
C Arguments, by the names defined at the top of this file: rp, up, mp_param,
C n, u0inv_param.
C
C Outline of what the code below does:
C  * Saves rbx, rbp and r12-r15, then pushes rp and u0inv_param.  From then
C    on u0inv is at 0(rsp) and the saved rp at 8(rsp), and the register that
C    held rp is reused as mp.
C  * Negates n and dispatches on the two low bits of the negated value to one
C    of four loop nests, labelled o0, o1, o2, o3 for n mod 4.  n = 1, 2 and 3
C    have dedicated code.
C  * Every outer pass keeps q0 = up[0] * u0inv (low limb) in rdx, forms the
C    products q0 * mp[k] with mulx, adds them into up[], writes the final
C    carry limb into the slot that held up[0], and advances up by one limb.
C  * After the last pass, at label cj, it calls mpn_add_n with the arguments
C    rp, up, up - n, n.  rax is not written between that call and the ret
C    (barring anything FUNC_EXIT does), so the value mpn_add_n leaves in rax
C    is the return value.
C  * The n = 1 and n = 2 paths write rp directly, skip the call and return
C    the carry of their last addition in rax.
C
C The carry chains below run across mulx, lea, mov and inc, none of which
C modify CF, and across the loop branches.  Statement order matters.
78PROLOGUE(mpn_redc_1)
79	FUNC_ENTRY(4)
80IFDOS(`	mov	56(%rsp), %r8	')	C DOS64 only: load the fifth argument from the stack
C Save the registers used below that the ABI treats as callee-saved.
81	push	%rbx
82	push	%rbp
83	push	%r12
84	push	%r13
85	push	%r14
86	push	%r15
87	push	rp			C keep rp in memory; it is reloaded from 8(rsp) later
88	mov	mp_param, mp		C note that rp and mp shares register
89	mov	(up), %rdx		C rdx = up[0]
90
91	neg	n			C n is negative from here on; counters count up to zero
92	push	%r8			C put u0inv on stack
93	imul	u0inv_param, %rdx	C first iteration q0
94	mov	n, j			C outer loop induction var
95
C Dispatch on the two low bits of the negated n.
96	test	$1, R8(n)
97	jnz	L(bx1)			C n is odd
98
99L(bx0):	test	$2, R8(n)
100	jz	L(o0b)			C n = 0 mod 4
101
102	cmp	$-2, R32(n)
103	jnz	L(o2)			C n = 2 mod 4 and n is not 2
104
105C Special code for n = 2 since general code cannot handle it
106	mov	8(%rsp), %rbx		C rp
107	lea	16(%rsp), %rsp		C deallocate two slots
108	mulx(	(mp), %r9, %r12)	C r12:r9 = q0 * mp[0]
109	mulx(	8,(mp), %r11, %r10)	C r10:r11 = q0 * mp[1]
110	add	%r12, %r11
111	adc	$0, %r10
112	add	(up), %r9		C = 0
113	adc	8(up), %r11		C r11 = up[1]
114	adc	$0, %r10		C -> up[0]
115	mov	%r11, %rdx
116	imul	u0inv_param, %rdx	C second q; r8 has not been written on this path
117	mulx(	(mp), %r13, %r12)	C r12:r13 = q * mp[0]
118	mulx(	8,(mp), %r14, %r15)	C r15:r14 = q * mp[1]
119	xor	R32(%rax), R32(%rax)	C clear rax now; setc below fills in the low byte
120	add	%r12, %r14
121	adc	$0, %r15
122	add	%r11, %r13		C = 0
123	adc	16(up), %r14		C + up[2]
124	adc	$0, %r15		C -> up[1]
125	add	%r14, %r10
126	adc	24(up), %r15		C + up[3]; CF from here is the return value
127	mov	%r10, (%rbx)		C rp[0]
128	mov	%r15, 8(%rbx)		C rp[1]
129	setc	R8(%rax)		C mov does not alter CF, so this is the carry of the adc above
130	jmp	L(ret)
131
C Loop nest for n = 2 mod 4, n at least 6.
132L(o2):	lea	2(n), i			C inner loop induction var
133	mulx(	(mp), %r9, %r8)		C r8:r9 = q0 * mp[0]
134	mulx(	8,(mp), %r11, %r10)	C r10:r11 = q0 * mp[1]
135	sar	$2, i			C i = (n + 2) >> 2, arithmetic; negative, stepped up to zero by inc
136	add	%r8, %r11		C starts the carry chain continued at lo2
137	jmp	L(lo2)
138
139	ALIGN(16)
140L(tp2):	adc	%rax, %r9
141	lea	32(up), up		C advance four limbs; lea keeps the flags
142	adc	%r8, %r11
143L(lo2):	mulx(	16,(mp), %r13, %r12)
144	mov	(up), %r8
145	mulx(	24,(mp), %rbx, %rax)
146	lea	32(mp), mp
147	adc	%r10, %r13
148	adc	%r12, %rbx
149	adc	$0, %rax
150	mov	8(up), %r10
151	mov	16(up), %r12
152	add	%r9, %r8
153	mov	24(up), %rbp
154	mov	%r8, (up)
155	adc	%r11, %r10
156	mulx(	(mp), %r9, %r8)
157	mov	%r10, 8(up)
158	adc	%r13, %r12
159	mov	%r12, 16(up)
160	adc	%rbx, %rbp
161	mulx(	8,(mp), %r11, %r10)
162	mov	%rbp, 24(up)
163	inc	i			C inc leaves CF alone, so the carry reaches tp2 or ed2
164	jnz	L(tp2)
165
166L(ed2):	mov	56(up,n,8), %rdx	C next iteration up[0]
167	lea	16(mp,n,8), mp		C mp = (last starting mp)
168	adc	%rax, %r9
169	adc	%r8, %r11
170	mov	32(up), %r8
171	adc	$0, %r10
172	imul	u0inv, %rdx		C next iteration q0
173	mov	40(up), %rax
174	add	%r9, %r8
175	mov	%r8, 32(up)
176	adc	%r11, %rax
177	mov	%rax, 40(up)
178	lea	56(up,n,8), up		C up = (last starting up) + 1
179	adc	$0, %r10
180	mov	%r10, -8(up)		C carry limb goes into the slot that held the old up[0]
181	inc	j
182	jnz	L(o2)
183
184	jmp	L(cj)			C outer loop done; go to the final mpn_add_n call
185
186
187L(bx1):	test	$2, R8(n)
188	jz	L(o3a)			C n = 3 mod 4
189
190L(o1a):	cmp	$-1, R32(n)
191	jnz	L(o1b)			C n = 1 mod 4 and n is not 1
192
193C Special code for n = 1 since general code cannot handle it
194	mov	8(%rsp), %rbx		C rp
195	lea	16(%rsp), %rsp		C deallocate two slots
196	mulx(	(mp), %r11, %r10)	C r10:r11 = q0 * mp[0]
197	add	(up), %r11		C only the carry out of this add is used afterwards
198	adc	8(up), %r10
199	mov	%r10, (%rbx)		C rp[0]
200	mov	$0, R32(%rax)		C mov keeps CF, which setc reads next
201	setc	R8(%rax)		C return value = carry of the adc above
202	jmp	L(ret)
203
C Loop nest for n = 1 mod 4, n at least 5.
204L(o1b):	lea	24(mp), mp		C bias mp; the feed-in mulx below use offsets -24, -16, -8
205L(o1):	lea	1(n), i			C inner loop induction var
206	mulx(	-24,(mp), %r11, %r10)
207	mulx(	-16,(mp), %r13, %r12)
208	mulx(	-8,(mp), %rbx, %rax)
209	sar	$2, i			C i = (n + 1) >> 2, arithmetic; negative, stepped up to zero by inc
210	add	%r10, %r13
211	adc	%r12, %rbx
212	adc	$0, %rax
213	mov	(up), %r10
214	mov	8(up), %r12
215	mov	16(up), %rbp
216	add	%r11, %r10		C starts the carry chain continued at lo1
217	jmp	L(lo1)
218
219	ALIGN(16)
220L(tp1):	adc	%rax, %r9
221	lea	32(up), up		C advance four limbs; lea keeps the flags
222	adc	%r8, %r11
223	mulx(	16,(mp), %r13, %r12)
224	mov	-8(up), %r8
225	mulx(	24,(mp), %rbx, %rax)
226	lea	32(mp), mp
227	adc	%r10, %r13
228	adc	%r12, %rbx
229	adc	$0, %rax
230	mov	(up), %r10
231	mov	8(up), %r12
232	add	%r9, %r8
233	mov	16(up), %rbp
234	mov	%r8, -8(up)
235	adc	%r11, %r10
236L(lo1):	mulx(	(mp), %r9, %r8)
237	mov	%r10, (up)
238	adc	%r13, %r12
239	mov	%r12, 8(up)
240	adc	%rbx, %rbp
241	mulx(	8,(mp), %r11, %r10)
242	mov	%rbp, 16(up)
243	inc	i			C inc leaves CF alone, so the carry reaches tp1 or ed1
244	jnz	L(tp1)
245
246L(ed1):	mov	48(up,n,8), %rdx	C next iteration up[0]
247	lea	40(mp,n,8), mp		C mp = (last starting mp)
248	adc	%rax, %r9
249	adc	%r8, %r11
250	mov	24(up), %r8
251	adc	$0, %r10
252	imul	u0inv, %rdx		C next iteration q0
253	mov	32(up), %rax
254	add	%r9, %r8
255	mov	%r8, 24(up)
256	adc	%r11, %rax
257	mov	%rax, 32(up)
258	lea	48(up,n,8), up		C up = (last starting up) + 1
259	adc	$0, %r10
260	mov	%r10, -8(up)		C carry limb goes into the slot that held the old up[0]
261	inc	j
262	jnz	L(o1)
263
264	jmp	L(cj)			C outer loop done; go to the final mpn_add_n call
265
266L(o3a):	cmp	$-3, R32(n)
267	jnz	L(o3b)			C n = 3 mod 4 and n is not 3
268
269C Special code for n = 3 since general code cannot handle it
C One pass per limb, j counting up to zero.  r8 still holds u0inv here since
C nothing on this path writes it.
270L(n3):	mulx(	(mp), %rbx, %rax)	C rax:rbx = q0 * mp[0]
271	mulx(	8,(mp), %r9, %r14)	C r14:r9 = q0 * mp[1]
272	add	(up), %rbx		C only the carry out of this add is used afterwards
273	mulx(	16,(mp), %r11, %r10)	C r10:r11 = q0 * mp[2]
274	adc	%rax, %r9		C W 1
275	adc	%r14, %r11		C W 2
276	mov	8(up), %r14
277	mov	u0inv_param, %rdx
278	adc	$0, %r10		C W 3
279	mov	16(up), %rax
280	add	%r9, %r14		C W 1
281	mov	%r14, 8(up)
282	mulx(	%r14, %rdx, %r13)	C next iteration q0
283	adc	%r11, %rax		C W 2
284	mov	%rax, 16(up)
285	adc	$0, %r10		C W 3
286	mov	%r10, (up)		C carry limb goes into the slot that held up[0]
287	lea	8(up), up		C up = (last starting up) + 1
288	inc	j
289	jnz	L(n3)
290
291	jmp	L(cj)			C outer loop done; go to the final mpn_add_n call
292
C Loop nest for n = 3 mod 4, n at least 7.
293L(o3b):	lea	8(mp), mp		C bias mp; the first feed-in mulx below uses offset -8
294L(o3):	lea	4(n), i			C inner loop induction var
295	mulx(	-8,(mp), %rbx, %rax)
296	mulx(	(mp), %r9, %r8)
297	mov	(up), %rbp
298	mulx(	8,(mp), %r11, %r10)
299	sar	$2, i			C i = (n + 4) >> 2, arithmetic; negative, stepped up to zero by inc
300	add	%rbx, %rbp		C only the carry out is used; rbp is reloaded before its next use
301	nop				C no architectural effect; presumably scheduling or alignment padding
302	adc	%rax, %r9
303	jmp	L(lo3)
304
305	ALIGN(16)
306L(tp3):	adc	%rax, %r9
307	lea	32(up), up		C advance four limbs; lea keeps the flags
308L(lo3):	adc	%r8, %r11
309	mulx(	16,(mp), %r13, %r12)
310	mov	8(up), %r8
311	mulx(	24,(mp), %rbx, %rax)
312	lea	32(mp), mp
313	adc	%r10, %r13
314	adc	%r12, %rbx
315	adc	$0, %rax
316	mov	16(up), %r10
317	mov	24(up), %r12
318	add	%r9, %r8
319	mov	32(up), %rbp
320	mov	%r8, 8(up)
321	adc	%r11, %r10
322	mulx(	(mp), %r9, %r8)
323	mov	%r10, 16(up)
324	adc	%r13, %r12
325	mov	%r12, 24(up)
326	adc	%rbx, %rbp
327	mulx(	8,(mp), %r11, %r10)
328	mov	%rbp, 32(up)
329	inc	i			C inc leaves CF alone, so the carry reaches tp3 or ed3
330	jnz	L(tp3)
331
332L(ed3):	mov	64(up,n,8), %rdx	C next iteration up[0]
333	lea	24(mp,n,8), mp		C mp = (last starting mp)
334	adc	%rax, %r9
335	adc	%r8, %r11
336	mov	40(up), %r8
337	adc	$0, %r10
338	imul	u0inv, %rdx		C next iteration q0
339	mov	48(up), %rax
340	add	%r9, %r8
341	mov	%r8, 40(up)
342	adc	%r11, %rax
343	mov	%rax, 48(up)
344	lea	64(up,n,8), up		C up = (last starting up) + 1
345	adc	$0, %r10
346	mov	%r10, -8(up)		C carry limb goes into the slot that held the old up[0]
347	inc	j
348	jnz	L(o3)
349
350	jmp	L(cj)			C outer loop done; go to the final mpn_add_n call
351
C Loop nest for n = 0 mod 4.
352L(o0b):	lea	16(mp), mp		C bias mp; the feed-in mulx below use offsets -16 and -8
353L(o0):	mov	n, i			C inner loop induction var
354	mulx(	-16,(mp), %r13, %r12)
355	mulx(	-8,(mp), %rbx, %rax)
356	sar	$2, i			C i = n >> 2, arithmetic; negative, stepped up to zero by inc
357	add	%r12, %rbx
358	adc	$0, %rax
359	mov	(up), %r12
360	mov	8(up), %rbp
361	mulx(	(mp), %r9, %r8)
362	add	%r13, %r12		C only the carry out is used; r12 is overwritten before its next use
363	jmp	L(lo0)
364
365	ALIGN(16)
366L(tp0):	adc	%rax, %r9
367	lea	32(up), up		C advance four limbs; lea keeps the flags
368	adc	%r8, %r11
369	mulx(	16,(mp), %r13, %r12)
370	mov	-16(up), %r8
371	mulx(	24,(mp), %rbx, %rax)
372	lea	32(mp), mp
373	adc	%r10, %r13
374	adc	%r12, %rbx
375	adc	$0, %rax
376	mov	-8(up), %r10
377	mov	(up), %r12
378	add	%r9, %r8
379	mov	8(up), %rbp
380	mov	%r8, -16(up)
381	adc	%r11, %r10
382	mulx(	(mp), %r9, %r8)
383	mov	%r10, -8(up)
384	adc	%r13, %r12
385	mov	%r12, (up)
386L(lo0):	adc	%rbx, %rbp
387	mulx(	8,(mp), %r11, %r10)
388	mov	%rbp, 8(up)
389	inc	i			C inc leaves CF alone, so the carry reaches tp0 or ed0
390	jnz	L(tp0)
391
392L(ed0):	mov	40(up,n,8), %rdx	C next iteration up[0]
393	lea	32(mp,n,8), mp		C mp = (last starting mp)
394	adc	%rax, %r9
395	adc	%r8, %r11
396	mov	16(up), %r8
397	adc	$0, %r10
398	imul	u0inv, %rdx		C next iteration q0
399	mov	24(up), %rax
400	add	%r9, %r8
401	mov	%r8, 16(up)
402	adc	%r11, %rax
403	mov	%rax, 24(up)
404	lea	40(up,n,8), up		C up = (last starting up) + 1
405	adc	$0, %r10
406	mov	%r10, -8(up)		C carry limb goes into the slot that held the old up[0]
407	inc	j
408	jnz	L(o0)			C falls through to cj when the outer loop is done
409
C Common tail for all general paths.  Sets up the four mpn_add_n arguments
C for the active ABI, drops the two pushed slots (rp and u0inv), and adjusts
C rsp as noted in the comments on the lea lines.  In the DOS64 sequence n is
C copied to r9 before rcx, which is the register n lives in, is overwritten
C with rp.
410L(cj):
411IFSTD(`	mov	8(%rsp), %rdi		C param 1: rp
412	lea	16-8(%rsp), %rsp	C deallocate 2, add back for alignment
413	lea	(up,n,8), %rdx		C param 3: up - n
414	neg	R32(n)		')	C param 4: n
415
416IFDOS(`	mov	up, %rdx		C param 2: up
417	lea	(up,n,8), %r8		C param 3: up - n
418	neg	R32(n)
419	mov	n, %r9			C param 4: n
420	mov	8(%rsp), %rcx		C param 1: rp
421	lea	16-32-8(%rsp), %rsp')	C deallocate 2, allocate shadow, align
422
423	ASSERT(nz, `test $15, %rsp')
424	CALL(	mpn_add_n)
425
426IFSTD(`	lea	8(%rsp), %rsp	')	C release the alignment slot
427IFDOS(`	lea	32+8(%rsp), %rsp')	C release the shadow space and the alignment slot
428
C Restore the saved registers in reverse push order.  rax is left as is.
429L(ret):	pop	%r15
430	pop	%r14
431	pop	%r13
432	pop	%r12
433	pop	%rbp
434	pop	%rbx
435	FUNC_EXIT()
436	ret
437EPILOGUE()
438