1dnl  AMD64 mpn_add_n, mpn_sub_n optimised for Sandy bridge, Ivy bridge, and
2dnl  Haswell.
3
4dnl  Contributed to the GNU project by Torbjörn Granlund.
5
6dnl  Copyright 2003-2005, 2007, 2008, 2010-2013 Free Software Foundation, Inc.
7
8dnl  This file is part of the GNU MP Library.
9dnl
10dnl  The GNU MP Library is free software; you can redistribute it and/or modify
11dnl  it under the terms of either:
12dnl
13dnl    * the GNU Lesser General Public License as published by the Free
14dnl      Software Foundation; either version 3 of the License, or (at your
15dnl      option) any later version.
16dnl
17dnl  or
18dnl
19dnl    * the GNU General Public License as published by the Free Software
20dnl      Foundation; either version 2 of the License, or (at your option) any
21dnl      later version.
22dnl
23dnl  or both in parallel, as here.
24dnl
25dnl  The GNU MP Library is distributed in the hope that it will be useful, but
26dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
27dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
28dnl  for more details.
29dnl
30dnl  You should have received copies of the GNU General Public License and the
31dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
32dnl  see https://www.gnu.org/licenses/.
33
34include(`../config.m4')
35
36C	    cycles/limb
37C AMD K8,K9	 1.75\2.52
38C AMD K10	 1.5
39C AMD bd1	 1.69\2.25
40C AMD bd2	 1.65
41C AMD bd3	 ?
42C AMD bd4	 ?
43C AMD zen	 1.5
44C AMD bt1	 2.67
45C AMD bt2	 2.16
46C Intel P4	11.54
47C Intel PNR	 5
48C Intel NHM	 5.5
49C Intel SBR	 1.54
50C Intel IBR	 1.5
51C Intel HWL	 1.32
52C Intel BWL	 1.07
53C Intel SKL	 1.21
54C Intel atom	 4.3
55C Intel SLM	 3
56C VIA nano	 ?
57
58C The loop of this code was manually written.  It runs close to optimally on
59C Intel SBR, IBR, and HWL as far as we know, except for the fluctuation problems.
60C It also runs slightly faster on average on AMD bd1 and bd2.
61C
62C No micro-optimisation has been done.
63C
64C N.B.!  The loop alignment padding insns are executed.  If editing the code,
65C make sure the padding does not become excessive.  It is now a 4-byte nop.
66
C Operand registers as used by the code below.  The register or stack slot at
C the end of each note matches the Microsoft x64 argument order, so it is
C presumably where the same argument arrives under DOS64 at function entry.
67define(`rp',	`%rdi')	C result limbs are stored through this pointer; DOS64: rcx
68define(`up',	`%rsi')	C first source operand, loaded with mov; DOS64: rdx
69define(`vp',	`%rdx')	C second source operand, memory operand of ADCSBB; DOS64: r8
70define(`n',	`%rcx')	C limb count, later n/4 as the loop counter; DOS64: r9
71define(`cy',	`%r8')	C carry-in, mpn_add_nc and mpn_sub_nc only; DOS64: rsp+40
72
C Pick the carry-propagating instruction and the two exported names according
C to which OPERATION_* symbol is defined.  The symbol is not defined in the
C visible part of this file, so it is presumably supplied by the build.  If
C neither symbol is defined, ADCSBB, func and func_nc are left undefined.
73ifdef(`OPERATION_add_n', `
74  define(ADCSBB,    adc)
75  define(func,      mpn_add_n)
76  define(func_nc,   mpn_add_nc)')
77ifdef(`OPERATION_sub_n', `
78  define(ADCSBB,    sbb)
79  define(func,      mpn_sub_n)
80  define(func_nc,   mpn_sub_nc)')
81
C The four names listed next are exactly the values func and func_nc can take
C above: one plain and one carry-in entry point per operation.
82MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
83
C Both ABIs are declared as supported.  NOTE(review): the DOS64 differences
C appear to be handled by the entry, exit and IFDOS macros used further down;
C confirm against the shared m4 definitions.
84ABI_SUPPORT(DOS64)
85ABI_SUPPORT(STD64)
86
87ASM_START()
88	TEXT
89	ALIGN(32)
C func(rp, up, vp, n) -- limb-wise ADCSBB of the n limbs at up with the n limbs
C at vp, written to rp.  The final carry or borrow is returned in rax as 0 or 1.
C This entry point clears the carry-in; func_nc jumps to L(ent) with its own
C carry-in already in r8.
C
C n/4 is kept in rcx as the loop count, so jrcxz can test it without touching
C the flags, and n mod 4 picks one of four feed-in paths.  Each path turns the
C carry-in into CF with neg.  From there on only mov, lea, dec, jrcxz and jumps
C sit between the ADCSBB instructions, and none of those write CF.
C
C Register use inside the loop: r10,r11 hold a computed pair that is not yet
C stored, r8,r9 hold the following pair.  rax, r8, r9, r10 and r11 are
C overwritten.
C
C The code relies on n >= 1: with n = 0 the b00 path would still process four
C limbs and then keep looping.
90PROLOGUE(func)
91	FUNC_ENTRY(4)
92	xor	%r8, %r8		C plain entry point: carry-in is 0
93
94L(ent):	mov	R32(n), R32(%rax)	C keep the low bits of n for the mod 4 tests
95	shr	$2, n			C n = number of 4-limb loop passes
96
97	test	$1, R8(%rax)		C bit 0 of the original n
98	jnz	L(bx1)
99
100L(bx0):	test	$2, R8(%rax)		C n even: mod 4 is 0 or 2
101	jnz	L(b10)
102
103L(b00):	neg	%r8			C n mod 4 = 0.  CF = 1 iff carry-in is nonzero
104	mov	(up), %r8		C r8 is free again once CF is set
105	mov	8(up), %r9
106	ADCSBB	(vp), %r8
107	ADCSBB	8(vp), %r9
108	mov	16(up), %r10
109	mov	24(up), %r11
110	lea	32(up), up
111	ADCSBB	16(vp), %r10
112	ADCSBB	24(vp), %r11
113	lea	32(vp), vp
114	lea	-16(rp), rp		C bias rp so the 16(rp) store at lo0 hits limb 0
115	jmp	L(lo0)			C r8,r9 = limbs 0-1, r10,r11 = limbs 2-3 pending
116
117L(b10):	neg	%r8			C n mod 4 = 2.  CF = 1 iff carry-in is nonzero
118	mov	(up), %r10
119	mov	8(up), %r11
120	ADCSBB	0(vp), %r10
121	ADCSBB	8(vp), %r11
122	jrcxz	L(e2)			C no loop passes, so n = 2 and we are done
123	mov	16(up), %r8
124	mov	24(up), %r9
125	lea	16(up), up		C up and vp advance by 2 limbs; rp stays put
126	ADCSBB	16(vp), %r8
127	ADCSBB	24(vp), %r9
128	lea	16(vp), vp
129C	lea	(rp), rp
130	jmp	L(lo2)			C r10,r11 = limbs 0-1 pending, r8,r9 = limbs 2-3
131
132L(e2):	mov	%r10, (rp)
133	mov	%r11, 8(rp)
134	setc	R8(%rax)		C rax still holds n = 2, so writing al gives 0 or 1
135	FUNC_EXIT()
136	ret
137
138L(bx1):	test	$2, R8(%rax)		C n odd: mod 4 is 1 or 3
139	jnz	L(b11)
140
141L(b01):	neg	%r8			C n mod 4 = 1.  CF = 1 iff carry-in is nonzero
142	mov	(up), %r11
143	ADCSBB	(vp), %r11
144	jrcxz	L(e1)			C no loop passes, so n = 1 and we are done
145	mov	8(up), %r8
146	mov	16(up), %r9
147	lea	8(up), up		C up and vp advance by 1 limb
148	lea	-8(rp), rp		C bias rp so the 8(rp) store at lo1 hits limb 0
149	ADCSBB	8(vp), %r8
150	ADCSBB	16(vp), %r9
151	lea	8(vp), vp
152	jmp	L(lo1)			C r11 = limb 0 pending, r8,r9 = limbs 1-2
153
154L(e1):	mov	%r11, (rp)
155	setc	R8(%rax)		C rax still holds n = 1, so writing al gives 0 or 1
156	FUNC_EXIT()
157	ret
158
159L(b11):	neg	%r8			C n mod 4 = 3.  CF = 1 iff carry-in is nonzero
160	mov	(up), %r9
161	ADCSBB	(vp), %r9
162	mov	8(up), %r10
163	mov	16(up), %r11
164	lea	24(up), up
165	ADCSBB	8(vp), %r10
166	ADCSBB	16(vp), %r11
167	lea	24(vp), vp
168	mov	%r9, (rp)		C limb 0 is stored here; r10,r11 = limbs 1-2 pending
169	lea	8(rp), rp
170	jrcxz	L(end)			C n = 3: only the pending pair is left to store
171
172	ALIGN(32)			C reached by fall-through from b11, so padding runs
173L(top):	mov	(up), %r8		C one pass: store pending pair, compute four limbs
174	mov	8(up), %r9
175	ADCSBB	(vp), %r8
176	ADCSBB	8(vp), %r9
177L(lo2):	mov	%r10, (rp)		C store the pending pair from the previous step
178L(lo1):	mov	%r11, 8(rp)
179	mov	16(up), %r10
180	mov	24(up), %r11
181	lea	32(up), up
182	ADCSBB	16(vp), %r10		C this pair becomes the new pending pair
183	ADCSBB	24(vp), %r11
184	lea	32(vp), vp
185L(lo0):	mov	%r8, 16(rp)
186L(lo3):	mov	%r9, 24(rp)
187	lea	32(rp), rp
188	dec	n			C sets ZF for jnz; dec does not write CF
189	jnz	L(top)
190
191L(end):	mov	R32(n), R32(%rax)	C n is 0 here, so this zeroes rax; CF is untouched
192	mov	%r10, (rp)		C flush the last pending pair
193	mov	%r11, 8(rp)
194	setc	R8(%rax)		C return value = final carry or borrow
195	FUNC_EXIT()
196	ret
197EPILOGUE()
198	ALIGN(16)
C func_nc(rp, up, vp, n, cy) -- same operation as func, but starting from the
C carry-in cy.  It shares the body of func by jumping to L(ent), which expects
C the carry-in in r8.  Under STD64 cy is already in r8 per the cy define.
199PROLOGUE(func_nc)
200	FUNC_ENTRY(4)
201IFDOS(`	mov	56(%rsp), %r8	')	C DOS64: cy is on the stack.  56 = entry-time rsp+40 plus 16, presumably two pushes in FUNC_ENTRY -- confirm
202	jmp	L(ent)
203EPILOGUE()
204