1dnl  AMD64 mpn_addmul_2 optimised for Intel Haswell.
2
3dnl  Contributed to the GNU project by Torbjörn Granlund.
4
5dnl  Copyright 2013 Free Software Foundation, Inc.
6
7dnl  This file is part of the GNU MP Library.
8dnl
9dnl  The GNU MP Library is free software; you can redistribute it and/or modify
10dnl  it under the terms of either:
11dnl
12dnl    * the GNU Lesser General Public License as published by the Free
13dnl      Software Foundation; either version 3 of the License, or (at your
14dnl      option) any later version.
15dnl
16dnl  or
17dnl
18dnl    * the GNU General Public License as published by the Free Software
19dnl      Foundation; either version 2 of the License, or (at your option) any
20dnl      later version.
21dnl
22dnl  or both in parallel, as here.
23dnl
24dnl  The GNU MP Library is distributed in the hope that it will be useful, but
25dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
27dnl  for more details.
28dnl
29dnl  You should have received copies of the GNU General Public License and the
30dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
31dnl  see https://www.gnu.org/licenses/.
32
33include(`../config.m4')
34
35C	     cycles/limb
36C AMD K8,K9	n/a
37C AMD K10	n/a
38C AMD bull	n/a
39C AMD pile	n/a
40C AMD steam	n/a
41C AMD excavator	 ?
42C AMD bobcat	n/a
43C AMD jaguar	n/a
44C Intel P4	n/a
45C Intel core	n/a
46C Intel NHM	n/a
47C Intel SBR	n/a
48C Intel IBR	n/a
49C Intel HWL	 2.15
50C Intel BWL	 2.33
51C Intel SKL	 2.22
52C Intel atom	n/a
53C Intel SLM	n/a
54C VIA nano	n/a
55
56C The loop of this code is the result of running a code generation and
57C optimisation tool suite written by David Harvey and Torbjörn Granlund.
58
C Register aliases used by mpn_addmul_2 below.
C
C Incoming arguments, in argument order.  The limbs at rp are read, added to
C and written back; the limbs at up are only read; n_param is the limb count;
C vp addresses the two multiplier limbs.
59define(`rp',     `%rdi')
60define(`up',     `%rsi')
61define(`n_param',`%rdx')
62define(`vp',     `%rcx')
63
C Working registers.
C   v0, v1   the two multiplier limbs loaded from 0(vp) and 8(vp)
C   w0..w3   high halves produced by mulx, carried into later limbs
C   n        loop counter, derived from n_param
C   X0, X1   limbs loaded from rp that are being accumulated
C w1 shares %rcx with vp; this is safe because v0 and v1 are loaded from vp
C before the first write to w1.  n_param is copied out of %rdx early because
C %rdx is reloaded with each limb of up before every mulx.
64define(`v0', `%r8')
65define(`v1', `%r9')
66define(`w0', `%rbx')
67define(`w1', `%rcx')
68define(`w2', `%rbp')
69define(`w3', `%r10')
70define(`n',  `%r11')
71define(`X0', `%r12')
72define(`X1', `%r13')
73
C ABI_SUPPORT, ASM_START, TEXT and ALIGN are macros defined outside this file
C (reached through the config.m4 include above).
C NOTE(review): presumably the two ABI_SUPPORT lines declare that the routine
C can be built for both the DOS64 and STD64 calling conventions, and the
C remaining lines select the code section and align the entry point to 32
C bytes -- confirm against the macro definitions.
74ABI_SUPPORT(DOS64)
75ABI_SUPPORT(STD64)
76
77ASM_START()
78	TEXT
79	ALIGN(32)
C mpn_addmul_2 -- multiply the limbs at up by the two-limb multiplier at vp
C and add the product into the limbs at rp, in place.
C
C In:    rp       %rdi   limbs that are read, accumulated into and stored back
C        up       %rsi   multiplicand limbs, only read
C        n_param  %rdx   number of limbs at up
C        vp       %rcx   multiplier; the limbs at 0(vp) and 8(vp) are used
C Out:   %rax holds the most significant limb of the sum; it is not stored.
C        Tracing n_param = 2, 3, 4 and 5 shows rp[0] .. rp[n_param] being
C        read and rewritten, i.e. n_param+1 limbs.
C Clobb: %rax, %rcx, %rdx, %rsi, %rdi, %r8-%r11, flags.  %rbx, %rbp, %r12 and
C        %r13 are used as well but are pushed on entry and popped on exit.
C
C NOTE(review): with n_param = 0 or 1 the counter n is already 0 when the
C loop-closing dec is reached, so it wraps instead of terminating.  The code
C therefore appears to assume n_param >= 2 -- confirm with the callers.
C
C Structure: the main loop handles four limbs of up per pass.  The low two
C bits of n_param select one of four feed-in sequences (b00, b01, b10, b11),
C each of which processes the first limb and then jumps to the matching entry
C point inside the loop (lo0, lo1, lo2, lo3).  The code after the loop
C finishes the last limb.
C
C The add/adc pairs are deliberately interleaved with mulx, mov and lea:
C none of those three modify the flags, so the carry survives across them.
C The instruction order is the output of an optimisation tool (see the note
C near the top of the file) and should not be rearranged casually.
80PROLOGUE(mpn_addmul_2)
C FUNC_ENTRY and FUNC_EXIT are defined outside this file.
C NOTE(review): presumably they remap the four arguments when building for
C DOS64 and expand to nothing for STD64 -- confirm.
81	FUNC_ENTRY(4)
C Save the callee-saved registers that serve as w0, w2, X0 and X1.
82	push	%rbx
83	push	%rbp
84	push	%r12
85	push	%r13
86
C Fetch both multiplier limbs now; %rcx (vp) is reused as w1 afterwards.
87	mov	(vp), v0
88	mov	8(vp), v1
89
C n = n_param / 4.  n_param itself stays in %rdx only until the first limb
C of up is loaded there, so both bit tests below happen before that load.
90	mov	n_param, n
91	shr	$2, n
92
93	test	$1, R8(n_param)	C odd limb count?
94	jnz	L(bx1)
95
C Even limb count: X0 = rp[0], X1 = rp[1].
96L(bx0):	mov	(rp), X0
97	mov	8(rp), X1
98	test	$2, R8(n_param)
99	jnz	L(b10)
100
C n_param = 0 mod 4: form up[0] * (v0, v1), store rp[0], preload up[1] into
C %rdx and enter the loop at lo0.
101L(b00):	mov	(up), %rdx
102	lea	16(up), up
103	mulx(	v0, %rax, w1)	C low half to %rax, high half to w1
104	add	%rax, X0
105	mulx(	v1, %rax, w2)
106	adc	$0, w1
107	mov	X0, (rp)
108	add	%rax, X1
109	adc	$0, w2
110	mov	-8(up), %rdx
111	lea	16(rp), rp
112	jmp	L(lo0)
113
C n_param = 2 mod 4: form up[0] * (v0, v1), store rp[0], load rp[2] into X0
C and enter at lo2.  The counter is bumped by one because lo2 sits directly
C in front of the loop-closing dec.
114L(b10):	mov	(up), %rdx
115	inc	n
116	mulx(	v0, %rax, w1)
117	add	%rax, X0
118	adc	$0, w1
119	mulx(	v1, %rax, w2)
120	mov	X0, (rp)
121	mov	16(rp), X0
122	add	%rax, X1
123	adc	$0, w2
124	xor	w0, w0	C no w0 carry exists yet; the code at top/end adds w0 into X1
125	jmp	L(lo2)
126
C Odd limb count: the roles are swapped, X1 = rp[0] and X0 = rp[1].
127L(bx1):	mov	(rp), X1
128	mov	8(rp), X0
129	test	$2, R8(n_param)
130	jnz	L(b11)
131
C n_param = 1 mod 4: form up[0] * (v0, v1), store rp[0], start up[1] * v0,
C load rp[2] into X1 and enter at lo1 with both pointers advanced 3 limbs.
132L(b01):	mov	(up), %rdx
133	mulx(	v0, %rax, w3)
134	add	%rax, X1
135	adc	$0, w3
136	mulx(	v1, %rax, w0)
137	add	%rax, X0
138	adc	$0, w0
139	mov	8(up), %rdx
140	mov	X1, (rp)
141	mov	16(rp), X1
142	mulx(	v0, %rax, w1)
143	lea	24(rp), rp
144	lea	24(up), up
145	jmp	L(lo1)
146
C n_param = 3 mod 4: same start as b01, but enter at lo3 with both pointers
C advanced one limb and the counter bumped by one (lo3 is late in the loop
C body, shortly before the loop-closing dec).
147L(b11):	mov	(up), %rdx
148	inc	n
149	mulx(	v0, %rax, w3)
150	add	%rax, X1
151	adc	$0, w3
152	mulx(	v1, %rax, w0)
153	add	%rax, X0
154	adc	$0, w0
155	mov	X1, (rp)
156	mov	8(up), %rdx
157	mulx(	v0, %rax, w1)
158	lea	8(rp), rp
159	lea	8(up), up
160	jmp	L(lo3)
161
C Main loop, four limbs of up per pass (rp and up each advance 32 bytes).
C On arrival at top, %rdx already holds the next limb of up, loaded at lo2.
C Each limb is multiplied by v0 and by v1; the low halves are added into X0
C and X1 together with the high halves (w0..w3) left by earlier products.
162	ALIGN(16)
163L(top):	mulx(	v0, %rax, w3)
164	add	w0, X1
165	adc	$0, w2
166	add	%rax, X1
167	adc	$0, w3
168	mulx(	v1, %rax, w0)
169	add	%rax, X0
170	adc	$0, w0
171	lea	32(rp), rp
172	add	w1, X1
173	mov	-16(up), %rdx
174	mov	X1, -24(rp)
175	adc	$0, w3
176	add	w2, X0
177	mov	-8(rp), X1
178	mulx(	v0, %rax, w1)
179	adc	$0, w0
C Entry point for n_param = 1 mod 4.
180L(lo1):	add	%rax, X0
181	mulx(	v1, %rax, w2)
182	adc	$0, w1
183	add	w3, X0
184	mov	X0, -16(rp)
185	adc	$0, w1
186	add	%rax, X1
187	adc	$0, w2
188	add	w0, X1
189	mov	-8(up), %rdx
190	adc	$0, w2
C Entry point for n_param = 0 mod 4.
191L(lo0):	mulx(	v0, %rax, w3)
192	add	%rax, X1
193	adc	$0, w3
194	mov	(rp), X0
195	mulx(	v1, %rax, w0)
196	add	%rax, X0
197	adc	$0, w0
198	add	w1, X1
199	mov	X1, -8(rp)
200	adc	$0, w3
201	mov	(up), %rdx
202	add	w2, X0
203	mulx(	v0, %rax, w1)
204	adc	$0, w0
C Entry point for n_param = 3 mod 4.
205L(lo3):	add	%rax, X0
206	adc	$0, w1
207	mulx(	v1, %rax, w2)
208	add	w3, X0
209	mov	8(rp), X1
210	mov	X0, (rp)
211	mov	16(rp), X0
212	adc	$0, w1
213	add	%rax, X1
214	adc	$0, w2
C Entry point for n_param = 2 mod 4.  Loads the limb of up that is consumed
C either at top or, on the final pass, at end.
215L(lo2):	mov	8(up), %rdx
216	lea	32(up), up
217	dec	n
218	jnz	L(top)
219
C Wind-down: multiply the last limb of up (already in %rdx), fold in the
C outstanding carries, store the top two limbs and leave the final high
C limb in %rax.
220L(end):	mulx(	v0, %rax, w3)
221	add	w0, X1
222	adc	$0, w2
223	add	%rax, X1
224	adc	$0, w3
225	mulx(	v1, %rdx, %rax)	C low half to %rdx, high half to %rax
226	add	w1, X1
227	mov	X1, 8(rp)
228	adc	$0, w3
229	add	w2, %rdx
230	adc	$0, %rax
231	add	w3, %rdx
232	mov	%rdx, 16(rp)
233	adc	$0, %rax
234
C Restore the callee-saved registers in reverse push order; %rax is not
C touched again, so it still holds the high limb at the ret.
235	pop	%r13
236	pop	%r12
237	pop	%rbp
238	pop	%rbx
239	FUNC_EXIT()
240	ret
241EPILOGUE()
242