1dnl  AMD64 mpn_addmul_2 optimised for AMD Bulldozer.
2
3dnl  Copyright 2017 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33C	     cycles/limb
34C AMD K8,K9
35C AMD K10
36C AMD bd1	 4.2
37C AMD bd2	 4.4
38C AMD bd3
39C AMD bd4
40C AMD zen
41C AMD bt1
42C AMD bt2
43C Intel P4
44C Intel PNR
45C Intel NHM
46C Intel SBR
47C Intel IBR
48C Intel HWL
49C Intel BWL
50C Intel SKL
51C Intel atom
52C Intel SLM
53C VIA nano
54
55C The loop of this code is the result of running a code generation and
56C optimisation tool suite written by David Harvey and Torbjorn Granlund.
57
C Register aliases used by the code below.
C
C rp, up, n_param and vp are the four incoming arguments in the STD64
C argument registers.  The register named in the trailing comment on each of
C those lines is presumably the matching DOS64 argument register -- confirm
C against the FUNC_ENTRY macro, which is defined outside this file.
58define(`rp',      `%rdi')   C rcx
59define(`up',      `%rsi')   C rdx
60define(`n_param', `%rdx')   C r8
61define(`vp',      `%rcx')   C r9
62
C n is the loop index.  It shares %rcx with vp, which is safe because both
C limbs at vp are loaded into v0 and v1 before n is first written.
C v0, v1, X0 and X1 live in callee-saved registers; the function pushes them
C on entry and pops them before returning.
63define(`n',       `%rcx')
64define(`v0',      `%rbx')
65define(`v1',      `%rbp')
66define(`X0',      `%r12')
67define(`X1',      `%r13')
68
C w0-w3 are four rotating temporaries in r8-r11; the function does not save
C or restore them.
69define(`w0',    `%r8')
70define(`w1',    `%r9')
71define(`w2',    `%r10')
72define(`w3',    `%r11')
73
C ABI variants this file declares support for.
74ABI_SUPPORT(DOS64)
75ABI_SUPPORT(STD64)
76
C mpn_addmul_2 -- multiply the n_param limbs at up by the two limbs at vp and
C add the product to the limbs at rp.
C
C Notation: u[i] and r[i] are the limbs at the up and rp pointers as passed
C in, before the code advances them.
C
C In:
C   rp       pointer to the addend and result; r[0..n_param-1] are read and
C            r[0..n_param] are written
C   up       pointer to the multiplicand; u[0..n_param-1] are read
C   n_param  limb count
C   vp       pointer to the two multiplier limbs, loaded into v0 and v1
C Out:
C   %rax     the most significant limb of the sum, i.e. the limb that would
C            follow r[n_param]
C
C Saved and restored here: %rbx, %rbp, %r12, %r13.  %r8-%r11, %rdx and the
C flags are overwritten.
C
C Structure: up and rp are advanced to just past their n_param-th limb and
C every access then uses a negative index that counts up towards zero.  The
C main loop is unrolled four times with entry points lo2 (= top), lo1, lo0
C and lo3; the feed-in code picks the entry point from n_param mod 4.
C
C NOTE(review): with n_param = 1 the b01 path would enter lo1 with n = 0 and
C read u[1] and beyond; presumably callers guarantee n_param >= 2 -- confirm.
77ASM_START()
78	TEXT
79	ALIGN(32)
80PROLOGUE(mpn_addmul_2)
C FUNC_ENTRY/FUNC_EXIT are defined outside this file; presumably they adapt
C the DOS64 argument registers to the STD64 layout -- confirm.
81	FUNC_ENTRY(4)
82	push	%rbx
83	push	%rbp
84	push	%r12
85	push	%r13
86
87	mov	(vp), v0		C v0 = first multiplier limb
88	mov	8(vp), v1		C v1 = second multiplier limb; vp is dead after this
89
90	mov	(up), %rax		C rax = u[0]
91	mov	$0, R32(w2)		C w2 = 0; abuse w2 as scratch for -n_param
92
93	lea	(up,n_param,8), up	C up now points just past u[n_param-1]
94	lea	(rp,n_param,8), rp	C rp now points at r[n_param]
95	sub	n_param, w2		C w2 = -n_param
96	mul	v0			C rdx:rax = u[0] * v0
97
C Dispatch on the two low bits of -n_param.  Bit 0 of -n_param equals bit 0
C of n_param, so bx1 handles the odd limb counts.
98	test	$1, R8(w2)
99	jnz	L(bx1)
100
101L(bx0):	mov	%rdx, X0		C X0 = high half of u[0]*v0
102	mov	%rax, X1		C X1 = low half of u[0]*v0
103	test	$2, R8(w2)
104	jnz	L(b10)
105
106L(b00):	lea	(w2), n			C un = 4, 8, 12, ...
107	mov	(up,w2,8), %rax		C rax = u[0]
108	mov	(rp,w2,8), w3		C w3 = r[0]
109	mul	v1			C rdx:rax = u[0] * v1
110	mov	%rax, w0
111	mov	8(up,w2,8), %rax	C rax = u[1], consumed by the mul at lo0
112	mov	%rdx, w1
113	jmp	L(lo0)
114
115L(b10):	lea	2(w2), n		C un = 2, 6, 10, ...
116	mov	(up,w2,8), %rax		C rax = u[0]
117	mov	(rp,w2,8), w1		C w1 = r[0]
118	mul	v1			C rdx:rax = u[0] * v1
119	mov	%rdx, w3
120	mov	%rax, w2		C w2 is overwritten; -n_param is no longer needed
121	mov	-8(up,n,8), %rax	C rax = u[1]
122	test	n, n
123	jz	L(end)			C n_param = 2: no loop iterations, go to wind-down
124	jmp	L(top)
125
126L(bx1):	mov	%rax, X0		C X0 = low half of u[0]*v0
127	mov	%rdx, X1		C X1 = high half of u[0]*v0
128	test	$2, R8(w2)
129	jz	L(b11)
130
C NOTE(review): n_param = 1 gives n = 0 here, so lo1 would read beyond the
C source and the loop would not terminate at the expected point; presumably
C that size is never passed -- confirm.
131L(b01):	lea	1(w2), n		C un = 1, 5, 9, ...
132	mov	(up,w2,8), %rax		C rax = u[0]
133	mul	v1			C rdx:rax = u[0] * v1
134	mov	(rp,w2,8), w2		C w2 = r[0]; overwrites -n_param
135	mov	%rdx, w0
136	mov	%rax, w3
137	jmp	L(lo1)
138
139L(b11):	lea	-1(w2), n		C un = 3, 7, 11, ...
140	mov	(up,w2,8), %rax		C rax = u[0]
141	mul	v1			C rdx:rax = u[0] * v1
142	mov	(rp,w2,8), w0		C w0 = r[0]
143	mov	%rax, w1
144	mov	8(up,w2,8), %rax	C rax = u[1], consumed by the mul at lo3
145	mov	%rdx, w2		C overwrites -n_param
146	jmp	L(lo3)
147
C Main loop, unrolled four times.  Each quarter multiplies one source limb by
C v0 and then by v1, stores one finished result limb and loads the next limb
C from rp.  X0/X1 carry the v0 products and w0-w3 carry the rp limbs plus the
C v1 products; the register roles rotate from one quarter to the next.  Only
C the first quarter is annotated in detail.
148	ALIGN(32)
149L(top):
150L(lo2):	mul	v0			C rdx:rax = source limb already in rax, times v0
151	add	w1, X1			C fold the v1-side partial sum into X1
152	mov	X1, -16(rp,n,8)		C store a finished result limb
153	mov	%rdx, X1		C X1 = high half of the v0 product
154	adc	%rax, X0		C X0 += low half of the v0 product + carry
155	adc	$0, X1
156	mov	-8(up,n,8), %rax	C reload the same source limb for the v1 multiply
157	mul	v1
158	mov	-8(rp,n,8), w1		C next limb from rp
159	mov	%rdx, w0		C w0 = high half of the v1 product
160	add	w1, w2			C w2 += rp limb
161	adc	%rax, w3		C w3 += low half of the v1 product + carry
162	adc	$0, w0
163L(lo1):	mov	(up,n,8), %rax
164	mul	v0
165	add	w2, X0
166	mov	X0, -8(rp,n,8)
167	mov	%rdx, X0
168	adc	%rax, X1
169	mov	(up,n,8), %rax		C mov leaves the carry intact for the adc below
170	adc	$0, X0
171	mov	(rp,n,8), w2
172	mul	v1
173	add	w2, w3
174	adc	%rax, w0
175	mov	8(up,n,8), %rax
176	mov	%rdx, w1
177	adc	$0, w1
178L(lo0):	mul	v0
179	add	w3, X1
180	mov	X1, (rp,n,8)
181	adc	%rax, X0
182	mov	8(up,n,8), %rax
183	mov	%rdx, X1
184	adc	$0, X1
185	mov	8(rp,n,8), w3
186	mul	v1
187	add	w3, w0
188	adc	%rax, w1
189	mov	16(up,n,8), %rax
190	mov	%rdx, w2
191	adc	$0, w2
192L(lo3):	mul	v0
193	add	w0, X0
194	mov	X0, 8(rp,n,8)
195	mov	%rdx, X0
196	adc	%rax, X1
197	adc	$0, X0
198	mov	16(up,n,8), %rax
199	mov	16(rp,n,8), w0
200	mul	v1
201	mov	%rdx, w3
202	add	w0, w1
203	adc	%rax, w2
204	adc	$0, w3
205	mov	24(up,n,8), %rax	C preload the source limb for the next quarter or for the wind-down
206	add	$4, n			C n is a negative multiple of 4; the add carries out only when it reaches 0
207	jnc	L(top)
208
C Wind-down.  Here n = 0 and rax holds the last source limb u[n_param-1].
C This handles the final limb and stores the top three result limbs.
209L(end):	mul	v0
210	add	w1, X1
211	mov	X1, -16(rp)		C store r[n_param-2]
212	mov	%rdx, X1
213	adc	%rax, X0
214	adc	$0, X1
215	mov	-8(up), %rax		C last source limb again, for the v1 multiply
216	mul	v1
217	mov	-8(rp), w1		C w1 = r[n_param-1]
218	add	w1, w2
219	adc	%rax, w3
220	adc	$0, %rdx		C rdx accumulates the return limb
221	add	w2, X0
222	adc	$0, X1
223	mov	X0, -8(rp)		C store r[n_param-1]
224	add	w3, X1
225	mov	X1, (rp)		C store r[n_param]
226	adc	$0, %rdx
227	mov	%rdx, %rax		C return the most significant limb
228
229	pop	%r13
230	pop	%r12
231	pop	%rbp
232	pop	%rbx
233	FUNC_EXIT()
234	ret
235EPILOGUE()
236