1dnl  AMD64 mpn_mul_1 optimised for AMD bt1/bt2.
2
3dnl  Copyright 2003-2005, 2007, 2008, 2011, 2012, 2019 Free Software
4dnl  Foundation, Inc.
5
6dnl  This file is part of the GNU MP Library.
7dnl
8dnl  The GNU MP Library is free software; you can redistribute it and/or modify
9dnl  it under the terms of either:
10dnl
11dnl    * the GNU Lesser General Public License as published by the Free
12dnl      Software Foundation; either version 3 of the License, or (at your
13dnl      option) any later version.
14dnl
15dnl  or
16dnl
17dnl    * the GNU General Public License as published by the Free Software
18dnl      Foundation; either version 2 of the License, or (at your option) any
19dnl      later version.
20dnl
21dnl  or both in parallel, as here.
22dnl
23dnl  The GNU MP Library is distributed in the hope that it will be useful, but
24dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
25dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
26dnl  for more details.
27dnl
28dnl  You should have received copies of the GNU General Public License and the
29dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
30dnl  see https://www.gnu.org/licenses/.
31
32include(`../config.m4')
33
34C	     cycles/limb
35C AMD K8,K9	 4.53		old measurement
36C AMD K10	 4.53		old measurement
37C AMD bd1	 4.56		old measurement
38C AMD bd2	 4.47		old measurement
39C AMD bd3	 ?
40C AMD bd4	 ?
41C AMD zen	 ?
42C AMD bt1	 5.12
43C AMD bt2	 5.17
44C Intel P4	12.6		old measurement
45C Intel PNR	 4.53		old measurement
46C Intel NHM	 4.36		old measurement
47C Intel SBR	 3.0		old measurement
48C Intel IBR	 2.55		old measurement
49C Intel HWL	 2.28		old measurement
50C Intel BWL	 2.36		old measurement
51C Intel SKL	 2.39		old measurement
52C Intel atom	21.0		old measurement
53C Intel SLM	 9		old measurement
54C Intel GLM	 ?
55C VIA nano	 ?
56
57C The loop of this code is the result of running a code generation and
58C optimisation tool suite written by David Harvey and Torbjorn Granlund.
59
C Register assignments for both ABIs named below.  The standard definitions
C come first; the IFDOS lines further down redefine the same names for DOS64.
60ABI_SUPPORT(DOS64)
61ABI_SUPPORT(STD64)
62
63C Standard parameters
C cy is read only by mpn_mul_1c.  It shares %r8 with w0, so mpn_mul_1c copies
C it into w2 before anything writes w0.
64define(`rp',              `%rdi')
65define(`up',              `%rsi')
66define(`n_param',         `%rdx')
67define(`v0',              `%rcx')
68define(`cy',              `%r8')
69C Standard allocations
C n is the working copy of n_param.  It is needed because mul overwrites %rdx.
C w0..w3 hold the low and high product halves, used as the pairs w1:w0 and
C w3:w2.
70define(`n',               `%rbx')
71define(`w0',              `%r8')
72define(`w1',              `%r9')
73define(`w2',              `%r10')
74define(`w3',              `%r11')
75
76C DOS64 parameters
C up stays in %rsi: both function prologues copy it there from %rdx under
C IFDOS.  cy is a stack argument; the 56(%rsp) offset is valid at the point
C where mpn_mul_1c reads it, that is after the two IFDOS pushes and before
C %rbx is pushed (16 bytes of pushes + 8 return address + 32 shadow space).
77IFDOS(` define(`rp',      `%rcx')    ') dnl
78IFDOS(` define(`up',      `%rsi')    ') dnl
79IFDOS(` define(`n_param', `%r8')     ') dnl
80IFDOS(` define(`v0',      `%r9')     ') dnl
81IFDOS(` define(`cy',      `56(%rsp)')') dnl
82C DOS64 allocations
C Same as the standard allocations except w1: %r9 holds v0 here, so w1 uses
C %rdi, which the prologues save and restore under IFDOS.
83IFDOS(` define(`n',       `%rbx')    ') dnl
84IFDOS(` define(`w0',      `%r8')     ') dnl
85IFDOS(` define(`w1',      `%rdi')    ') dnl
86IFDOS(` define(`w2',      `%r10')    ') dnl
87IFDOS(` define(`w3',      `%r11')    ') dnl
88
C mpn_mul_1 -- multiply the n_param limbs at up by the single limb v0, store
C the n_param low limbs of the product at rp, and return the most significant
C limb in %rax.
C
C n_param must be at least 1: the first limb is loaded from (up) before the
C count is examined, and no path handles a zero count.
C
C rp and up are advanced to one past their last limb and n becomes a negative
C limb index that counts up towards zero.  The loop is unrolled four ways and
C is entered at top, L0, L3 or L2 depending on the count modulo 4.
C
C mpn_mul_1c, later in this file, branches to the labels top, L0, L3 and L2
C defined here, so the register state expected at those labels is shared by
C both functions.
89	ALIGN(64)
90PROLOGUE(mpn_mul_1)
C DOS64 only: %rsi and %rdi are callee-saved under the Microsoft x64
C convention and are written here (as up and w1), so save them, then move up
C from %rdx, where it arrives, into %rsi.
91IFDOS(`	push	%rsi		')
92IFDOS(`	push	%rdi		')
93IFDOS(`	mov	%rdx, %rsi	')
94
C n lives in %rbx, which is callee-saved.
95	push	%rbx
C %rax = up[0], fetched before up is advanced.
96	mov	(up), %rax
97
C Point rp and up just past their last limb; n takes over the count.
98	lea	(rp,n_param,8), rp
99	lea	(up,n_param,8), up
100	mov	n_param, n
101
C Dispatch on the low two bits of the count.
102	test	$1, R8(n_param)
103	jne	L(bx1)
104
C Even count: w1:w0 = up[0] * v0 and n = -count.  For an even count, bit 1 of
C -count equals bit 1 of count, so testing the negated value is equivalent.
105L(bx0):	mul	v0
106	neg	n
107	mov	%rax, w0
108	mov	%rdx, w1
109	test	$2, R8(n)
C count mod 4 = 2: enter at L2 with n = -count.
110	jne	L(L2)
111
C count mod 4 = 0: enter at L0 with n = 2 - count.
112L(b00):	add	$2, n
113	jmp	L(L0)
114
115	ALIGN(16)
C count mod 4 = 3: w3:w2 = up[0] * v0, enter at L3 with n = 1 - count.
116L(b11):	mov	%rax, w2
117	mov	%rdx, w3
118	neg	n
119	inc	n
120	jmp	L(L3)
121
122	ALIGN(16)
C Odd count: %rdx:%rax = up[0] * v0.  n still holds the positive count here.
123L(bx1):	mul	v0
124	test	$2, R8(n)
125	jne	L(b11)
126
C count mod 4 = 1.  The subtraction borrows only when count = 1 (the other
C values reaching this point are 5, 9, ...); that case finishes at n1.
C Otherwise w3:w2 = up[0] * v0, n = 3 - count, and execution falls into top.
127L(b01):	sub	$3, n
128	jc	L(n1)
129	mov	%rax, w2
130	mov	%rdx, w3
131	neg	n
132
133	ALIGN(16)
C Main loop, four limbs per iteration.  Each stage loads the next up limb,
C multiplies it by v0, stores the limb completed by the previous stage, then
C adds the previous high half into the new low half and propagates the carry
C into the new high half.  Stages alternate between the pairs w1:w0 and w3:w2.
C Entry state: top and L3 expect the pending limb in w2 with carry limb w3;
C L0 and L2 expect the pending limb in w0 with carry limb w1.
134L(top):	mov	-16(up,n,8), %rax
135	mul	v0
136	mov	%rax, w0
137	mov	%rdx, w1
138	mov	w2, -24(rp,n,8)
139	add	w3, w0
140	adc	$0, w1
141L(L0):	mov	-8(up,n,8), %rax
142	mul	v0
143	mov	%rax, w2
144	mov	%rdx, w3
145	mov	w0, -16(rp,n,8)
146	add	w1, w2
147	adc	$0, w3
148L(L3):	mov	(up,n,8), %rax
149	mul	v0
150	mov	%rax, w0
151	mov	%rdx, w1
152	mov	w2, -8(rp,n,8)
153	add	w3, w0
154	adc	$0, w1
155L(L2):	mov	8(up,n,8), %rax
156	mul	v0
157	mov	%rax, w2
158	mov	%rdx, w3
159	mov	w0, (rp,n,8)
160	add	w1, w2
161	adc	$0, w3
162	add	$4, n
C Keep looping while n is still negative.
163	js	L(top)
164
C The last low limb goes into the final rp slot (rp points one past the end);
C the last high limb is the return value.
165L(end):	mov	w2, -8(rp)
166	mov	w3, %rax
167	pop	%rbx
168IFDOS(`	pop	%rdi		')
169IFDOS(`	pop	%rsi		')
170	ret
171
172	ALIGN(32)
C count = 1: the only product is still in %rdx:%rax.  Store the low half and
C return the high half.
173L(n1):	mov	%rax, -8(rp)
174	mov	%rdx, %rax
175	pop	%rbx
176IFDOS(`	pop	%rdi		')
177IFDOS(`	pop	%rsi		')
178	ret
179EPILOGUE()
180
C mpn_mul_1c -- as mpn_mul_1, but with an initial carry limb cy: computes the
C n_param limbs at up times v0, plus cy, stores the n_param low limbs at rp
C and returns the most significant limb in %rax.
C
C Only the setup is done here.  Except when the count is 1, control transfers
C to the labels top, L0, L3 and L2 inside mpn_mul_1 earlier in this file,
C which also perform the final store and return.  As in mpn_mul_1, (up) is
C read before the count is examined, so n_param must be at least 1.
C
C NOTE(review): ASM_START() and TEXT appear here, after mpn_mul_1 has already
C been emitted earlier in this file.  Presumably they were meant to precede
C the first PROLOGUE; confirm against the build before moving them.
181ASM_START()
182	TEXT
183	ALIGN(64)
184PROLOGUE(mpn_mul_1c)
C DOS64 only: save %rsi and %rdi, then move up from %rdx into %rsi.
185IFDOS(`	push	%rsi		')
186IFDOS(`	push	%rdi		')
187IFDOS(`	mov	%rdx, %rsi	')
C Fetch cy into w2 first.  In the standard ABI cy shares %r8 with w0, and for
C DOS64 cy is addressed relative to %rsp as it stands before %rbx is pushed.
188	mov	cy, w2
189	push	%rbx
C %rax = up[0], fetched before up is advanced.
190	mov	(up), %rax
191
C Point rp and up just past their last limb; n takes over the count.
192	lea	(rp,n_param,8), rp
193	lea	(up,n_param,8), up
194	mov	n_param, n
195
C Dispatch on the low two bits of the count.
196	test	$1, R8(n_param)
197	jne	L(cx1)
198
C Even count: w1:w0 = up[0] * v0 + cy and n = -count.
199L(cx0):	mul	v0
200	neg	n
201	mov	%rax, w0
202	mov	%rdx, w1
203	add	w2, w0
204	adc	$0, w1
205	test	$2, R8(n)
C count mod 4 = 2: join the mpn_mul_1 loop at L2 with n = -count.
206	jne	L(L2)
207
C count mod 4 = 0: join at L0 with n = 2 - count.
208L(c00):	add	$2, n
209	jmp	L(L0)
210
211	ALIGN(16)
C Odd count: %rdx:%rax = up[0] * v0.  n still holds the positive count here.
212L(cx1):	mul	v0
213	test	$2, R8(n)
214	je	L(c01)
215
C count mod 4 = 3: n = 1 - count and w3:w2 = up[0] * v0 + cy, then join at L3.
C The mov between add and adc leaves the carry flag untouched.
216L(c11):	neg	n
217	inc	n
218	add	%rax, w2
219	mov	%rdx, w3
220	adc	$0, w3
221	jmp	L(L3)
222
C count mod 4 = 1: count = 1 is finished at m1.  Otherwise n = 3 - count and
C w3:w2 = up[0] * v0 + cy, then join at top.
223L(c01):	cmp	$1, n
224	jz	L(m1)
225	neg	n
226	add	$3, n
227	add	%rax, w2
228	mov	%rdx, w3
229	adc	$0, w3
230	jmp	L(top)
231
232	ALIGN(32)
C count = 1: store the low product half plus cy and return the high half plus
C the carry out of that addition.  The two mov instructions between add and
C adc do not change the carry flag.
233L(m1):	add	%rax, w2
234	mov	%rdx, %rax
235	mov	w2, -8(rp)
236	adc	$0, %rax
237	pop	%rbx
238IFDOS(`	pop	%rdi		')
239IFDOS(`	pop	%rsi		')
240	ret
241EPILOGUE()
242