dnl  AMD64 mpn_mullo_basecase optimised for AMD Zen.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2017 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C The inner loops of this code are the result of running a code generation and
C optimisation tool suite written by David Harvey and Torbjorn Granlund.
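
C mpn_mullo_basecase(rp, up, vp, n) computes the n least significant limbs
C of the 2n-limb product {up,n} * {vp,n} and stores them at rp.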

define(`rp',	   `%rdi')
define(`up',	   `%rsi')
define(`vp_param', `%rdx')
define(`n',	   `%rcx')

define(`vp',	`%r11')
define(`nn',    `%rbp')

C TODO
C  * Rearrange feed-in jumps for short branch forms.
C  * Roll out the heavy artillery and 4-way unroll outer loop.  Since feed-in
C    code implodes, the blow-up will not be more than perhaps 2.5x.
C  * Micro-optimise critical lead-in code blocks.
C  * Clean up register use, e.g. r15 vs vp, disuse of nn, etc.
C  * Write n < 4 code specifically for Zen (current code is for Haswell).

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_mullo_basecase)
	FUNC_ENTRY(4)
	cmp	$4, R32(n)
	jae	L(big)

	mov	vp_param, vp
	mov	(up), %rdx
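C mulx takes one multiplicand implicitly from %rdx, so u0 is kept there for
C the small-n paths below.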

	cmp	$2, R32(n)
	jae	L(gt1)
L(n1):	imul	(vp), %rdx
	mov	%rdx, (rp)
	FUNC_EXIT()
	ret
L(gt1):	ja	L(gt2)
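C n = 2: rp[1] = hi(u0*v0) + lo(u1*v0) + lo(u0*v1); only the low halves of
C the cross products are needed, hence the imul instructions.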
L(n2):	mov	(vp), %r9
	mulx(	%r9, %rax, %rdx)
	mov	%rax, (rp)
	mov	8(up), %rax
	imul	%r9, %rax
	add	%rax, %rdx
	mov	8(vp), %r9
	mov	(up), %rcx
	imul	%r9, %rcx
	add	%rcx, %rdx
	mov	%rdx, 8(rp)
	FUNC_EXIT()
	ret
L(gt2):
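C n = 3: the column sums for rp[1] and rp[2] accumulate in %r10 and %r9;
C the topmost cross products (u2*v0, u1*v1, u0*v2) again need only imul.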
L(n3):	mov	(vp), %r9
	mulx(	%r9, %rax, %r10)	C u0 x v0
	mov	%rax, (rp)
	mov	8(up), %rdx
	mulx(	%r9, %rax, %rdx)	C u1 x v0
	imul	16(up), %r9		C u2 x v0
	add	%rax, %r10
	adc	%rdx, %r9
	mov	8(vp), %r8
	mov	(up), %rdx
	mulx(	%r8, %rax, %rdx)	C u0 x v1
	add	%rax, %r10
	adc	%rdx, %r9
	imul	8(up), %r8		C u1 x v1
	add	%r8, %r9
	mov	%r10, 8(rp)
	mov	16(vp), %r10
	mov	(up), %rax
	imul	%rax, %r10		C u0 x v2
	add	%r10, %r9
	mov	%r9, 16(rp)
	FUNC_EXIT()
	ret

	ALIGN(16)
L(big):	push	%r15
	push	%r14
	push	%r13
	push	%r12
	push	%rbp
	push	%rbx

	mov	(up), %r9
	lea	-8(up,n,8), up
	lea	-40(rp,n,8), rp

	mov	$4, R32(%r14)
	sub	n, %r14
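C %r14 = 4 - n serves as a (negative) loop index; its low two bits select
C one of the four mod-4 feed-in blocks L(mb0)..L(mb3) below.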
	mov	-8(vp_param,n,8), %rbp
	imul	%r9, %rbp
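C %rbp accumulates the top result limb rp[n-1], seeded here with the low
C half of u0 * v[n-1].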
	lea	8(vp_param), %r15
	mov	(vp_param), %rdx

	test	$1, R8(%r14)
	jnz	L(mx0)
L(mx1):	test	$2, R8(%r14)
	jz	L(mb3)

L(mb1):	mulx(	%r9, %rbx, %rax)
	lea	-2(%r14), n
	.byte	0xc4,0x22,0xb3,0xf6,0x44,0xf6,0xf0	C mulx -0x10(%rsi,%r14,8),%r9,%r8
	.byte	0xc4,0x22,0xa3,0xf6,0x54,0xf6,0xf8	C mulx -0x8(%rsi,%r14,8),%r11,%r10
	jmp	L(mlo1)

L(mb3):	mulx(	%r9, %r11, %r10)
	.byte	0xc4,0x22,0x93,0xf6,0x64,0xf6,0xf0	C mulx -0x10(%rsi,%r14,8),%r13,%r12
	.byte	0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0xf8	C mulx -0x8(%rsi,%r14,8),%rbx,%rax
	lea	(%r14), n
	jrcxz	L(x)
	jmp	L(mlo3)
L(x):	jmp	L(mcor)

L(mb2):	mulx(	%r9, %r13, %r12)
	.byte	0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0xf0	C mulx -0x10(%rsi,%r14,8),%rbx,%rax
	lea	-1(%r14), n
	.byte	0xc4,0x22,0xb3,0xf6,0x44,0xf6,0xf8	C mulx -0x8(%rsi,%r14,8),%r9,%r8
	jmp	L(mlo2)

L(mx0):	test	$2, R8(%r14)
	jz	L(mb2)

L(mb0):	mulx(	%r9, %r9, %r8)
	.byte	0xc4,0x22,0xa3,0xf6,0x54,0xf6,0xf0	C mulx -0x10(%rsi,%r14,8),%r11,%r10
	.byte	0xc4,0x22,0x93,0xf6,0x64,0xf6,0xf8	C mulx -0x8(%rsi,%r14,8),%r13,%r12
	lea	-3(%r14), n
	jmp	L(mlo0)

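C First pass: a 4-way unrolled mul_1-style loop computing {up} * v0.  The
C index n counts upward from a negative value; jrcxz exits once it hits 0.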
	ALIGN(16)
L(mtop):jrcxz	L(mend)
	adc	%r8, %r11
	mov	%r9, (rp,n,8)
L(mlo3):.byte	0xc4,0x62,0xb3,0xf6,0x04,0xce		C mulx (up,n,8), %r9, %r8
	adc	%r10, %r13
	mov	%r11, 8(rp,n,8)
L(mlo2):.byte	0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08	C mulx 8(up,n,8), %r11, %r10
	adc	%r12, %rbx
	mov	%r13, 16(rp,n,8)
L(mlo1):.byte	0xc4,0x62,0x93,0xf6,0x64,0xce,0x10	C mulx 16(up,n,8), %r13, %r12
	adc	%rax, %r9
	mov	%rbx, 24(rp,n,8)
L(mlo0):.byte	0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18	C mulx 24(up,n,8), %rbx, %rax
	lea	4(n), n
	jmp	L(mtop)

L(mend):mov	%r9, (rp)
	adc	%r8, %r11
	mov	%r11, 8(rp)
	adc	%r10, %r13
	mov	%r13, 16(rp)
	adc	%r12, %rbx
	mov	%rbx, 24(rp)

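C Outer loop: one addmul_1-style pass per remaining v limb.  %r15 walks
C through the v operand while up is stepped back, so each pass is one limb
C shorter: only the low n limbs of the product are wanted.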
L(outer):
	mulx(	(up), %r10, %r8)	C FIXME r8 unused (use imul?)
	adc	%rax, %rbp
	add	%r10, %rbp
	mov	(%r15), %rdx
	add	$8, %r15
	mov	-24(up,%r14,8), %r8
	lea	-8(up), up

	test	$1, R8(%r14)
	jz	L(x0)
L(x1):	test	$2, R8(%r14)
	jnz	L(b3)

L(b1):	mulx(	%r8, %rbx, %rax)
	lea	-1(%r14), n
	.byte	0xc4,0x62,0xb3,0xf6,0x04,0xce		C mulx (%rsi,%rcx,8),%r9,%r8
	.byte	0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08	C mulx 0x8(%rsi,%rcx,8),%r11,%r10
	jmp	L(lo1)

L(x0):	test	$2, R8(%r14)
	jz	L(b2)

L(b0):	mulx(	%r8, %r9, %r8)
	lea	-2(%r14), n
	.byte	0xc4,0x22,0xa3,0xf6,0x54,0xf6,0xf8	C mulx -0x8(%rsi,%r14,8),%r11,%r10
	.byte	0xc4,0x22,0x93,0xf6,0x24,0xf6		C mulx (%rsi,%r14,8),%r13,%r12
	jmp	L(lo0)

L(b3):	mulx(	%r8, %r11, %r10)
	lea	1(%r14), n
	.byte	0xc4,0x22,0x93,0xf6,0x64,0xf6,0xf8	C mulx -0x8(%rsi,%r14,8),%r13,%r12
	.byte	0xc4,0xa2,0xe3,0xf6,0x04,0xf6		C mulx (%rsi,%r14,8),%rbx,%rax
	add	%r10, %r13
	adc	%r12, %rbx
	adc	$0, %rax
	jrcxz	L(cor)
	jmp	L(lo3)

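C Wind-down: once only the top corner of the product remains, finish the
C outstanding cross products with straight-line code; %rbp delivers the
C final limb to 32(rp).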
L(cor):	add	8(rp), %r11
	mov	16(rp), %r10
	mov	24(rp), %r12
L(mcor):mov	%r11, 8(rp)
	adc	%r10, %r13
	adc	%r12, %rbx
	mulx(	(up), %r10, %r8)	C FIXME r8 unused (use imul?)
	adc	%rax, %rbp
	add	%r10, %rbp
	mov	(%r15), %rdx
	mov	-24(up), %r8
	mulx(	%r8, %r9, %r12)
	mulx(	-16,(up), %r14, %rax)
	add	%r12, %r14
	adc	$0, %rax
	adc	%r9, %r13
	mov	%r13, 16(rp)
	adc	%r14, %rbx
	mulx(	-8,(up), %r10, %r8)	C FIXME r8 unused (use imul?)
	adc	%rax, %rbp
	add	%r10, %rbp
	mov	8(%r15), %rdx
	mulx(	-24,(up), %r14, %rax)
	add	%r14, %rbx
	mov	%rbx, 24(rp)
	mulx(	-16,(up), %r10, %r8)	C FIXME r8 unused (use imul?)
	adc	%rax, %rbp
	add	%r10, %rbp
	mov	%rbp, 32(rp)
	pop	%rbx
	pop	%rbp
	pop	%r12
	pop	%r13
	pop	%r14
	pop	%r15
	FUNC_EXIT()
	ret

L(b2):	mulx(	%r8, %r13, %r12)
	lea	(%r14), n
	.byte	0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0xf8	C mulx -0x8(%rsi,%r14,8),%rbx,%rax
	add	%r12, %rbx
	adc	$0, %rax
	.byte	0xc4,0x22,0xb3,0xf6,0x04,0xf6		C mulx (%rsi,%r14,8),%r9,%r8
	jmp	L(lo2)

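C Inner loop: 4-way unrolled addmul_1 adding u * v_j into {rp}.  n counts
C up by 4 from a negative value; the loop exits when it turns non-negative.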
	ALIGN(16)
L(top):	add	%r9, (rp,n,8)
L(lo3):	.byte	0xc4,0x62,0xb3,0xf6,0x04,0xce		C mulx (up,n,8), %r9, %r8
	adc	%r11, 8(rp,n,8)
L(lo2):	.byte	0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08	C mulx 8(up,n,8), %r11, %r10
	adc	%r13, 16(rp,n,8)
L(lo1):	.byte	0xc4,0x62,0x93,0xf6,0x64,0xce,0x10	C mulx 16(up,n,8), %r13, %r12
	adc	%rbx, 24(rp,n,8)
	adc	%rax, %r9
L(lo0):	.byte	0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18	C mulx 24(up,n,8), %rbx, %rax
	adc	%r8, %r11
	adc	%r10, %r13
	adc	%r12, %rbx
	adc	$0, %rax
	add	$4, n
	js	L(top)

	add	%r9, (rp)
	adc	%r11, 8(rp)
	adc	%r13, 16(rp)
	adc	%rbx, 24(rp)
	inc	%r14
	jmp	L(outer)
EPILOGUE()