dnl  AMD64 mpn_mullo_basecase optimised for Intel Broadwell.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2017 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C The inner loops of this code are the result of running a code generation and
C optimisation tool suite written by David Harvey and Torbjorn Granlund.
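
C mpn_mullo_basecase (rp, up, vp, n) computes the n least significant limbs
C of the full 2n-limb product {up,n} * {vp,n}, i.e. the product reduced mod
C B^n where B = 2^64.  As an illustrative reference only (not part of GMP),
C the computation corresponds to this C sketch, assuming 64-bit limbs and a
C compiler-provided __int128 type:
C
C	#include <stdint.h>
C
C	void mullo_ref (uint64_t *rp, const uint64_t *up,
C			const uint64_t *vp, long n)
C	{
C	  for (long i = 0; i < n; i++)
C	    rp[i] = 0;
C	  for (long i = 0; i < n; i++)	/* one pass per v limb */
C	    {
C	      uint64_t cy = 0;
C	      /* truncate each pass at column n; higher limbs are discarded */
C	      for (long j = 0; j < n - i; j++)
C		{
C		  unsigned __int128 t = (unsigned __int128) up[j] * vp[i]
C					+ rp[i+j] + cy;
C		  rp[i+j] = (uint64_t) t;
C		  cy = (uint64_t) (t >> 64);
C		}
C	    }
C	}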

define(`rp',	   `%rdi')
define(`up',	   `%rsi')
define(`vp_param', `%rdx')
define(`n',	   `%rcx')

define(`vp',	`%r11')
define(`jmpreg',`%rbx')
define(`nn',    `%rbp')

C TODO
C  * Suppress more rp[] rewrites in corner.
C  * Rearrange feed-in jumps for short branch forms.
C  * Perhaps roll out the heavy artillery and 8-way unroll outer loop.  Since
C    feed-in code implodes, the blow-up will not be more than perhaps 4x.
C  * Micro-optimise critical lead-in code block around L(ent).
C  * Write n < 4 code specifically for Broadwell (current code is for Haswell).

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_mullo_basecase)
	FUNC_ENTRY(4)
	cmp	$4, R32(n)
	jae	L(big)

	mov	vp_param, vp
	mov	(up), %rdx

	cmp	$2, R32(n)
	jae	L(gt1)
L(n1):	imul	(vp), %rdx
	mov	%rdx, (rp)
	FUNC_EXIT()
	ret
L(gt1):	ja	L(gt2)
L(n2):	mov	(vp), %r9
	mulx(	%r9, %rax, %rdx)	C u0 x v0
	mov	%rax, (rp)
	mov	8(up), %rax
	imul	%r9, %rax		C u1 x v0, low limb only
	add	%rax, %rdx
	mov	8(vp), %r9
	mov	(up), %rcx
	imul	%r9, %rcx		C u0 x v1, low limb only
	add	%rcx, %rdx
	mov	%rdx, 8(rp)
	FUNC_EXIT()
	ret
L(gt2):
L(n3):	mov	(vp), %r9
	mulx(	%r9, %rax, %r10)	C u0 x v0
	mov	%rax, (rp)
	mov	8(up), %rdx
	mulx(	%r9, %rax, %rdx)	C u1 x v0
	imul	16(up), %r9		C u2 x v0
	add	%rax, %r10
	adc	%rdx, %r9
	mov	8(vp), %r8
	mov	(up), %rdx
	mulx(	%r8, %rax, %rdx)	C u0 x v1
	add	%rax, %r10
	adc	%rdx, %r9
	imul	8(up), %r8		C u1 x v1
	add	%r8, %r9
	mov	%r10, 8(rp)
	mov	16(vp), %r10
	mov	(up), %rax
	imul	%rax, %r10		C u0 x v2
	add	%r10, %r9
	mov	%r9, 16(rp)
	FUNC_EXIT()
	ret

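C The code for n >= 4 works as follows.  A first mul_1-style pass computes
C rp[0..n-2] = {up,n-1} * vp[0], entered through the L(mf0..7) feed-in
C blocks via a jump table; the top limb rp[n-1] is instead accumulated in
C %r14, seeded early with up[0] * vp[n-1].  Each later outer pass adds the
C remaining low part of up[] times the next v limb into rp[] (an
C addmul_1-style pass over a shrinking triangle) through the L(f0..7)
C feed-ins selected by jmpreg, again diverting the topmost partial product
C into %r14.  The corner code at L(cor)/L(c2) finishes the last columns.
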
	ALIGN(16)
L(big):	push	%r14
	push	%r12
	push	%rbx
	push	%rbp
	mov	-8(vp_param,n,8), %r14	C FIXME Put at absolute end
	imul	(up), %r14		C FIXME Put at absolute end
	lea	-3(n), R32(nn)
	lea	8(vp_param), vp
	mov	(vp_param), %rdx

	mov	R32(n), R32(%rax)
	shr	$3, R32(n)
	and	$7, R32(%rax)		C clear OF, CF as side-effect
	lea	L(mtab)(%rip), %r10
ifdef(`PIC',
`	movslq	(%r10,%rax,4), %rax
	lea	(%rax, %r10), %r10
	jmp	*%r10
',`
	jmp	*(%r10,%rax,8)
')
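
C The jump table below is indexed by n mod 8, yet entry r dispatches to
C L(mf((r-1) mod 8)): the first pass multiplies only the n-1 low limbs of
C up[] by vp[0], the final partial product being folded into %r14 at
C L(ent) (or L(c2)).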

L(mf0):	mulx(	(up), %r10, %r8)
	lea	56(up), up
	lea	-8(rp), rp
	lea	L(f7)(%rip), jmpreg
	jmp	L(mb0)

L(mf3):	mulx(	(up), %r9, %rax)
	lea	16(up), up
	lea	16(rp), rp
	jrcxz	L(mc)
	inc	R32(n)
	lea	L(f2)(%rip), jmpreg
	jmp	L(mb3)

L(mc):	mulx(	-8,(up), %r10, %r8)
	add	%rax, %r10
	mov	%r9, -16(rp)
	mulx(	(up), %r9, %rax)
	mov	%r10, -8(rp)
	adc	%r8, %r9
	mov	%r9, (rp)
	jmp	L(c2)

L(mf4):	mulx(	(up), %r10, %r8)
	lea	24(up), up
	lea	24(rp), rp
	inc	R32(n)
	lea	L(f3)(%rip), jmpreg
	jmp	L(mb4)

L(mf5):	mulx(	(up), %r9, %rax)
	lea	32(up), up
	lea	32(rp), rp
	inc	R32(n)
	lea	L(f4)(%rip), jmpreg
	jmp	L(mb5)

L(mf6):	mulx(	(up), %r10, %r8)
	lea	40(up), up
	lea	40(rp), rp
	inc	R32(n)
	lea	L(f5)(%rip), jmpreg
	jmp	L(mb6)

L(mf7):	mulx(	(up), %r9, %rax)
	lea	48(up), up
	lea	48(rp), rp
	lea	L(f6)(%rip), jmpreg
	jmp	L(mb7)

L(mf1):	mulx(	(up), %r9, %rax)
	lea	L(f0)(%rip), jmpreg
	jmp	L(mb1)

L(mf2):	mulx(	(up), %r10, %r8)
	lea	8(up), up
	lea	8(rp), rp
	lea	L(f1)(%rip), jmpreg
	mulx(	(up), %r9, %rax)

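C Main mul_1 loop, 8-way unrolled: mulx products chained with a plain adc
C carry chain.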
C FIXME ugly fallthrough FIXME
	ALIGN(32)
L(mtop):mov	%r10, -8(rp)
	adc	%r8, %r9
L(mb1):	mulx(	8,(up), %r10, %r8)
	adc	%rax, %r10
	lea	64(up), up
	mov	%r9, (rp)
L(mb0):	mov	%r10, 8(rp)
	mulx(	-48,(up), %r9, %rax)
	lea	64(rp), rp
	adc	%r8, %r9
L(mb7):	mulx(	-40,(up), %r10, %r8)
	mov	%r9, -48(rp)
	adc	%rax, %r10
L(mb6):	mov	%r10, -40(rp)
	mulx(	-32,(up), %r9, %rax)
	adc	%r8, %r9
L(mb5):	mulx(	-24,(up), %r10, %r8)
	mov	%r9, -32(rp)
	adc	%rax, %r10
L(mb4):	mulx(	-16,(up), %r9, %rax)
	mov	%r10, -24(rp)
	adc	%r8, %r9
L(mb3):	mulx(	-8,(up), %r10, %r8)
	adc	%rax, %r10
	mov	%r9, -16(rp)
	dec	R32(n)
	mulx(	(up), %r9, %rax)
	jnz	L(mtop)

L(mend):mov	%r10, -8(rp)
	adc	%r8, %r9
	mov	%r9, (rp)
	adc	%rcx, %rax

	lea	8(,nn,8), %r12
	neg	%r12
	shr	$3, R32(nn)
	jmp	L(ent)
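
C Outer-loop feed-ins.  jmpreg steps down one residue class per pass
C (f2 -> f1 -> f0 -> f7 -> ...) as the triangle loses one limb per v limb,
C and %r12 holds the negative byte offset used at L(ent) to rewind the up
C and rp pointers; it grows by 8 each pass.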

L(f0):	mulx(	(up), %r10, %r8)
	lea	-8(up), up
	lea	-8(rp), rp
	lea	L(f7)(%rip), jmpreg
	jmp	L(b0)

L(f1):	mulx(	(up), %r9, %rax)
	lea	-1(nn), R32(nn)
	lea	L(f0)(%rip), jmpreg
	jmp	L(b1)

L(end):	adox(	(rp), %r9)
	mov	%r9, (rp)
	adox(	%rcx, %rax)		C relies on rcx = 0
	adc	%rcx, %rax		C FIXME suppress, use adc below; reqs ent path edits
	lea	8(%r12), %r12
L(ent):	mulx(	8,(up), %r10, %r8)	C r8 unused (use imul?)
	add	%rax, %r14
	add	%r10, %r14		C h
	lea	(up,%r12), up		C reset up
	lea	8(rp,%r12), rp		C reset rp
	mov	(vp), %rdx
	lea	8(vp), vp
	or	R32(nn), R32(n)		C copy count, clear CF,OF (n = 0 prior)
	jmp	*jmpreg

L(f7):	mulx(	(up), %r9, %rax)
	lea	-16(up), up
	lea	-16(rp), rp
	lea	L(f6)(%rip), jmpreg
	jmp	L(b7)

L(f2):	mulx(	(up), %r10, %r8)
	lea	8(up), up
	lea	8(rp), rp
	mulx(	(up), %r9, %rax)
	lea	L(f1)(%rip), jmpreg

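C Main addmul_1 loop, 8-way unrolled, using both ADX carry chains: adcx
C propagates the multiply carries while adox folds in the old rp[] limbs.
C jrcxz exits the loop without disturbing either flag chain.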
C FIXME ugly fallthrough FIXME
	ALIGN(32)
L(top):	adox(	-8,(rp), %r10)
	adcx(	%r8, %r9)
	mov	%r10, -8(rp)
	jrcxz	L(end)
L(b1):	mulx(	8,(up), %r10, %r8)
	adox(	(rp), %r9)
	lea	-1(n), R32(n)
	mov	%r9, (rp)
	adcx(	%rax, %r10)
L(b0):	mulx(	16,(up), %r9, %rax)
	adcx(	%r8, %r9)
	adox(	8,(rp), %r10)
	mov	%r10, 8(rp)
L(b7):	mulx(	24,(up), %r10, %r8)
	lea	64(up), up
	adcx(	%rax, %r10)
	adox(	16,(rp), %r9)
	mov	%r9, 16(rp)
L(b6):	mulx(	-32,(up), %r9, %rax)
	adox(	24,(rp), %r10)
	adcx(	%r8, %r9)
	mov	%r10, 24(rp)
L(b5):	mulx(	-24,(up), %r10, %r8)
	adcx(	%rax, %r10)
	adox(	32,(rp), %r9)
	mov	%r9, 32(rp)
L(b4):	mulx(	-16,(up), %r9, %rax)
	adox(	40,(rp), %r10)
	adcx(	%r8, %r9)
	mov	%r10, 40(rp)
L(b3):	adox(	48,(rp), %r9)
	mulx(	-8,(up), %r10, %r8)
	mov	%r9, 48(rp)
	lea	64(rp), rp
	adcx(	%rax, %r10)
	mulx(	(up), %r9, %rax)
	jmp	L(top)

L(f6):	mulx(	(up), %r10, %r8)
	lea	40(up), up
	lea	-24(rp), rp
	lea	L(f5)(%rip), jmpreg
	jmp	L(b6)

L(f5):	mulx(	(up), %r9, %rax)
	lea	32(up), up
	lea	-32(rp), rp
	lea	L(f4)(%rip), jmpreg
	jmp	L(b5)

L(f4):	mulx(	(up), %r10, %r8)
	lea	24(up), up
	lea	-40(rp), rp
	lea	L(f3)(%rip), jmpreg
	jmp	L(b4)

L(f3):	mulx(	(up), %r9, %rax)
	lea	16(up), up
	lea	-48(rp), rp
	jrcxz	L(cor)
	lea	L(f2)(%rip), jmpreg
	jmp	L(b3)

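C Corner code, run for the final outer passes when only a few u limbs
C remain: it completes the last columns with truncated products and
C finally stores the top limb, accumulated in %r14, at 8(rp).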
L(cor):	adox(	48,(rp), %r9)
	mulx(	-8,(up), %r10, %r8)
	mov	%r9, 48(rp)
	lea	64(rp), rp
	adcx(	%rax, %r10)
	mulx(	(up), %r9, %rax)
	adox(	-8,(rp), %r10)
	adcx(	%r8, %r9)
	mov	%r10, -8(rp)		C FIXME suppress
	adox(	(rp), %r9)
	mov	%r9, (rp)		C FIXME suppress
	adox(	%rcx, %rax)
L(c2):
	mulx(	8,(up), %r10, %r8)
	adc	%rax, %r14
	add	%r10, %r14
	mov	(vp), %rdx
	test	R32(%rcx), R32(%rcx)
	mulx(	-16,(up), %r10, %r8)
	mulx(	-8,(up), %r9, %rax)
	adox(	-8,(rp), %r10)
	adcx(	%r8, %r9)
	mov	%r10, -8(rp)
	adox(	(rp), %r9)
	adox(	%rcx, %rax)
	adc	%rcx, %rax
	mulx(	(up), %r10, %r8)
	add	%rax, %r14
	add	%r10, %r14
	mov	8(vp), %rdx
	mulx(	-16,(up), %rcx, %rax)
	add	%r9, %rcx
	mov	%rcx, (rp)
	adc	$0, %rax
	mulx(	-8,(up), %r10, %r8)
	add	%rax, %r14
	add	%r10, %r14
	mov	%r14, 8(rp)
	pop	%rbp
	pop	%rbx
	pop	%r12
	pop	%r14
	FUNC_EXIT()
	ret
EPILOGUE()
	JUMPTABSECT
	ALIGN(8)
L(mtab):JMPENT(	L(mf7), L(mtab))
	JMPENT(	L(mf0), L(mtab))
	JMPENT(	L(mf1), L(mtab))
	JMPENT(	L(mf2), L(mtab))
	JMPENT(	L(mf3), L(mtab))
	JMPENT(	L(mf4), L(mtab))
	JMPENT(	L(mf5), L(mtab))
	JMPENT(	L(mf6), L(mtab))