dnl  AMD64 mpn_mullo_basecase optimised for Intel Haswell.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  Copyright 2008, 2009, 2011-2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C cycles/limb	mul_2		addmul_2
C AMD K8,K9	n/a		n/a
C AMD K10	n/a		n/a
C AMD bull	n/a		n/a
C AMD pile	n/a		n/a
C AMD steam	 ?		 ?
C AMD bobcat	n/a		n/a
C AMD jaguar	 ?		 ?
C Intel P4	n/a		n/a
C Intel core	n/a		n/a
C Intel NHM	n/a		n/a
C Intel SBR	n/a		n/a
C Intel IBR	n/a		n/a
C Intel HWL	 1.86		 2.15
C Intel BWL	 ?		 ?
C Intel atom	n/a		n/a
C VIA nano	n/a		n/a

C The inner loops of this code are the result of running a code generation and
C optimisation tool suite written by David Harvey and Torbjörn Granlund.

C TODO
C   * Implement proper cor2, replacing current cor0.
C   * Micro-optimise.
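
C For reference, a minimal C model of what this routine computes: the low n
C limbs of {up,n} times {vp,n}.  This is only an illustrative sketch, not
C GMP's actual generic code; ref_mullo is a hypothetical name, and the use
C of unsigned __int128 assumes a 64-bit limb and a GCC-style compiler.
C
C	void ref_mullo (mp_limb_t *rp, const mp_limb_t *up,
C			const mp_limb_t *vp, mp_size_t n)
C	{
C	  for (mp_size_t i = 0; i < n; i++)
C	    rp[i] = 0;
C	  for (mp_size_t i = 0; i < n; i++)
C	    {
C	      mp_limb_t cy = 0;			/* carry within this row */
C	      for (mp_size_t j = 0; j < n - i; j++)  /* truncate at limb n */
C		{
C		  unsigned __int128 p = (unsigned __int128) up[i] * vp[j]
C					+ rp[i+j] + cy;
C		  rp[i+j] = (mp_limb_t) p;	/* low limb */
C		  cy = (mp_limb_t) (p >> 64);	/* high limb carries on */
C		}
C	    }
C	}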

define(`rp',       `%rdi')
define(`up',       `%rsi')
define(`vp_param', `%rdx')
define(`n',        `%rcx')

define(`vp',       `%r8')
define(`X0',       `%r14')
define(`X1',       `%r15')

define(`w0',       `%r10')
define(`w1',       `%r11')
define(`w2',       `%r12')
define(`w3',       `%r13')
define(`i',        `%rbp')
define(`v0',       `%r9')
define(`v1',       `%rbx')

C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_mullo_basecase)
	FUNC_ENTRY(4)

	mov	vp_param, vp
	mov	(up), %rdx		C u0, the implicit multiplicand of mulx

	cmp	$4, n
	jb	L(small)

	push	%rbx
	push	%rbp
	push	%r12
	push	%r13

	mov	(vp), v0
	mov	8(vp), v1

	lea	2(n), i
	shr	$2, i
	neg	n
	add	$2, n
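
C Here i = (n+2)/4 counts iterations of the 4-way unrolled mul_2 loop, and
C n has been replaced by 2-n, a negative counter that each outer pass
C advances by 2.  The tests below dispatch on the low two bits of n to one
C of four entry points into the unrolled loop.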

	push	up			C put entry `up' on stack

	test	$1, R8(n)
	jnz	L(m2x1)

L(m2x0):mulx(	v0, w0, w3)
	xor	R32(w2), R32(w2)
	test	$2, R8(n)
	jz	L(m2b2)

L(m2b0):lea	-8(rp), rp
	lea	-8(up), up
	jmp	L(m2e0)

L(m2b2):lea	-24(rp), rp
	lea	8(up), up
	jmp	L(m2e2)

L(m2x1):mulx(	v0, w2, w1)
	xor	R32(w0), R32(w0)
	test	$2, R8(n)
	jnz	L(m2b3)

L(m2b1):jmp	L(m2e1)

L(m2b3):lea	-16(rp), rp
	lea	-16(up), up
	jmp	L(m2e3)

	ALIGN(16)
L(m2tp):mulx(	v1, %rax, w0)
	add	%rax, w2
	mov	(up), %rdx
	mulx(	v0, %rax, w1)
	adc	$0, w0
	add	%rax, w2
	adc	$0, w1
	add	w3, w2
L(m2e1):mov	w2, (rp)
	adc	$0, w1
	mulx(	v1, %rax, w2)
	add	%rax, w0
	mov	8(up), %rdx
	adc	$0, w2
	mulx(	v0, %rax, w3)
	add	%rax, w0
	adc	$0, w3
	add	w1, w0
L(m2e0):mov	w0, 8(rp)
	adc	$0, w3
	mulx(	v1, %rax, w0)
	add	%rax, w2
	mov	16(up), %rdx
	mulx(	v0, %rax, w1)
	adc	$0, w0
	add	%rax, w2
	adc	$0, w1
	add	w3, w2
L(m2e3):mov	w2, 16(rp)
	adc	$0, w1
	mulx(	v1, %rax, w2)
	add	%rax, w0
	mov	24(up), %rdx
	adc	$0, w2
	mulx(	v0, %rax, w3)
	add	%rax, w0
	adc	$0, w3
	add	w1, w0
	lea	32(up), up
L(m2e2):mov	w0, 24(rp)
	adc	$0, w3
	dec	i
	lea	32(rp), rp
	jnz	L(m2tp)

L(m2ed):mulx(	v1, %rax, w0)
	add	%rax, w2
	mov	(up), %rdx
	mulx(	v0, %rax, w1)
	add	w2, %rax
	add	w3, %rax
	mov	%rax, (rp)

	mov	(%rsp), up		C restore `up' to beginning
	lea	16(vp), vp
	lea	8(rp,n,8), rp		C put back rp to old rp + 2
	add	$2, n
	jge	L(cor1)

	push	%r14
	push	%r15
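
C r14/r15 (X0/X1) are callee-saved; they are pushed only on this path since
C only the addmul_2 code below uses them.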

L(outer):
	mov	(vp), v0
	mov	8(vp), v1

	lea	(n), i
	sar	$2, i
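C n is negative here, so the arithmetic shift makes i a negative iteration
C count that the loop below increments up towards zero.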

	mov	(up), %rdx
	test	$1, R8(n)
	jnz	L(bx1)

L(bx0):	mov	(rp), X1
	mov	8(rp), X0
	mulx(	v0, %rax, w3)
	add	%rax, X1
	adc	$0, w3
	mulx(	v1, %rax, w0)
	add	%rax, X0
	adc	$0, w0
	mov	8(up), %rdx
	mov	X1, (rp)
	mulx(	v0, %rax, w1)
	test	$2, R8(n)
	jz	L(b2)

L(b0):	lea	8(rp), rp
	lea	8(up), up
	jmp	L(lo0)

L(b2):	mov	16(rp), X1
	lea	24(rp), rp
	lea	24(up), up
	jmp	L(lo2)

L(bx1):	mov	(rp), X0
	mov	8(rp), X1
	mulx(	v0, %rax, w1)
	add	%rax, X0
	mulx(	v1, %rax, w2)
	adc	$0, w1
	mov	X0, (rp)
	add	%rax, X1
	adc	$0, w2
	mov	8(up), %rdx
	test	$2, R8(n)
	jnz	L(b3)

L(b1):	lea	16(up), up
	lea	16(rp), rp
	jmp	L(lo1)

L(b3):	mov	16(rp), X0
	lea	32(up), up
	mulx(	v0, %rax, w3)
	inc	i
	jz	L(cj3)
	jmp	L(lo3)

	ALIGN(16)
L(top):	mulx(	v0, %rax, w3)
	add	w0, X1
	adc	$0, w2
L(lo3):	add	%rax, X1
	adc	$0, w3
	mulx(	v1, %rax, w0)
	add	%rax, X0
	adc	$0, w0
	lea	32(rp), rp
	add	w1, X1
	mov	-16(up), %rdx
	mov	X1, -24(rp)
	adc	$0, w3
	add	w2, X0
	mov	-8(rp), X1
	mulx(	v0, %rax, w1)
	adc	$0, w0
L(lo2):	add	%rax, X0
	mulx(	v1, %rax, w2)
	adc	$0, w1
	add	w3, X0
	mov	X0, -16(rp)
	adc	$0, w1
	add	%rax, X1
	adc	$0, w2
	add	w0, X1
	mov	-8(up), %rdx
	adc	$0, w2
L(lo1):	mulx(	v0, %rax, w3)
	add	%rax, X1
	adc	$0, w3
	mov	(rp), X0
	mulx(	v1, %rax, w0)
	add	%rax, X0
	adc	$0, w0
	add	w1, X1
	mov	X1, -8(rp)
	adc	$0, w3
	mov	(up), %rdx
	add	w2, X0
	mulx(	v0, %rax, w1)
	adc	$0, w0
L(lo0):	add	%rax, X0
	adc	$0, w1
	mulx(	v1, %rax, w2)
	add	w3, X0
	mov	8(rp), X1
	mov	X0, (rp)
	mov	16(rp), X0
	adc	$0, w1
	add	%rax, X1
	adc	$0, w2
	mov	8(up), %rdx
	lea	32(up), up
	inc	i
	jnz	L(top)

L(end):	mulx(	v0, %rax, w3)
	add	w0, X1
	adc	$0, w2
L(cj3):	add	%rax, X1
	adc	$0, w3
	mulx(	v1, %rax, w0)
	add	%rax, X0
	add	w1, X1
	mov	-16(up), %rdx
	mov	X1, 8(rp)
	adc	$0, w3
	add	w2, X0
	mulx(	v0, %rax, w1)
	add	X0, %rax
	add	w3, %rax
	mov	%rax, 16(rp)

	mov	16(%rsp), up		C restore `up' to beginning
	lea	16(vp), vp
	lea	24(rp,n,8), rp		C put back rp to old rp + 2
	add	$2, n
	jl	L(outer)

	pop	%r15
	pop	%r14

	jnz	L(cor0)

L(cor1):mov	(vp), v0
	mov	8(vp), v1
	mov	(up), %rdx
	mulx(	v0, %r12, %rbp)		C u0 x v2
	add	(rp), %r12		C FIXME: rp[0] still available in reg?
	adc	%rax, %rbp
	mov	8(up), %r10
	imul	v0, %r10
	imul	v1, %rdx
	mov	%r12, (rp)
	add	%r10, %rdx
	add	%rbp, %rdx
	mov	%rdx, 8(rp)
	pop	%rax			C deallocate `up' copy
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	FUNC_EXIT()
	ret

L(cor0):mov	(vp), %r11
	imul	(up), %r11
	add	%rax, %r11
	mov	%r11, (rp)
	pop	%rax			C deallocate `up' copy
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	FUNC_EXIT()
	ret

	ALIGN(16)
L(small):
	cmp	$2, n
	jae	L(gt1)
L(n1):	imul	(vp), %rdx
	mov	%rdx, (rp)
	FUNC_EXIT()
	ret
L(gt1):	ja	L(gt2)
L(n2):	mov	(vp), %r9
	mulx(	%r9, %rax, %rdx)
	mov	%rax, (rp)
	mov	8(up), %rax
	imul	%r9, %rax
	add	%rax, %rdx
	mov	8(vp), %r9
	mov	(up), %rcx
	imul	%r9, %rcx
	add	%rcx, %rdx
	mov	%rdx, 8(rp)
	FUNC_EXIT()
	ret
L(gt2):
L(n3):	mov	(vp), %r9
	mulx(	%r9, %rax, %r10)	C u0 x v0
	mov	%rax, (rp)
	mov	8(up), %rdx
	mulx(	%r9, %rax, %rdx)	C u1 x v0
	imul	16(up), %r9		C u2 x v0
	add	%rax, %r10
	adc	%rdx, %r9
	mov	8(vp), %r11
	mov	(up), %rdx
	mulx(	%r11, %rax, %rdx)	C u0 x v1
	add	%rax, %r10
	adc	%rdx, %r9
	imul	8(up), %r11		C u1 x v1
	add	%r11, %r9
	mov	%r10, 8(rp)
	mov	16(vp), %r10
	mov	(up), %rax
	imul	%rax, %r10		C u0 x v2
	add	%r10, %r9
	mov	%r9, 16(rp)
	FUNC_EXIT()
	ret
EPILOGUE()