1dnl  AMD64 mpn_mul_2 optimised for Intel Haswell.
2
3dnl  Contributed to the GNU project by Torbjörn Granlund.
4
5dnl  Copyright 2013 Free Software Foundation, Inc.
6
7dnl  This file is part of the GNU MP Library.
8dnl
9dnl  The GNU MP Library is free software; you can redistribute it and/or modify
10dnl  it under the terms of either:
11dnl
12dnl    * the GNU Lesser General Public License as published by the Free
13dnl      Software Foundation; either version 3 of the License, or (at your
14dnl      option) any later version.
15dnl
16dnl  or
17dnl
18dnl    * the GNU General Public License as published by the Free Software
19dnl      Foundation; either version 2 of the License, or (at your option) any
20dnl      later version.
21dnl
22dnl  or both in parallel, as here.
23dnl
24dnl  The GNU MP Library is distributed in the hope that it will be useful, but
25dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
27dnl  for more details.
28dnl
29dnl  You should have received copies of the GNU General Public License and the
30dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
31dnl  see https://www.gnu.org/licenses/.
32
33include(`../config.m4')
34
35C	     cycles/limb
36C AMD K8,K9	 -
37C AMD K10	 -
38C AMD bull	 -
39C AMD pile	 -
40C AMD steam	 -
41C AMD excavator	 -
42C AMD bobcat	 -
43C AMD jaguar	 -
44C Intel P4	 -
45C Intel core	 -
46C Intel NHM	 -
47C Intel SBR	 -
48C Intel IBR	 -
49C Intel HWL      3.74
50C Intel BWL      4.21
51C Intel SKL      4.20
52C Intel atom	 -
53C Intel SLM	 -
54C VIA nano	 -
55
56C The loop of this code is the result of running a code generation and
57C optimisation tool suite written by David Harvey and Torbjörn Granlund.
58
59C TODO
60C  * Move test and jcc together, for insn fusion.
61
C Symbolic register names used by mpn_mul_2.
C
C Incoming arguments.  %rdi, %rsi, %rdx, %rcx are the first four integer
C argument registers of the System V AMD64 calling convention.
C   rp       destination pointer; result limbs are stored through it
C   up       source pointer; its limbs are loaded into %rdx as the multiplicand
C   n_param  limb count of up; only read during setup, because %rdx is
C            overwritten with up[0] before the first multiply
C   vp       pointer to the two multiplier limbs; only read during setup,
C            because %rcx is reused as w1 afterwards
62define(`rp',     `%rdi')
63define(`up',     `%rsi')
64define(`n_param',`%rdx')
65define(`vp',     `%rcx')
66
C Working registers.
C   v0, v1   the two multiplier limbs loaded from vp[0] and vp[1]
C   w0 - w3  rotating accumulators for partial products and carries
C            w0 is %rbx and w2 is %rbp, both callee-saved, so the routine
C            pushes them on entry and pops them before returning
C            w1 shares %rcx with vp
C   n        loop pass counter, computed as (n_param + 3) >> 2
67define(`v0', `%r8')
68define(`v1', `%r9')
69define(`w0', `%rbx')
70define(`w1', `%rcx')
71define(`w2', `%rbp')
72define(`w3', `%r10')
73define(`n',  `%r11')
74
75ABI_SUPPORT(DOS64)
76ABI_SUPPORT(STD64)
77
78ASM_START()
79	TEXT
80	ALIGN(32)
C ---------------------------------------------------------------------------
C mpn_mul_2 -- multiply an n-limb operand by a 2-limb operand.
C
C In:    rp       destination; limbs rp[0] .. rp[n_param] are written
C        up       source operand; n_param limbs are read
C        n_param  number of limbs at up
C        vp       multiplier; vp[0] and vp[1] are read
C Out:   %rax     most significant limb of the product, limb n_param + 1
C Saved: %rbx, %rbp are pushed on entry and popped before return
C Clobb: %rcx, %rdx, %rsi, %rdi, %r8 - %r11, flags
C
C Structure: a 4-way unrolled loop with four entry points lo0 .. lo3.  The
C feed-in code picks the entry from n_param mod 4 and biases rp and up so the
C fixed displacements inside the loop hit rp[0] and up[1] on the first pass.
C
C NOTE(review): n_param = 0 is not handled here; the pass counter would start
C at 0 and the dec at the loop bottom would wrap.  Presumably callers
C guarantee n_param >= 1 -- confirm.
C
C NOTE(review): FUNC_ENTRY, FUNC_EXIT, R8 and the mulx macro are defined
C outside this file.  The comments below assume that mulx(src, lo, hi) emits
C a BMI2 MULX computing %rdx * src with the low half in lo and the high half
C in hi and without writing flags, that R8(reg) names the low byte of reg,
C and that FUNC_ENTRY/FUNC_EXIT adapt the DOS64 argument registers -- confirm
C against the shared x86_64 m4 definitions.
C ---------------------------------------------------------------------------
81PROLOGUE(mpn_mul_2)
82	FUNC_ENTRY(4)
C w0 and w2 live in callee-saved registers, so preserve them.
83	push	%rbx
84	push	%rbp
85
C Fetch both multiplier limbs now; %rcx is reused as w1 later.
86	mov	(vp), v0
87	mov	8(vp), v1
88
C n = (n_param + 3) >> 2: number of loop passes, counting the partial first
C pass that starts at one of the lo* entry points.
89	lea	3(n_param), n
90	shr	$2, n
91
C Dispatch on n_param mod 4.  Bit 0 is tested here, bit 1 in each branch.
92	test	$1, R8(n_param)
93	jnz	L(bx1)
94
C n_param even.  w0 is zeroed as the empty carry-in accumulator.  The second
C test is the last read of n_param; the mov then replaces %rdx with up[0].
C The mov and the multiply between test and jz leave the flags as test set
C them, and test also leaves CF clear for the first adc at the entry point.
95L(bx0):	xor	w0, w0
96	test	$2, R8(n_param)
97	mov	(up), %rdx
98	mulx(	v0, w2, w1)
99	jz	L(lo0)
100
C n_param mod 4 == 2: enter at lo2.  Bias both pointers by -16 so that
C 16(rp) is rp[0] and 24(up) is up[1].
101L(b10):	lea	-16(rp), rp
102	lea	-16(up), up
103	jmp	L(lo2)
104
C n_param odd.  Same scheme as bx0 with the roles of the accumulator pairs
C swapped: w2 is the zeroed carry-in, w0/w3 receive v0 * up[0].
105L(bx1):	xor	w2, w2
106	test	$2, R8(n_param)
107	mov	(up), %rdx
108	mulx(	v0, w0, w3)
109	jnz	L(b11)
110
C n_param mod 4 == 1: enter at lo1.  rp is biased by -24 so 24(rp) is rp[0].
C up is biased by +8 because lo1 sits after the lea that advances up in this
C pass, so the next load through (up) at top must find up[1].
111L(b01):	lea	-24(rp), rp
112	lea	8(up), up
113	jmp	L(lo1)
114
C n_param mod 4 == 3: enter at lo3.  8(rp) is rp[0] and 16(up) is up[1].
115L(b11):	lea	-8(rp), rp
116	lea	-8(up), up
117	jmp	L(lo3)
118
C Main loop, four result limbs per pass.  Each quarter completes one limb:
C it adds the low half of v1 times the up limb still held in %rdx, loads the
C next up limb, adds the low half of v0 times that limb, adds the high part
C carried in from the previous quarter, and stores the limb at its lo* label.
C The two high halves just produced, plus carries, feed the next quarters.
C The trailing C 0 .. C 4 notes appear to give the limb position within the
C current pass that each instruction contributes to.
119	ALIGN(16)
120L(top):	mulx(	v1, %rax, w0)
121	add	%rax, w2		C 0
122	mov	(up), %rdx
123	mulx(	v0, %rax, w1)
124	adc	$0, w0			C 1
125	add	%rax, w2		C 0
126	adc	$0, w1			C 1
127	add	w3, w2			C 0
128L(lo0):	mov	w2, (rp)		C 0
129	adc	$0, w1			C 1
130	mulx(	v1, %rax, w2)
131	add	%rax, w0		C 1
132	mov	8(up), %rdx
133	adc	$0, w2			C 2
134	mulx(	v0, %rax, w3)
135	add	%rax, w0		C 1
136	adc	$0, w3			C 2
137	add	w1, w0			C 1
138L(lo3):	mov	w0, 8(rp)		C 1
139	adc	$0, w3			C 2
140	mulx(	v1, %rax, w0)
141	add	%rax, w2		C 2
142	mov	16(up), %rdx
143	mulx(	v0, %rax, w1)
144	adc	$0, w0			C 3
145	add	%rax, w2		C 2
146	adc	$0, w1			C 3
147	add	w3, w2			C 2
148L(lo2):	mov	w2, 16(rp)		C 2
149	adc	$0, w1			C 3
150	mulx(	v1, %rax, w2)
151	add	%rax, w0		C 3
152	mov	24(up), %rdx
153	adc	$0, w2			C 4
154	mulx(	v0, %rax, w3)
155	add	%rax, w0		C 3
156	adc	$0, w3			C 4
157	add	w1, w0			C 3
158	lea	32(up), up
159L(lo1):	mov	w0, 24(rp)		C 3
160	adc	$0, w3			C 4
C dec sets ZF for the jnz below; the lea in between does not write flags.
161	dec	n
162	lea	32(rp), rp
163	jnz	L(top)
164
C Wind-down.  %rdx still holds the last up limb and rp now points at limb
C n_param of the result.  Add the low half of v1 * last limb and the pending
C high part in w3 into w2, store that limb, and return the remaining high
C part with both carries folded in as the top limb in %rax.
165L(end):	mulx(	v1, %rdx, %rax)
166	add	%rdx, w2
167	adc	$0, %rax
168	add	w3, w2
169	mov	w2, (rp)
170	adc	$0, %rax
171
C Restore the callee-saved registers in reverse push order.
172	pop	%rbp
173	pop	%rbx
174	FUNC_EXIT()
175	ret
176EPILOGUE()
177