1dnl  AMD64 mpn_mul_1 optimised for Intel Broadwell.
2
3dnl  Copyright 2015 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33C	     cycles/limb
34C AMD K8,K9      -
35C AMD K10        -
36C AMD bull       -
37C AMD pile       -
38C AMD steam      -
39C AMD excavator  -
40C AMD bobcat     -
41C AMD jaguar     -
42C Intel P4       -
43C Intel core2    -
44C Intel NHM      -
45C Intel SBR      -
46C Intel IBR      -
47C Intel HWL      1.70
48C Intel BWL      1.51
49C Intel SKL      1.52
50C Intel atom     -
51C Intel SLM      -
52C VIA nano       -
53
54C The loop of this code is the result of running a code generation and
55C optimisation tool suite written by David Harvey and Torbjorn Granlund.
56
57C TODO
58C  * Put an initial mulx before switching, targeting some free registers.
59C  * Tune feed-in code.
60C  * Trim nop execution after L(f2).
61C  * Port to DOS64, not forgetting nop execution.
62
C Symbolic names for the incoming arguments.  The defines give the STD64
C registers; the trailing comment on each line presumably names the
C register the DOS64 ABI would pass the same argument in.
63define(`rp',      `%rdi')   C rcx
64define(`up',      `%rsi')   C rdx
65define(`n_param', `%rdx')   C r8
66define(`v0_param',`%rcx')   C r9
67
C n is the working counter.  It is the same register as v0_param, so the
C multiplier has to be copied out of %rcx before n is written.
68define(`n',       `%rcx')
69
C Only STD64 is declared.  The DOS64 declaration and the IFDOS register
C remappings below are disabled with dnl.
70dnl ABI_SUPPORT(DOS64)
71ABI_SUPPORT(STD64)
72
73dnl IFDOS(`	define(`up', ``%rsi'')	') dnl
74dnl IFDOS(`	define(`rp', ``%rcx'')	') dnl
75dnl IFDOS(`	define(`vl', ``%r9'')	') dnl
76dnl IFDOS(`	define(`r9', ``rdi'')	') dnl
77dnl IFDOS(`	define(`n',  ``%r8'')	') dnl
78dnl IFDOS(`	define(`r8', ``r11'')	') dnl
79
C mpn_mul_1 -- multiply the limb vector at up by the single limb v0,
C store the resulting limbs at rp and return the top limb.
C
C In (STD64):  rp = %rdi, up = %rsi, n_param = %rdx, v0_param = %rcx
C Out:         %rax = high half of the last product plus the final carry
C Modifies:    %rax, %rcx, %rdx, %rsi, %rdi, %r8, %r9, %r10, flags
C              No callee-saved register is touched and no stack frame
C              is set up.
C
C Register roles once the setup code has run:
C   %rdx        v0
C   n (%rcx)    passes still to make through the unrolled loop
C   %r10, %r8   low and high half of one product
C   %r9, %rax   low and high half of the following product
C
C mulx(src, lo, hi) is a macro defined outside this file.  From the way
C its results are used below, lo receives the low half and hi the high
C half of %rdx * src.  The code relies on it leaving the flags alone.
C
C Each stored limb is the low half of one product plus the high half of
C the previous product plus the carry so far.  That carry lives in CF
C for the whole function, so everything between two adc instructions
C must leave CF unchanged.
C
C NOTE(review): a count of zero would be dispatched to L(f0) with n = 0
C and is not handled; presumably callers guarantee n_param >= 1.
80ASM_START()
81	TEXT
82	ALIGN(32)
83PROLOGUE(mpn_mul_1)
84
85	mov	v0_param, %r10		C park v0, %rcx is about to become n
86	mov	n_param, n
87	mov	R32(n_param), R32(%r8)
88	shr	$3, n			C n = n_param / 8
C %r8 = n_param mod 8 picks the feed-in block.  The cleared CF is what
C the first adc of the carry chain will see.
89	and	$7, R32(%r8)		C clear OF, CF as side-effect
90	mov	%r10, %rdx		C v0 into %rdx, n_param no longer needed
91	lea	L(tab)(%rip), %r10	C jump table base
C Dispatch on the residue.  With PIC each table entry is a 32-bit value
C that is sign-extended and added to the table address.  Without PIC
C each entry is an 8-byte address that is jumped through directly.
92ifdef(`PIC',
93`	movslq	(%r10,%r8,4), %r8
94	lea	(%r8, %r10), %r10
95	jmp	*%r10
96',`
97	jmp	*(%r10,%r8,8)
98')
C JUMPTABSECT presumably selects the section that holds jump tables,
C and the TEXT after the table returns to the code section.  Entry k of
C the table is the feed-in block for n_param mod 8 = k.
99	JUMPTABSECT
100	ALIGN(8)
101L(tab):	JMPENT(	L(f0), L(tab))
102	JMPENT(	L(f1), L(tab))
103	JMPENT(	L(f2), L(tab))
104	JMPENT(	L(f3), L(tab))
105	JMPENT(	L(f4), L(tab))
106	JMPENT(	L(f5), L(tab))
107	JMPENT(	L(f6), L(tab))
108	JMPENT(	L(f7), L(tab))
109	TEXT
110
C Feed-in blocks, one per residue.  Each forms the first product, moves
C up and rp so that the fixed displacements used in the loop address
C the limbs that follow, and enters the loop at the matching label.
C lea, inc and jmp do not change CF and test clears it, so CF is still
C zero on entry to the loop.
C
C Residue 0.  n_param / 8 already counts the pass entered at L(b0), so
C n is not adjusted.
111L(f0):	mulx(	(up), %r10, %r8)
112	lea	56(up), up
113	lea	-8(rp), rp
114	jmp	L(b0)
115
C Residues 3 to 7.  The partial first pass also ends in the dec at the
C bottom of the loop, so n is bumped by one to pay for it.
116L(f3):	mulx(	(up), %r9, %rax)
117	lea	16(up), up
118	lea	16(rp), rp
119	inc	n
120	jmp	L(b3)
121
122L(f4):	mulx(	(up), %r10, %r8)
123	lea	24(up), up
124	lea	24(rp), rp
125	inc	n
126	jmp	L(b4)
127
128L(f5):	mulx(	(up), %r9, %rax)
129	lea	32(up), up
130	lea	32(rp), rp
131	inc	n
132	jmp	L(b5)
133
134L(f6):	mulx(	(up), %r10, %r8)
135	lea	40(up), up
136	lea	40(rp), rp
137	inc	n
138	jmp	L(b6)
139
140L(f7):	mulx(	(up), %r9, %rax)
141	lea	48(up), up
142	lea	48(rp), rp
143	inc	n
144	jmp	L(b7)
145
C Residue 1.  No pointer adjustment is needed because L(b1) reads 8(up)
C and writes (rp).  With no full pass to make this is a single limb
C product: store the low half and return the high half in %rax.
C L(1) is not the target of any jump visible in this file.
146L(f1):	mulx(	(up), %r9, %rax)
147	test	n, n
148	jnz	L(b1)
149L(1):	mov	%r9, (rp)
150	ret
151
C Residue 2.  Two products are formed up front.  With no full pass left
C the tail at L(end) finishes the job, otherwise execution falls
C through the ALIGN padding into L(top); the TODO list above mentions
C trimming that nop execution.
152L(f2):	mulx(	(up), %r10, %r8)
153	lea	8(up), up
154	lea	8(rp), rp
155	mulx(	(up), %r9, %rax)
156	test	n, n
157	jz	L(end)
158
C Main loop, eight limbs per pass.  Products alternate between the
C pairs %r10/%r8 and %r9/%rax.  Each adc adds the high half of the
C previous product, plus carry, to the low half of the current one, and
C the sum is stored a step later.  up moves on by 64 bytes near the top
C and rp a little later, hence the negative displacements further down.
159	ALIGN(32)
160L(top):	mov	%r10, -8(rp)
161	adc	%r8, %r9
162L(b1):	mulx(	8,(up), %r10, %r8)
163	adc	%rax, %r10
164	lea	64(up), up
165	mov	%r9, (rp)
166L(b0):	mov	%r10, 8(rp)
167	mulx(	-48,(up), %r9, %rax)
168	lea	64(rp), rp
169	adc	%r8, %r9
170L(b7):	mulx(	-40,(up), %r10, %r8)
171	mov	%r9, -48(rp)
172	adc	%rax, %r10
173L(b6):	mov	%r10, -40(rp)
174	mulx(	-32,(up), %r9, %rax)
175	adc	%r8, %r9
176L(b5):	mulx(	-24,(up), %r10, %r8)
177	mov	%r9, -32(rp)
178	adc	%rax, %r10
179L(b4):	mulx(	-16,(up), %r9, %rax)
180	mov	%r10, -24(rp)
181	adc	%r8, %r9
182L(b3):	mulx(	-8,(up), %r10, %r8)
183	adc	%rax, %r10
184	mov	%r9, -16(rp)
C dec sets ZF for the jnz below without changing CF, so the carry chain
C continues into the next pass.  The mulx in between must not change
C the flags either.
185	dec	n
186	mulx(	(up), %r9, %rax)
187	jnz	L(top)
188
C Tail.  Store the last two limbs.  n, which is %rcx, is zero on both
C ways in here (the jz in L(f2) and the fall through from the loop), so
C the final adc only folds the last carry into the limb returned in
C %rax.
189L(end):	mov	%r10, -8(rp)
190	adc	%r8, %r9
191	mov	%r9, (rp)
192	adc	%rcx, %rax
193	ret
194EPILOGUE()
195ASM_END()
196