1dnl  AMD64 mpn_addmul_1 optimised for Intel Broadwell.
2
3dnl  Copyright 2015, 2017 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33C	     cycles/limb
34C AMD K8,K9	n/a
35C AMD K10	n/a
36C AMD bd1	n/a
37C AMD bd2	n/a
38C AMD bd3	n/a
39C AMD bd4	 ?
40C AMD zen	 ?
41C AMD bt1	n/a
42C AMD bt2	n/a
43C Intel P4	n/a
44C Intel PNR	n/a
45C Intel NHM	n/a
46C Intel SBR	n/a
47C Intel IBR	n/a
48C Intel HWL	n/a
49C Intel BWL	 1.67	 1.74
50C Intel SKL	 1.63	 1.71
51C Intel atom	n/a
52C Intel SLM	n/a
53C VIA nano	n/a
54
55C The loop of this code is the result of running a code generation and
56C optimisation tool suite written by David Harvey and Torbjorn Granlund.
57
58C TODO
59C  * Put an initial mulx before switching, targeting some free registers.
60C  * Tune feed-in code.
61C  * Trim nop execution after L(f2).
62C  * For DOS64, fix nop execution.
63
C Parameter registers.  Each define names the register the parameter is read
C from in this file.  The trailing comments list rcx, rdx, r8, r9, which is the
C Microsoft x64 (DOS64) argument order.
C NOTE(review): FUNC_ENTRY(4) presumably moves the DOS64 arguments into the
C registers defined here - confirm against its definition.
64define(`rp',      `%rdi')   C DOS64: rcx.  Limb vector that is read, added to and stored back
65define(`up',      `%rsi')   C DOS64: rdx.  Limb vector that is only read, as the mulx source
66define(`n_param', `%rdx')   C DOS64: r8.   Limb count as passed in
67define(`v0_param',`%rcx')   C DOS64: r9.   Multiplier limb as passed in
68
C n lives in %rcx, the same register as v0_param, so v0 has to be copied out
C before n is written.  Keeping the count in %rcx is what lets the code test it
C with jrcxz.
69define(`n',       `%rcx')
70
71ABI_SUPPORT(DOS64)
72ABI_SUPPORT(STD64)
73
74dnl IFDOS(`	define(`up', ``%rsi'')	') dnl
75dnl IFDOS(`	define(`rp', ``%rcx'')	') dnl
76dnl IFDOS(`	define(`vl', ``%r9'')	') dnl
77dnl IFDOS(`	define(`r9', ``rdi'')	') dnl
78dnl IFDOS(`	define(`n',  ``%r8'')	') dnl
79dnl IFDOS(`	define(`r8', ``r11'')	') dnl
80
C mpn_addmul_1 (rp, up, n, v0): for each of the n limbs, multiply up[i] by v0,
C add the product into rp[i] with carry propagation between limbs, store the
C result back to rp[i], and return the final carry-out limb in %rax.
C
C The code assumes n >= 1; with n = 0 the dispatch below would still enter
C L(f0) and process limbs.
C
C Register roles once the entry sequence has run:
C   %rdx        v0, the implicit source operand of every mulx
C   %rcx (n)    loop counter, n/8 adjusted by the feed-in code, tested by jrcxz
C   %r10, %r8   low and high limb of one product
C   %r9, %rax   low and high limb of the neighbouring product
C   CF          carry of the adcx chain: previous high limb + next low limb
C   OF          carry of the adox chain: low limb + rp limb
C
C Both carry chains are kept live across loop iterations.  Between the
C flag-clearing and instruction in the entry sequence and the final adox/adc in
C L(end) only instructions that leave CF and OF alone are used besides
C adcx/adox themselves: pointers and the counter are updated with lea, and the
C loop exit test is jrcxz.
C
C mulx(), adox(), adcx(), R32(), FUNC_ENTRY(), FUNC_EXIT(), JUMPTABSECT and
C JMPENT() are macros defined outside this file.  Where a memory operand needs
C a displacement it is passed to mulx()/adox() as a separate leading argument.
81ASM_START()
82	TEXT
83	ALIGN(32)
84PROLOGUE(mpn_addmul_1)
85	FUNC_ENTRY(4)
86
87	mov	v0_param, %r10		C save v0, since %rcx is about to become n
88	mov	n_param, n
89	mov	R32(n_param), R32(%r8)	C second copy of n; only its low 3 bits get used
90	shr	$3, n			C n = limb count / 8
C r8 = limb count mod 8, the feed-in selector.  This and comes after the shr
C above, so the zeroed CF and OF it leaves behind are what the adcx and adox
C chains start from.
91	and	$7, R32(%r8)		C clear OF, CF as side-effect
92	mov	%r10, %rdx		C v0 into the implicit mulx operand
93	lea	L(tab)(%rip), %r10	C jump table base
C Dispatch on limb count mod 8.  The PIC variant loads a sign-extended 32-bit
C table entry and adds it to the table address before jumping; the non-PIC
C variant jumps through an 8-byte table entry.
94ifdef(`PIC',
95`	movslq	(%r10,%r8,4), %r8
96	lea	(%r8, %r10), %r10
97	jmp	*%r10
98',`
99	jmp	*(%r10,%r8,8)
100')
C Jump table, one entry per residue of the limb count mod 8.
C NOTE(review): JUMPTABSECT presumably switches to a data section and JMPENT
C presumably emits the entry format matching the PIC setting - confirm.
101	JUMPTABSECT
102	ALIGN(8)
103L(tab):	JMPENT(	L(f0), L(tab))
104	JMPENT(	L(f1), L(tab))
105	JMPENT(	L(f2), L(tab))
106	JMPENT(	L(f3), L(tab))
107	JMPENT(	L(f4), L(tab))
108	JMPENT(	L(f5), L(tab))
109	JMPENT(	L(f6), L(tab))
110	JMPENT(	L(f7), L(tab))
111	TEXT
112
C Feed-in code.  L(fK) is entered when the limb count mod 8 is K.  Each entry
C forms the product of the first limb, biases up and rp so that the fixed
C displacements used from its loop entry point L(bK) onwards address the right
C limbs, and jumps into the unrolled loop.
C
C Count mod 8 = 0: the pass entered at L(b0) already covers 8 limbs, so one is
C taken off the loop count here (with lea, to keep CF and OF intact).
113L(f0):	mulx(	(up), %r10, %r8)
114	lea	-8(up), up
115	lea	-8(rp), rp
116	lea	-1(n), n
117	jmp	L(b0)
118
119L(f3):	mulx(	(up), %r9, %rax)
120	lea	16(up), up
121	lea	-48(rp), rp
122	jmp	L(b3)
123
124L(f4):	mulx(	(up), %r10, %r8)
125	lea	24(up), up
126	lea	-40(rp), rp
127	jmp	L(b4)
128
129L(f5):	mulx(	(up), %r9, %rax)
130	lea	32(up), up
131	lea	-32(rp), rp
132	jmp	L(b5)
133
134L(f6):	mulx(	(up), %r10, %r8)
135	lea	40(up), up
136	lea	-24(rp), rp
137	jmp	L(b6)
138
C Count mod 8 = 1.  If n (count / 8) is zero there is exactly one limb and
C L(1) finishes it with plain add/adc.  Otherwise the loop is entered at L(b1)
C with unmodified pointers.
139L(f1):	mulx(	(up), %r9, %rax)
140	jrcxz	L(1)
141	jmp	L(b1)
C Single-limb case: add the low product limb to the rp limb, store it, and
C return the high limb plus the carry.
142L(1):	add	(rp), %r9
143	mov	%r9, (rp)
144	adc	%rcx, %rax		C relies on rcx = 0
145	FUNC_EXIT()
146	ret
147
C Loop exit.  %r9 and %rax hold the last product, with the previous high limb
C already added to %r9 by the adcx at L(top).  Add the last rp limb and store
C it, then fold the pending OF carry and the pending CF carry into the high
C limb, which is the return value.
148L(end):	adox(	(rp), %r9)
149	mov	%r9, (rp)
150	adox(	%rcx, %rax)		C relies on rcx = 0
151	adc	%rcx, %rax		C relies on rcx = 0
152	FUNC_EXIT()
153	ret
154
C Padding in front of L(f2).  It follows a ret, so it is not reached by
C fall-through.
C NOTE(review): the two counts presumably position L(f2) relative to the
C ALIGN(32) in front of L(top); see the nop items in the TODO list at the top
C of the file - confirm before changing any code size above this point.
155ifdef(`PIC',
156`	nop;nop;nop;nop',
157`	nop;nop;nop;nop;nop;nop;nop;nop;nop;nop;nop')
158
C Count mod 8 = 2: form the first two products, step both pointers by one
C limb, and fall through into L(top).
159L(f2):	mulx(	(up), %r10, %r8)
160	lea	8(up), up
161	lea	8(rp), rp
162	mulx(	(up), %r9, %rax)
163
C Main loop, 8 limbs per pass.  The register pairs %r10,%r8 and %r9,%rax
C alternate as low,high limb of consecutive products.  For every limb:
C   mulx  forms the next product
C   adcx  adds the previous product high limb into this low limb (CF chain)
C   adox  adds the rp limb into the low limb (OF chain)
C   mov   stores the finished limb back to rp
C The L(bK) labels are the entry points used by the feed-in code.
164	ALIGN(32)
165L(top):	adox(	-8,(rp), %r10)
166	adcx(	%r8, %r9)
167	mov	%r10, -8(rp)
168	jrcxz	L(end)			C leave when the count is zero
169L(b1):	mulx(	8,(up), %r10, %r8)
170	adox(	(rp), %r9)
171	lea	-1(n), n		C count this pass; done with lea, not dec/sub
172	mov	%r9, (rp)
173	adcx(	%rax, %r10)
174L(b0):	mulx(	16,(up), %r9, %rax)
175	adcx(	%r8, %r9)
176	adox(	8,(rp), %r10)
177	mov	%r10, 8(rp)
178L(b7):	mulx(	24,(up), %r10, %r8)
179	lea	64(up), up		C step up by 8 limbs; later offsets are <= 0
180	adcx(	%rax, %r10)
181	adox(	16,(rp), %r9)
182	mov	%r9, 16(rp)
183L(b6):	mulx(	-32,(up), %r9, %rax)
184	adox(	24,(rp), %r10)
185	adcx(	%r8, %r9)
186	mov	%r10, 24(rp)
187L(b5):	mulx(	-24,(up), %r10, %r8)
188	adcx(	%rax, %r10)
189	adox(	32,(rp), %r9)
190	mov	%r9, 32(rp)
191L(b4):	mulx(	-16,(up), %r9, %rax)
192	adox(	40,(rp), %r10)
193	adcx(	%r8, %r9)
194	mov	%r10, 40(rp)
195L(b3):	adox(	48,(rp), %r9)
196	mulx(	-8,(up), %r10, %r8)
197	mov	%r9, 48(rp)
198	lea	64(rp), rp		C step rp by 8 limbs
199	adcx(	%rax, %r10)
200	mulx(	(up), %r9, %rax)	C product whose low limb is finished at L(top) or L(end)
201	jmp	L(top)
202
C Count mod 8 = 7: pointers are moved back two limbs to suit the
C displacements used at L(b7).
203L(f7):	mulx(	(up), %r9, %rax)
204	lea	-16(up), up
205	lea	-16(rp), rp
206	jmp	L(b7)
207EPILOGUE()
208ASM_END()
209