1dnl  AMD64 logops.
2
3dnl  Copyright 2004-2017 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33
34C		c/l	c/l	c/l	good
35C	       var-1   var-2   var-3  for cpu?
36C AMD K8,K9
37C AMD K10	 1.52	 1.75	 1.75	 n
38C AMD bd1
39C AMD bd2
40C AMD bd3
41C AMD bd4
42C AMD bt1	 2.67	~2.79	~2.79	 =
43C AMD bt2	 2.15	 2.65	 2.65	 n
44C AMD zen	 1.5	 1.5	 1.5	 =
45C Intel P4
46C Intel PNR	 2.0	 2.0	 2.0	 =
47C Intel NHM	 2.0	 2.0	 2.0	 =
48C Intel SBR	 1.5	 1.5	 1.5	 y
49C Intel IBR	 1.47	 1.48	 1.48	 y
50C Intel HWL	 1.11	 1.35	 1.35	 y
51C Intel BWL	 1.09	 1.30	 1.30	 y
52C Intel SKL	 1.21	 1.27	 1.27	 y
53C Intel atom	 3.31	 3.57	 3.57	 y
54C Intel SLM	 3.0	 3.0	 3.0	 =
55C VIA nano
56
C Operation selection.
C
C Each OPERATION_<name> symbol that is defined when this file is processed
C sets three macros:
C   func       name of the function to emit
C   VARIANT_n  which of the three code bodies further down is assembled
C   LOGOP      the two-operand instruction applied to each pair of limbs
C
C The three variants differ only in where a complement is applied:
C   VARIANT_1  rp[i] = up[i] LOGOP vp[i]         and_n,  ior_n,  xor_n
C   VARIANT_2  rp[i] = up[i] LOGOP ~vp[i]        andn_n, iorn_n, xnor_n
C   VARIANT_3  rp[i] = ~(up[i] LOGOP vp[i])      nand_n, nior_n
C (xnor_n uses VARIANT_2 with xor: up ^ ~vp is the same value as ~(up ^ vp).)
C
C If no OPERATION_<name> symbol is defined, no VARIANT_n gets defined and no
C function body is emitted.
C NOTE(review): presumably the build system defines exactly one of these
C symbols per object file -- confirm against the build rules.
57ifdef(`OPERATION_and_n',`
58  define(`func',`mpn_and_n')
59  define(`VARIANT_1')
60  define(`LOGOP',`and')')
61ifdef(`OPERATION_andn_n',`
62  define(`func',`mpn_andn_n')
63  define(`VARIANT_2')
64  define(`LOGOP',`and')')
65ifdef(`OPERATION_nand_n',`
66  define(`func',`mpn_nand_n')
67  define(`VARIANT_3')
68  define(`LOGOP',`and')')
69ifdef(`OPERATION_ior_n',`
70  define(`func',`mpn_ior_n')
71  define(`VARIANT_1')
72  define(`LOGOP',`or')')
73ifdef(`OPERATION_iorn_n',`
74  define(`func',`mpn_iorn_n')
75  define(`VARIANT_2')
76  define(`LOGOP',`or')')
77ifdef(`OPERATION_nior_n',`
78  define(`func',`mpn_nior_n')
79  define(`VARIANT_3')
80  define(`LOGOP',`or')')
81ifdef(`OPERATION_xor_n',`
82  define(`func',`mpn_xor_n')
83  define(`VARIANT_1')
84  define(`LOGOP',`xor')')
85ifdef(`OPERATION_xnor_n',`
86  define(`func',`mpn_xnor_n')
87  define(`VARIANT_2')
88  define(`LOGOP',`xor')')
89
C addptr(disp, reg): add the constant byte displacement disp to register reg.
C It expands to a lea, an instruction that does not modify the flags.
90define(`addptr', `lea	$1($2), $2')
91
C List of the entry points this file can produce, one per OPERATION_<name>
C symbol handled earlier in this file.
C NOTE(review): MULFUNC_PROLOGUE is defined outside this file; presumably it is
C a marker read by the build system -- confirm.
92MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)
93
94C INPUT PARAMETERS
C Symbolic names for the argument registers.  These are the first four integer
C argument registers of the System V AMD64 calling convention.
C   rp  destination pointer; the code only stores through it
C   up  first source pointer
C   vp  second source pointer
C   n   number of 8-byte limbs to process; counted down by the loops
95define(`rp',`%rdi')
96define(`up',`%rsi')
97define(`vp',`%rdx')
98define(`n',`%rcx')
99
C NOTE(review): ABI_SUPPORT and ASM_START come from the included m4 support
C files.  Presumably DOS64 is the Windows x64 ABI and STD64 the System V one,
C with FUNC_ENTRY/FUNC_EXIT in the function bodies adapting the former to the
C register assignment above -- confirm.
100ABI_SUPPORT(DOS64)
101ABI_SUPPORT(STD64)
102
103ASM_START()
104
C VARIANT_1 -- plain operation: rp[i] = up[i] LOGOP vp[i] for 0 <= i < n.
C Assembled for mpn_and_n, mpn_ior_n and mpn_xor_n.
C
C Registers: rp = %rdi, up = %rsi, vp = %rdx, n = %rcx.
C Scratch:   %rax (n mod 4, used only for the entry dispatch), %r8 and %r9
C            (limb values).  No stack is used.  All three pointers and n are
C            modified.
C
C Structure: the loop at L(top) processes four 8-byte limbs per iteration, as
C two pairs.  n mod 4 picks the entry point so the odd limbs are done first:
C   n mod 4 = 0   enter at L(b00), the first pair of a full iteration
C   n mod 4 = 1   L(b01): one limb alone, then full iterations if any remain
C   n mod 4 = 2   L(b10): enter at L(e10), i.e. the second pair only
C   n mod 4 = 3   L(b11): one limb alone, then enter at L(e11)
C The b10 and b11 paths move the pointers backwards and round n up to a
C multiple of 4, so that the fixed offsets 16 and 24 in the second half of the
C loop address the intended limbs and the shared count-down ends exactly at 0.
C
C n must be at least 1.  vp[0] is loaded unconditionally, and n = 0 would
C enter the loop at L(b00) with the count going negative instead of reaching
C the zero that terminates the loop.
105ifdef(`VARIANT_1',`
106	TEXT
107	ALIGN(32)
108PROLOGUE(func)
109	FUNC_ENTRY(4)
C Preload vp[0] into %r8 -- every entry path below consumes it.
110	mov	(vp), %r8
C %eax = n mod 4; only the low two bits of n matter for the dispatch.
111	mov	R32(%rcx), R32(%rax)
112	and	$3, R32(%rax)
113	je	L(b00)
C After the cmp: carry means n mod 4 = 1, zero means 2, neither means 3.
114	cmp	$2, R32(%rax)
115	jc	L(b01)
116	je	L(b10)
117
C n mod 4 = 3: finish limb 0 now.  Then step all pointers back one limb and
C add 1 to n, so that L(e11) finds limbs 1 and 2 at offsets 16 and 24.
118L(b11):	LOGOP	(up), %r8
119	mov	%r8, (rp)
120	inc	n
121	addptr(	-8, up)
122	addptr(	-8, vp)
123	addptr(	-8, rp)
124	jmp	L(e11)
C n mod 4 = 2: %r8 still holds vp[0].  Step back two limbs so that it pairs
C with 16(up) and 16(rp) at L(e10), and add 2 to n.
125L(b10):	add	$2, n
126	addptr(	-16, up)
127	addptr(	-16, vp)
128	addptr(	-16, rp)
129	jmp	L(e10)
C n mod 4 = 1: finish limb 0.  Return if it was the only limb; otherwise
C advance one limb and fall into the loop with n a multiple of 4.
130L(b01):	LOGOP	(up), %r8
131	mov	%r8, (rp)
132	dec	n
133	jz	L(ret)
134	addptr(	8, up)
135	addptr(	8, vp)
136	addptr(	8, rp)
137
C Main loop, four limbs per iteration.  First pair: offsets 0 and 8.
138	ALIGN(16)
139L(top):	mov	(vp), %r8
140L(b00):	mov	8(vp), %r9
141	LOGOP	(up), %r8
142	LOGOP	8(up), %r9
143	mov	%r8, (rp)
144	mov	%r9, 8(rp)
C Second pair: offsets 16 and 24.  Each pointer is advanced by 32 bytes right
C after its last use in the iteration.
145L(e11):	mov	16(vp), %r8
146L(e10):	mov	24(vp), %r9
147	addptr(	32, vp)
148	LOGOP	16(up), %r8
149	LOGOP	24(up), %r9
150	addptr(	32, up)
151	mov	%r8, 16(rp)
152	mov	%r9, 24(rp)
153	addptr(	32, rp)
C Four limbs done; loop until the count reaches zero.
154	sub	$4, n
155	jnz	L(top)
156
157L(ret):	FUNC_EXIT()
158	ret
159EPILOGUE()
160')
161
C VARIANT_2 -- second operand complemented:
C     rp[i] = up[i] LOGOP ~vp[i] for 0 <= i < n.
C Assembled for mpn_andn_n, mpn_iorn_n and mpn_xnor_n.
C
C Registers: rp = %rdi, up = %rsi, vp = %rdx, n = %rcx.
C Scratch:   %rax (n mod 4, used only for the entry dispatch), %r8 and %r9
C            (limb values).  No stack is used.  All three pointers and n are
C            modified.
C
C Every limb loaded from vp is complemented with a not immediately after the
C load, before LOGOP combines it with the matching limb from up.
C
C Structure: the loop at L(top) processes four 8-byte limbs per iteration, as
C two pairs.  n mod 4 picks the entry point so the odd limbs are done first:
C   n mod 4 = 0   enter at L(b00), the first pair of a full iteration
C   n mod 4 = 1   L(b01): one limb alone, then full iterations if any remain
C   n mod 4 = 2   L(b10): enter at L(e10), i.e. the second pair only
C   n mod 4 = 3   L(b11): one limb alone, then enter at L(e11)
C The b10 and b11 paths move the pointers backwards and round n up to a
C multiple of 4, so that the fixed offsets 16 and 24 in the second half of the
C loop address the intended limbs and the shared count-down ends exactly at 0.
C
C n must be at least 1.  vp[0] is loaded unconditionally, and n = 0 would
C enter the loop at L(b00) with the count going negative instead of reaching
C the zero that terminates the loop.
162ifdef(`VARIANT_2',`
163	TEXT
164	ALIGN(32)
165PROLOGUE(func)
166	FUNC_ENTRY(4)
C Preload ~vp[0] into %r8 -- every entry path below consumes it.
167	mov	(vp), %r8
168	not	%r8
C %eax = n mod 4; only the low two bits of n matter for the dispatch.
169	mov	R32(%rcx), R32(%rax)
170	and	$3, R32(%rax)
171	je	L(b00)
C After the cmp: carry means n mod 4 = 1, zero means 2, neither means 3.
172	cmp	$2, R32(%rax)
173	jc	L(b01)
174	je	L(b10)
175
C n mod 4 = 3: finish limb 0 now.  Then step all pointers back one limb and
C add 1 to n, so that L(e11) finds limbs 1 and 2 at offsets 16 and 24.
176L(b11):	LOGOP	(up), %r8
177	mov	%r8, (rp)
178	inc	n
179	addptr(	-8, up)
180	addptr(	-8, vp)
181	addptr(	-8, rp)
182	jmp	L(e11)
C n mod 4 = 2: %r8 still holds ~vp[0].  Step back two limbs so that it pairs
C with 16(up) and 16(rp) at L(e10), and add 2 to n.
183L(b10):	add	$2, n
184	addptr(	-16, up)
185	addptr(	-16, vp)
186	addptr(	-16, rp)
187	jmp	L(e10)
C n mod 4 = 1: finish limb 0.  Return if it was the only limb; otherwise
C advance one limb and fall into the loop with n a multiple of 4.
188L(b01):	LOGOP	(up), %r8
189	mov	%r8, (rp)
190	dec	n
191	jz	L(ret)
192	addptr(	8, up)
193	addptr(	8, vp)
194	addptr(	8, rp)
195
C Main loop, four limbs per iteration.  First pair: offsets 0 and 8.
196	ALIGN(16)
197L(top):	mov	(vp), %r8
198	not	%r8
199L(b00):	mov	8(vp), %r9
200	not	%r9
201	LOGOP	(up), %r8
202	LOGOP	8(up), %r9
203	mov	%r8, (rp)
204	mov	%r9, 8(rp)
C Second pair: offsets 16 and 24.  Each pointer is advanced by 32 bytes right
C after its last use in the iteration.
205L(e11):	mov	16(vp), %r8
206	not	%r8
207L(e10):	mov	24(vp), %r9
208	not	%r9
209	addptr(	32, vp)
210	LOGOP	16(up), %r8
211	LOGOP	24(up), %r9
212	addptr(	32, up)
213	mov	%r8, 16(rp)
214	mov	%r9, 24(rp)
215	addptr(	32, rp)
C Four limbs done; loop until the count reaches zero.
216	sub	$4, n
217	jnz	L(top)
218
219L(ret):	FUNC_EXIT()
220	ret
221EPILOGUE()
222')
223
C VARIANT_3 -- result complemented:
C     rp[i] = ~(up[i] LOGOP vp[i]) for 0 <= i < n.
C Assembled for mpn_nand_n and mpn_nior_n.
C
C Registers: rp = %rdi, up = %rsi, vp = %rdx, n = %rcx.
C Scratch:   %rax (n mod 4, used only for the entry dispatch), %r8 and %r9
C            (limb values).  No stack is used.  All three pointers and n are
C            modified.
C
C Every LOGOP result is complemented with a not before it is stored.
C
C Structure: the loop at L(top) processes four 8-byte limbs per iteration, as
C two pairs.  n mod 4 picks the entry point so the odd limbs are done first:
C   n mod 4 = 0   enter at L(b00), the first pair of a full iteration
C   n mod 4 = 1   L(b01): one limb alone, then full iterations if any remain
C   n mod 4 = 2   L(b10): enter at L(e10), i.e. the second pair only
C   n mod 4 = 3   L(b11): one limb alone, then enter at L(e11)
C The b10 and b11 paths move the pointers backwards and round n up to a
C multiple of 4, so that the fixed offsets 16 and 24 in the second half of the
C loop address the intended limbs and the shared count-down ends exactly at 0.
C
C n must be at least 1.  vp[0] is loaded unconditionally, and n = 0 would
C enter the loop at L(b00) with the count going negative instead of reaching
C the zero that terminates the loop.
224ifdef(`VARIANT_3',`
225	TEXT
226	ALIGN(32)
227PROLOGUE(func)
228	FUNC_ENTRY(4)
C Preload vp[0] into %r8 -- every entry path below consumes it.
229	mov	(vp), %r8
C %eax = n mod 4; only the low two bits of n matter for the dispatch.
230	mov	R32(%rcx), R32(%rax)
231	and	$3, R32(%rax)
232	je	L(b00)
C After the cmp: carry means n mod 4 = 1, zero means 2, neither means 3.
233	cmp	$2, R32(%rax)
234	jc	L(b01)
235	je	L(b10)
236
C n mod 4 = 3: finish limb 0 now.  Then step all pointers back one limb and
C add 1 to n, so that L(e11) finds limbs 1 and 2 at offsets 16 and 24.
237L(b11):	LOGOP	(up), %r8
238	not	%r8
239	mov	%r8, (rp)
240	inc	n
241	addptr(	-8, up)
242	addptr(	-8, vp)
243	addptr(	-8, rp)
244	jmp	L(e11)
C n mod 4 = 2: %r8 still holds vp[0].  Step back two limbs so that it pairs
C with 16(up) and 16(rp) at L(e10), and add 2 to n.
245L(b10):	add	$2, n
246	addptr(	-16, up)
247	addptr(	-16, vp)
248	addptr(	-16, rp)
249	jmp	L(e10)
C n mod 4 = 1: finish limb 0.  Return if it was the only limb; otherwise
C advance one limb and fall into the loop with n a multiple of 4.
250L(b01):	LOGOP	(up), %r8
251	not	%r8
252	mov	%r8, (rp)
253	dec	n
254	jz	L(ret)
255	addptr(	8, up)
256	addptr(	8, vp)
257	addptr(	8, rp)
258
C Main loop, four limbs per iteration.  First pair: offsets 0 and 8.
259	ALIGN(16)
260L(top):	mov	(vp), %r8
261L(b00):	mov	8(vp), %r9
262	LOGOP	(up), %r8
263	not	%r8
264	LOGOP	8(up), %r9
265	not	%r9
266	mov	%r8, (rp)
267	mov	%r9, 8(rp)
C Second pair: offsets 16 and 24.  Each pointer is advanced by 32 bytes right
C after its last use in the iteration.
268L(e11):	mov	16(vp), %r8
269L(e10):	mov	24(vp), %r9
270	addptr(	32, vp)
271	LOGOP	16(up), %r8
272	not	%r8
273	LOGOP	24(up), %r9
274	addptr(	32, up)
275	not	%r9
276	mov	%r8, 16(rp)
277	mov	%r9, 24(rp)
278	addptr(	32, rp)
C Four limbs done; loop until the count reaches zero.
279	sub	$4, n
280	jnz	L(top)
281
282L(ret):	FUNC_EXIT()
283	ret
284EPILOGUE()
285')
286