1dnl  AMD64 logops.
2
3dnl  Copyright 2004-2017 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33
34C		c/l	c/l	c/l	good
35C	       var-1   var-2   var-3  for cpu?
36C AMD K8,K9	 1.5	 1.5	 1.5	 y
37C AMD K10	 1.5	 1.5	 1.5	 y
38C AMD bd1
39C AMD bd2
40C AMD bd3
41C AMD bd4
42C AMD bt1	 2.67	~2.79	~2.67
43C AMD bt2	 2.0	 2.28	 2.28	 y
44C AMD zen	 1.5	 1.5	 1.5	 =
45C Intel P4	 2.8	 3.35	 3.6
46C Intel PNR	 2.0	 2.0	 2.0	 =
47C Intel NHM	 2.0	 2.0	 2.0	 =
48C Intel SBR	 1.5	 1.75	 1.75	 n
49C Intel IBR	 1.48	 1.71	 1.72	 n
50C Intel HWL	 1.5	 1.5	 1.5	 n
51C Intel BWL	 1.5	 1.5	 1.5	 n
52C Intel SKL	 1.5	 1.5	 1.5	 n
53C Intel atom	 3.82	 3.82	 3.82	 n
54C Intel SLM	 3.0	 3.0	 3.0	 =
55C VIA nano	 3.25
56
C ---------------------------------------------------------------------------
C Operation selection.  Each OPERATION_xxx symbol (not defined in this file;
C presumably supplied by the build for the one function being assembled --
C TODO confirm) sets three things:
C   func       name passed to PROLOGUE for the emitted function
C   VARIANT_n  which of the three loop bodies further down is emitted
C   LOGOP      two-operand instruction applied to each pair of limbs
C
C Resulting per-limb computation, as implemented by the variant bodies:
C   VARIANT_1  and_n  = u AND v      ior_n  = u OR v       xor_n  = u XOR v
C   VARIANT_2  andn_n = u AND (NOT v)   iorn_n = u OR (NOT v)
C              xnor_n = u XOR (NOT v), which equals NOT (u XOR v)
C   VARIANT_3  nand_n = NOT (u AND v)   nior_n = NOT (u OR v)
C ---------------------------------------------------------------------------
57ifdef(`OPERATION_and_n',`
58  define(`func',`mpn_and_n')
59  define(`VARIANT_1')
60  define(`LOGOP',`and')')
61ifdef(`OPERATION_andn_n',`
62  define(`func',`mpn_andn_n')
63  define(`VARIANT_2')
64  define(`LOGOP',`and')')
65ifdef(`OPERATION_nand_n',`
66  define(`func',`mpn_nand_n')
67  define(`VARIANT_3')
68  define(`LOGOP',`and')')
69ifdef(`OPERATION_ior_n',`
70  define(`func',`mpn_ior_n')
71  define(`VARIANT_1')
72  define(`LOGOP',`or')')
73ifdef(`OPERATION_iorn_n',`
74  define(`func',`mpn_iorn_n')
75  define(`VARIANT_2')
76  define(`LOGOP',`or')')
77ifdef(`OPERATION_nior_n',`
78  define(`func',`mpn_nior_n')
79  define(`VARIANT_3')
80  define(`LOGOP',`or')')
81ifdef(`OPERATION_xor_n',`
82  define(`func',`mpn_xor_n')
83  define(`VARIANT_1')
84  define(`LOGOP',`xor')')
85ifdef(`OPERATION_xnor_n',`
86  define(`func',`mpn_xnor_n')
87  define(`VARIANT_2')
88  define(`LOGOP',`xor')')
89
90
C MULFUNC_PROLOGUE names all eight entry points this single source can
C produce.  The macro is defined outside this file; presumably it is read by
C the build machinery and emits no code itself -- TODO confirm.
91MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)
92
93C INPUT PARAMETERS
C Symbolic register names shared by all three variants:
C   rp  destination pointer (the code only stores through it)
C   up  first source pointer (memory operand of LOGOP)
C   vp  second source pointer (loaded into a register first)
C   n   element count; addresses scale it by 8, so elements are 8 bytes
94define(`rp',`%rdi')
95define(`up',`%rsi')
96define(`vp',`%rdx')
97define(`n',`%rcx')
98
C ABI declarations and assembly preamble; all three macros come from the
C included m4 definitions.  NOTE(review): DOS64 presumably relies on
C FUNC_ENTRY / FUNC_EXIT to move arguments into the registers named above --
C verify in the macro definitions.
99ABI_SUPPORT(DOS64)
100ABI_SUPPORT(STD64)
101
102ASM_START()
103
C ---------------------------------------------------------------------------
C VARIANT_1 -- plain operation:  rp[i] = up[i] LOGOP vp[i]  for 0 <= i < n.
C Selected by the OPERATION chain for mpn_and_n, mpn_ior_n and mpn_xor_n.
C
C Registers: rp, up, vp, n as defined above; %rax holds n mod 4 and is used
C for the entry dispatch only; %r8 and %r9 carry the limbs being combined.
C
C Scheme: the three pointers are moved just past the end of their operands
C and n is negated, so (ptr,n,8) addresses successive limbs while n climbs
C toward zero.  The loop handles four limbs per pass; n mod 4 picks the entry
C point (b00, b01, b10, b11) so the leftover limbs are consumed first.
C
C NOTE(review): vp[0] is loaded before n is examined, so the code appears to
C require n >= 1 -- confirm against the documented mpn calling contract.
C ---------------------------------------------------------------------------
104ifdef(`VARIANT_1',`
105	TEXT
106	ALIGN(32)
107PROLOGUE(func)
108	FUNC_ENTRY(4)			C ABI entry macro, defined elsewhere; 4 is presumably the argument count
109	mov	(vp), %r8		C r8 = vp[0], preloaded for every entry path
110	mov	R32(%rcx), R32(%rax)	C copy low 32 bits of n (%rcx is n); R32 presumably names the 32-bit register
111	lea	(vp,n,8), vp		C vp, up, rp: advance by 8*n bytes to just past the last limb
112	lea	(up,n,8), up
113	lea	(rp,n,8), rp
114	neg	n			C n = -n, so (ptr,n,8) addresses limb 0 again
115	and	$3, R32(%rax)		C rax = n mod 4
116	je	L(b00)			C n mod 4 == 0
117	cmp	$2, R32(%rax)
118	jc	L(b01)			C below 2: n mod 4 == 1
119	je	L(b10)			C equal: n mod 4 == 2; fall through when it is 3
120
121L(b11):	LOGOP	(up,n,8), %r8		C n mod 4 == 3: combine limb 0 (r8 already holds vp[0])
122	mov	%r8, (rp,n,8)
123	dec	n			C bias n so offsets 16 and 24 at e11 address limbs 1 and 2
124	jmp	L(e11)
125L(b10):	add	$-2, n			C bias n so offsets 16 and 24 at e10 address limbs 0 and 1; r8 keeps vp[0]
126	jmp	L(e10)
127L(b01):	LOGOP	(up,n,8), %r8		C n mod 4 == 1: combine limb 0 (r8 already holds vp[0])
128	mov	%r8, (rp,n,8)
129	inc	n			C one limb consumed
130	jz	L(ret)			C n was exactly 1: finished
131
C Main loop: four limbs per pass, two at a time through r8 and r9.
132L(top):	mov	(vp,n,8), %r8		C skipped on the b00 entry, where r8 already holds vp[0]
133L(b00):	mov	8(vp,n,8), %r9
134	LOGOP	(up,n,8), %r8
135	LOGOP	8(up,n,8), %r9
136	nop				C K8/K9/K10 concession (filler; no effect on data)
137	mov	%r8, (rp,n,8)
138	mov	%r9, 8(rp,n,8)
139L(e11):	mov	16(vp,n,8), %r8
140L(e10):	mov	24(vp,n,8), %r9		C on the b10 entry r8 still holds the preloaded vp[0]
141	LOGOP	16(up,n,8), %r8
142	LOGOP	24(up,n,8), %r9
143	mov	%r8, 16(rp,n,8)
144	mov	%r9, 24(rp,n,8)
145	add	$4, n			C n is a negative multiple of 4 here on every path
146	jnc	L(top)			C carry out only when n wraps to zero, i.e. all limbs done
147
148L(ret):	FUNC_EXIT()			C ABI exit macro paired with FUNC_ENTRY, defined elsewhere
149	ret
150EPILOGUE()
151')
152
C ---------------------------------------------------------------------------
C VARIANT_2 -- complemented second source:
C     rp[i] = up[i] LOGOP (NOT vp[i])  for 0 <= i < n.
C Selected by the OPERATION chain for mpn_andn_n, mpn_iorn_n and mpn_xnor_n.
C
C Same layout as VARIANT_1 (pointers moved past the end, n negated, 4-way
C unrolled loop entered according to n mod 4); the only difference is a NOT
C applied to each limb loaded from vp before LOGOP combines it with up.
C
C Registers: rp, up, vp, n as defined above; %rax = n mod 4 for the entry
C dispatch only; %r8 and %r9 carry the limbs being combined.
C
C NOTE(review): vp[0] is loaded before n is examined, so the code appears to
C require n >= 1 -- confirm against the documented mpn calling contract.
C ---------------------------------------------------------------------------
153ifdef(`VARIANT_2',`
154	TEXT
155	ALIGN(32)
156PROLOGUE(func)
157	FUNC_ENTRY(4)			C ABI entry macro, defined elsewhere; 4 is presumably the argument count
158	mov	(vp), %r8		C r8 = vp[0], preloaded for every entry path
159	not	%r8			C r8 = NOT vp[0]
160	mov	R32(%rcx), R32(%rax)	C copy low 32 bits of n (%rcx is n); R32 presumably names the 32-bit register
161	lea	(vp,n,8), vp		C vp, up, rp: advance by 8*n bytes to just past the last limb
162	lea	(up,n,8), up
163	lea	(rp,n,8), rp
164	neg	n			C n = -n, so (ptr,n,8) addresses limb 0 again
165	and	$3, R32(%rax)		C rax = n mod 4
166	je	L(b00)			C n mod 4 == 0
167	cmp	$2, R32(%rax)
168	jc	L(b01)			C below 2: n mod 4 == 1
169	je	L(b10)			C equal: n mod 4 == 2; fall through when it is 3
170
171L(b11):	LOGOP	(up,n,8), %r8		C n mod 4 == 3: combine limb 0 (r8 holds NOT vp[0])
172	mov	%r8, (rp,n,8)
173	dec	n			C bias n so offsets 16 and 24 at e11 address limbs 1 and 2
174	jmp	L(e11)
175L(b10):	add	$-2, n			C bias n so offsets 16 and 24 at e10 address limbs 0 and 1; r8 keeps NOT vp[0]
176	jmp	L(e10)
C The bytes below are one-byte NOPs (0x90).  They follow an unconditional jmp
C and precede the next label, so they are never executed; presumably code
C layout padding for the labels that follow -- TODO confirm intent.
177	.byte	0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90
178L(b01):	LOGOP	(up,n,8), %r8		C n mod 4 == 1: combine limb 0 (r8 holds NOT vp[0])
179	mov	%r8, (rp,n,8)
180	inc	n			C one limb consumed
181	jz	L(ret)			C n was exactly 1: finished
182
C Main loop: four limbs per pass; each vp limb is complemented right after
C it is loaded.
183L(top):	mov	(vp,n,8), %r8		C load and complement skipped on the b00 entry (done before dispatch)
184	not	%r8
185L(b00):	mov	8(vp,n,8), %r9
186	not	%r9
187	LOGOP	(up,n,8), %r8
188	LOGOP	8(up,n,8), %r9
189	mov	%r8, (rp,n,8)
190	mov	%r9, 8(rp,n,8)
191L(e11):	mov	16(vp,n,8), %r8
192	not	%r8
193L(e10):	mov	24(vp,n,8), %r9		C on the b10 entry r8 still holds the preloaded NOT vp[0]
194	not	%r9
195	LOGOP	16(up,n,8), %r8
196	LOGOP	24(up,n,8), %r9
197	mov	%r8, 16(rp,n,8)
198	mov	%r9, 24(rp,n,8)
199	add	$4, n			C n is a negative multiple of 4 here on every path
200	jnc	L(top)			C carry out only when n wraps to zero, i.e. all limbs done
201
202L(ret):	FUNC_EXIT()			C ABI exit macro paired with FUNC_ENTRY, defined elsewhere
203	ret
204EPILOGUE()
205')
206
C ---------------------------------------------------------------------------
C VARIANT_3 -- complemented result:
C     rp[i] = NOT (up[i] LOGOP vp[i])  for 0 <= i < n.
C Selected by the OPERATION chain for mpn_nand_n and mpn_nior_n.
C
C Same layout as VARIANT_1 (pointers moved past the end, n negated, 4-way
C unrolled loop entered according to n mod 4); the only difference is a NOT
C applied to each result after LOGOP and before the store.
C
C Registers: rp, up, vp, n as defined above; %rax = n mod 4 for the entry
C dispatch only; %r8 and %r9 carry the limbs being combined.
C
C NOTE(review): vp[0] is loaded before n is examined, so the code appears to
C require n >= 1 -- confirm against the documented mpn calling contract.
C ---------------------------------------------------------------------------
207ifdef(`VARIANT_3',`
208	TEXT
209	ALIGN(32)
210PROLOGUE(func)
211	FUNC_ENTRY(4)			C ABI entry macro, defined elsewhere; 4 is presumably the argument count
212	mov	(vp), %r8		C r8 = vp[0], preloaded for every entry path
213	mov	R32(%rcx), R32(%rax)	C copy low 32 bits of n (%rcx is n); R32 presumably names the 32-bit register
214	lea	(vp,n,8), vp		C vp, up, rp: advance by 8*n bytes to just past the last limb
215	lea	(up,n,8), up
216	lea	(rp,n,8), rp
217	neg	n			C n = -n, so (ptr,n,8) addresses limb 0 again
218	and	$3, R32(%rax)		C rax = n mod 4
219	je	L(b00)			C n mod 4 == 0
220	cmp	$2, R32(%rax)
221	jc	L(b01)			C below 2: n mod 4 == 1
222	je	L(b10)			C equal: n mod 4 == 2; fall through when it is 3
223
224L(b11):	LOGOP	(up,n,8), %r8		C n mod 4 == 3: combine limb 0 (r8 already holds vp[0])
225	not	%r8			C complement the result
226	mov	%r8, (rp,n,8)
227	dec	n			C bias n so offsets 16 and 24 at e11 address limbs 1 and 2
228	jmp	L(e11)
229L(b10):	add	$-2, n			C bias n so offsets 16 and 24 at e10 address limbs 0 and 1; r8 keeps vp[0]
230	jmp	L(e10)
C The bytes below are one-byte NOPs (0x90).  They follow an unconditional jmp
C and precede the next label, so they are never executed; presumably code
C layout padding for the labels that follow -- TODO confirm intent.
231	.byte	0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90
232L(b01):	LOGOP	(up,n,8), %r8		C n mod 4 == 1: combine limb 0 (r8 already holds vp[0])
233	not	%r8			C complement the result
234	mov	%r8, (rp,n,8)
235	inc	n			C one limb consumed
236	jz	L(ret)			C n was exactly 1: finished
237
C Main loop: four limbs per pass; each result is complemented before it is
C stored.
238L(top):	mov	(vp,n,8), %r8		C skipped on the b00 entry, where r8 already holds vp[0]
239L(b00):	mov	8(vp,n,8), %r9
240	LOGOP	(up,n,8), %r8
241	not	%r8
242	LOGOP	8(up,n,8), %r9
243	not	%r9
244	mov	%r8, (rp,n,8)
245	mov	%r9, 8(rp,n,8)
246L(e11):	mov	16(vp,n,8), %r8
247L(e10):	mov	24(vp,n,8), %r9		C on the b10 entry r8 still holds the preloaded vp[0]
248	LOGOP	16(up,n,8), %r8
249	not	%r8
250	LOGOP	24(up,n,8), %r9
251	not	%r9
252	mov	%r8, 16(rp,n,8)
253	mov	%r9, 24(rp,n,8)
254	add	$4, n			C n is a negative multiple of 4 here on every path
255	jnc	L(top)			C carry out only when n wraps to zero, i.e. all limbs done
256
257L(ret):	FUNC_EXIT()			C ABI exit macro paired with FUNC_ENTRY, defined elsewhere
258	ret
259EPILOGUE()
260')
261