dnl  AMD64 mpn_com optimised for CPUs with fast SSE copying and SSSE3.

dnl  Copyright 2012, 2013, 2015 Free Software Foundation, Inc.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb     cycles/limb     cycles/limb      good
C              aligned	      unaligned	      best seen	     for cpu?
C AMD K8,K9	 2.0		 illop		1.0/1.0		N
C AMD K10	 0.85		 illop				Y/N
C AMD bd1	 1.39		 ? 1.45				Y/N
C AMD bd2     0.8-1.4	       0.7-1.4				Y
C AMD bd3
C AMD bd4
C AMD bobcat	 1.97		 ? 8.17		1.5/1.5		N
C AMD jaguar	 1.02		 1.02		0.91/0.91	N
C Intel P4	 2.26		 illop				Y/N
C Intel core	 0.58		 0.87		opt/0.74	Y
C Intel NHM	 0.64		 1.14		opt/bad		Y
C Intel SBR	 0.51		 0.65		opt/opt		Y
C Intel IBR	 0.50		 0.64		opt/0.57	Y
C Intel HWL	 0.51		 0.58		opt/opt		Y
C Intel BWL	 0.52		 0.64		opt/opt		Y
C Intel SKL	 0.51		 0.63		opt/opt		Y
C Intel atom	 1.16		 1.70		opt/opt		Y
C Intel SLM	 1.02		 1.52				N
C VIA nano	 1.09		 1.10		opt/opt		Y

C We use only 16-byte operations, except for unaligned top-most and bottom-most
C limbs.  We use the SSSE3 palignr instruction when rp - up = 8 (mod 16).  That
C instruction is better adapted to mpn_copyd's needs, so we need to contort the
C code to use it here.
C
C For operands of < COM_SSE_THRESHOLD limbs, we use a plain 64-bit loop, taken
C from the x86_64 default code.
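C
C For reference, mpn_com computes the bitwise complement of {up,n} into
C {rp,n}.  A rough C sketch of that operation (illustrative only, not the
C code GMP compiles, and ignoring nails):
C
C	void
C	mpn_com (mp_ptr rp, mp_srcptr up, mp_size_t n)
C	{
C	  mp_size_t i;
C	  for (i = 0; i < n; i++)
C	    rp[i] = ~up[i];
C	}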

C INPUT PARAMETERS
define(`rp', `%rdi')
define(`up', `%rsi')
define(`n',  `%rdx')

C There are three instructions for loading an aligned 128-bit quantity.  We use
C movaps, since it has the shortest encoding.
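C (The candidates are movdqa, movaps, and movapd; movaps needs no 0x66 prefix
C byte and is thus one byte shorter than the other two.)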
define(`movdqa', ``movaps'')

ifdef(`COM_SSE_THRESHOLD',`',`define(`COM_SSE_THRESHOLD', 7)')

ASM_START()
	TEXT
	ALIGN(64)
PROLOGUE(mpn_com)
	FUNC_ENTRY(3)

	cmp	$COM_SSE_THRESHOLD, n
	jbe	L(bc)

	pcmpeqb	%xmm5, %xmm5		C set to 111...111

	test	$8, R8(rp)		C is rp 16-byte aligned?
	jz	L(rp_aligned)		C jump if rp aligned

	mov	(up), %r8
	lea	8(up), up
	not	%r8
	mov	%r8, (rp)
	lea	8(rp), rp
	dec	n

L(rp_aligned):
	test	$8, R8(up)
	jnz	L(uent)

ifelse(eval(COM_SSE_THRESHOLD >= 8),1,
`	sub	$8, n',
`	jmp	L(am)')

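C The aligned loop below complements 64 bytes (8 limbs) per iteration; the
C tail code after it finishes the remaining 0-7 limbs using the low bits of n.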
	ALIGN(16)
L(atop):movdqa	0(up), %xmm0
	movdqa	16(up), %xmm1
	movdqa	32(up), %xmm2
	movdqa	48(up), %xmm3
	lea	64(up), up
	pxor	%xmm5, %xmm0
	pxor	%xmm5, %xmm1
	pxor	%xmm5, %xmm2
	pxor	%xmm5, %xmm3
	movdqa	%xmm0, (rp)
	movdqa	%xmm1, 16(rp)
	movdqa	%xmm2, 32(rp)
	movdqa	%xmm3, 48(rp)
	lea	64(rp), rp
L(am):	sub	$8, n
	jnc	L(atop)

	test	$4, R8(n)
	jz	1f
	movdqa	(up), %xmm0
	movdqa	16(up), %xmm1
	lea	32(up), up
	pxor	%xmm5, %xmm0
	pxor	%xmm5, %xmm1
	movdqa	%xmm0, (rp)
	movdqa	%xmm1, 16(rp)
	lea	32(rp), rp

1:	test	$2, R8(n)
	jz	1f
	movdqa	(up), %xmm0
	lea	16(up), up
	pxor	%xmm5, %xmm0
	movdqa	%xmm0, (rp)
	lea	16(rp), rp

1:	test	$1, R8(n)
	jz	1f
	mov	(up), %r8
	not	%r8
	mov	%r8, (rp)

1:	FUNC_EXIT()
	ret

L(uent):
C Code handling up - rp = 8 (mod 16)

C FIXME: The code below only handles overlap if it is close to complete, or
C quite separate: up-rp < 5 or up-rp > 15 limbs
	lea	-40(up), %rax		C 40 = 5 * GMP_LIMB_BYTES
	sub	rp, %rax
	cmp	$80, %rax		C 80 = (15-5) * GMP_LIMB_BYTES
	jbe	L(bc)			C deflect to plain loop

	sub	$16, n
	jc	L(uend)

	movdqa	120(up), %xmm3

	sub	$16, n
	jmp	L(um)

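C In each step, palignr($8, %xmm_a, %xmm_b) concatenates %xmm_b:%xmm_a and
C shifts the pair right by 8 bytes, leaving the high qword of %xmm_a in the
C low half of %xmm_b and the old low qword of %xmm_b in its high half; this
C stitches each aligned 16-byte store together from two aligned loads.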
	ALIGN(16)
L(utop):movdqa	120(up), %xmm3
	pxor	%xmm5, %xmm0
	movdqa	%xmm0, -128(rp)
	sub	$16, n
L(um):	movdqa	104(up), %xmm2
	palignr($8, %xmm2, %xmm3)
	movdqa	88(up), %xmm1
	pxor	%xmm5, %xmm3
	movdqa	%xmm3, 112(rp)
	palignr($8, %xmm1, %xmm2)
	movdqa	72(up), %xmm0
	pxor	%xmm5, %xmm2
	movdqa	%xmm2, 96(rp)
	palignr($8, %xmm0, %xmm1)
	movdqa	56(up), %xmm3
	pxor	%xmm5, %xmm1
	movdqa	%xmm1, 80(rp)
	palignr($8, %xmm3, %xmm0)
	movdqa	40(up), %xmm2
	pxor	%xmm5, %xmm0
	movdqa	%xmm0, 64(rp)
	palignr($8, %xmm2, %xmm3)
	movdqa	24(up), %xmm1
	pxor	%xmm5, %xmm3
	movdqa	%xmm3, 48(rp)
	palignr($8, %xmm1, %xmm2)
	movdqa	8(up), %xmm0
	pxor	%xmm5, %xmm2
	movdqa	%xmm2, 32(rp)
	palignr($8, %xmm0, %xmm1)
	movdqa	-8(up), %xmm3
	pxor	%xmm5, %xmm1
	movdqa	%xmm1, 16(rp)
	palignr($8, %xmm3, %xmm0)
	lea	128(up), up
	lea	128(rp), rp
	jnc	L(utop)

	pxor	%xmm5, %xmm0
	movdqa	%xmm0, -128(rp)

L(uend):test	$8, R8(n)
	jz	1f
	movdqa	56(up), %xmm3
	movdqa	40(up), %xmm2
	palignr($8, %xmm2, %xmm3)
	movdqa	24(up), %xmm1
	pxor	%xmm5, %xmm3
	movdqa	%xmm3, 48(rp)
	palignr($8, %xmm1, %xmm2)
	movdqa	8(up), %xmm0
	pxor	%xmm5, %xmm2
	movdqa	%xmm2, 32(rp)
	palignr($8, %xmm0, %xmm1)
	movdqa	-8(up), %xmm3
	pxor	%xmm5, %xmm1
	movdqa	%xmm1, 16(rp)
	palignr($8, %xmm3, %xmm0)
	lea	64(up), up
	pxor	%xmm5, %xmm0
	movdqa	%xmm0, (rp)
	lea	64(rp), rp

1:	test	$4, R8(n)
	jz	1f
	movdqa	24(up), %xmm1
	movdqa	8(up), %xmm0
	palignr($8, %xmm0, %xmm1)
	movdqa	-8(up), %xmm3
	pxor	%xmm5, %xmm1
	movdqa	%xmm1, 16(rp)
	palignr($8, %xmm3, %xmm0)
	lea	32(up), up
	pxor	%xmm5, %xmm0
	movdqa	%xmm0, (rp)
	lea	32(rp), rp

1:	test	$2, R8(n)
	jz	1f
	movdqa	8(up), %xmm0
	movdqa	-8(up), %xmm3
	palignr($8, %xmm3, %xmm0)
	lea	16(up), up
	pxor	%xmm5, %xmm0
	movdqa	%xmm0, (rp)
	lea	16(rp), rp

1:	test	$1, R8(n)
	jz	1f
	mov	(up), %r8
	not	%r8
	mov	%r8, (rp)

1:	FUNC_EXIT()
	ret

C Basecase code.  Needed for good small-operand speed, not for correctness
C as the code above is currently written.
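C It complements four limbs per iteration with 64-bit not, then finishes the
C remaining 0-3 limbs according to the low two bits of n.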

L(bc):	lea	-8(rp), rp
	sub	$4, R32(n)
	jc	L(end)

ifelse(eval(1 || COM_SSE_THRESHOLD >= 8),1,
`	ALIGN(16)')
L(top):	mov	(up), %r8
	mov	8(up), %r9
	lea	32(rp), rp
	mov	16(up), %r10
	mov	24(up), %r11
	lea	32(up), up
	not	%r8
	not	%r9
	not	%r10
	not	%r11
	mov	%r8, -24(rp)
	mov	%r9, -16(rp)
ifelse(eval(1 || COM_SSE_THRESHOLD >= 8),1,
`	sub	$4, R32(n)')
	mov	%r10, -8(rp)
	mov	%r11, (rp)
ifelse(eval(1 || COM_SSE_THRESHOLD >= 8),1,
`	jnc	L(top)')

L(end):	test	$1, R8(n)
	jz	1f
	mov	(up), %r8
	not	%r8
	mov	%r8, 8(rp)
	lea	8(rp), rp
	lea	8(up), up
1:	test	$2, R8(n)
	jz	1f
	mov	(up), %r8
	mov	8(up), %r9
	not	%r8
	not	%r9
	mov	%r8, 8(rp)
	mov	%r9, 16(rp)
1:	FUNC_EXIT()
	ret
EPILOGUE()