dnl  AMD64 mpn_copyi optimised for CPUs with fast SSE copying and SSSE3.

dnl  Copyright 2012, 2013, 2015 Free Software Foundation, Inc.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb     cycles/limb     cycles/limb      good
C              aligned	      unaligned	      best seen	     for cpu?
C AMD K8,K9	 2.0		 illop		1.0/1.0		N
C AMD K10	 0.85		 illop				Y/N
C AMD bd1	 0.70		 0.66				Y
C AMD bd2	 0.68		 0.66				Y
C AMD bd3	 ?		 ?
C AMD bd4	 ?		 ?
C AMD bt1	 1.97		 8.16		1.5/1.5		N
C AMD bt2	 0.77		 0.93		0.65/opt	N/Y
C AMD zn1	 ?		 ?
C AMD zn2	 ?		 ?
C Intel P4	 2.26		 illop				Y/N
C Intel CNR	 0.52		 0.64		opt/opt		Y
C Intel NHM	 0.52		 0.71		0.50/0.67	N
C Intel SBR	 0.51		 0.54		opt/0.51	Y
C Intel IBR	 0.50		 0.54		opt/opt		Y
C Intel HWL	 0.50		 0.51		opt/opt		Y
C Intel BWL	 0.55		 0.55		opt/opt		Y
C Intel atom	 1.16		 1.61		opt/opt		Y
C Intel SLM	 1.02		 1.07		opt/opt		Y
C VIA nano	 1.09		 1.08		opt/opt		Y

C We use only 16-byte operations, except for unaligned top-most and bottom-most
C limbs.  We use the SSSE3 palignr instruction when rp - up = 8 (mod 16).  That
C instruction is better adapted to mpn_copyd's needs, so we need to contort the
C code to use it here.
C
C For operands of at most COPYI_SSE_THRESHOLD limbs, we use a plain 64-bit
C loop, taken from the x86_64 default code.
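C
C For reference, the operation is a plain limb copy in increasing index
C order; a minimal C sketch (ref_copyi is just an illustrative name, not
C part of GMP):
C
C	void
C	ref_copyi (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n)
C	{
C	  mp_size_t i;
C	  for (i = 0; i < n; i++)
C	    rp[i] = up[i];
C	}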

C INPUT PARAMETERS
define(`rp', `%rdi')
define(`up', `%rsi')
define(`n',  `%rdx')

C There are three instructions for loading an aligned 128-bit quantity.  We use
C movaps, since it has the shortest coding.
dnl define(`movdqa', ``movaps'')
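C (The three are movdqa, movapd, and movaps; movaps lacks the 66h
C operand-size prefix and so has the shortest encoding.)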

ifdef(`COPYI_SSE_THRESHOLD',`',`define(`COPYI_SSE_THRESHOLD', 7)')

ASM_START()
	TEXT
	ALIGN(64)
PROLOGUE(mpn_copyi)
	FUNC_ENTRY(3)

	cmp	$COPYI_SSE_THRESHOLD, n
	jbe	L(bc)

	test	$8, R8(rp)		C is rp 16-byte aligned?
	jz	L(rp_aligned)		C jump if rp aligned

	movsq				C copy one limb
	dec	n

L(rp_aligned):
	test	$8, R8(up)
	jnz	L(uent)

ifelse(eval(COPYI_SSE_THRESHOLD >= 8),1,
`	sub	$8, n',
`	jmp	L(am)')
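C (If the threshold guarantees that at least 8 limbs remain at this point,
C we can fall straight into the unrolled loop; otherwise enter at the size
C check at L(am).)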

	ALIGN(16)
L(atop):movdqa	0(up), %xmm0
	movdqa	16(up), %xmm1
	movdqa	32(up), %xmm2
	movdqa	48(up), %xmm3
	lea	64(up), up
	movdqa	%xmm0, (rp)
	movdqa	%xmm1, 16(rp)
	movdqa	%xmm2, 32(rp)
	movdqa	%xmm3, 48(rp)
	lea	64(rp), rp
L(am):	sub	$8, n
	jnc	L(atop)

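C Copy the remaining n mod 8 limbs in 4-, 2- and 1-limb pieces.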
	test	$4, R8(n)
	jz	1f
	movdqa	(up), %xmm0
	movdqa	16(up), %xmm1
	lea	32(up), up
	movdqa	%xmm0, (rp)
	movdqa	%xmm1, 16(rp)
	lea	32(rp), rp

1:	test	$2, R8(n)
	jz	1f
	movdqa	(up), %xmm0
	lea	16(up), up
	movdqa	%xmm0, (rp)
	lea	16(rp), rp

1:	test	$1, R8(n)
	jz	1f
	mov	(up), %r8
	mov	%r8, (rp)

1:	FUNC_EXIT()
	ret

L(uent):
C Code handling up - rp = 8 (mod 16)
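C
C Each aligned 16-byte store below is assembled from two aligned loads:
C palignr($8, %xmm6, %xmm7), for instance, sets the low qword of %xmm7 to
C the high qword of %xmm6 and the high qword of %xmm7 to its old low qword,
C i.e. the 16 source bytes starting 8 bytes below the address %xmm7 was
C loaded from.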

	cmp	$16, n
	jc	L(ued0)

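C Win64 ABI: %xmm6-%xmm8 are callee-saved, so spill them (IFDOS only).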
IFDOS(`	add	$-56, %rsp	')
IFDOS(`	movdqa	%xmm6, (%rsp)	')
IFDOS(`	movdqa	%xmm7, 16(%rsp)	')
IFDOS(`	movdqa	%xmm8, 32(%rsp)	')

	movaps	120(up), %xmm7
	movaps	104(up), %xmm6
	movaps	88(up), %xmm5
	movaps	72(up), %xmm4
	movaps	56(up), %xmm3
	movaps	40(up), %xmm2
	lea	128(up), up
	sub	$32, n
	jc	L(ued1)

	ALIGN(16)
L(utop):movaps	-104(up), %xmm1
	sub	$16, n
	movaps	-120(up), %xmm0
	palignr($8, %xmm6, %xmm7)
	movaps	-136(up), %xmm8
	movdqa	%xmm7, 112(rp)
	palignr($8, %xmm5, %xmm6)
	movaps	120(up), %xmm7
	movdqa	%xmm6, 96(rp)
	palignr($8, %xmm4, %xmm5)
	movaps	104(up), %xmm6
	movdqa	%xmm5, 80(rp)
	palignr($8, %xmm3, %xmm4)
	movaps	88(up), %xmm5
	movdqa	%xmm4, 64(rp)
	palignr($8, %xmm2, %xmm3)
	movaps	72(up), %xmm4
	movdqa	%xmm3, 48(rp)
	palignr($8, %xmm1, %xmm2)
	movaps	56(up), %xmm3
	movdqa	%xmm2, 32(rp)
	palignr($8, %xmm0, %xmm1)
	movaps	40(up), %xmm2
	movdqa	%xmm1, 16(rp)
	palignr($8, %xmm8, %xmm0)
	lea	128(up), up
	movdqa	%xmm0, (rp)
	lea	128(rp), rp
	jnc	L(utop)

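C Wind down: finish the 16-limb block already partly held in registers,
C without loading past the end of the operand.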
L(ued1):movaps	-104(up), %xmm1
	movaps	-120(up), %xmm0
	movaps	-136(up), %xmm8
	palignr($8, %xmm6, %xmm7)
	movdqa	%xmm7, 112(rp)
	palignr($8, %xmm5, %xmm6)
	movdqa	%xmm6, 96(rp)
	palignr($8, %xmm4, %xmm5)
	movdqa	%xmm5, 80(rp)
	palignr($8, %xmm3, %xmm4)
	movdqa	%xmm4, 64(rp)
	palignr($8, %xmm2, %xmm3)
	movdqa	%xmm3, 48(rp)
	palignr($8, %xmm1, %xmm2)
	movdqa	%xmm2, 32(rp)
	palignr($8, %xmm0, %xmm1)
	movdqa	%xmm1, 16(rp)
	palignr($8, %xmm8, %xmm0)
	movdqa	%xmm0, (rp)
	lea	128(rp), rp

IFDOS(`	movdqa	(%rsp), %xmm6	')
IFDOS(`	movdqa	16(%rsp), %xmm7	')
IFDOS(`	movdqa	32(%rsp), %xmm8	')
IFDOS(`	add	$56, %rsp	')

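C Copy the remaining n mod 16 limbs in 8-, 4-, 2- and 1-limb pieces.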
L(ued0):test	$8, R8(n)
	jz	1f
	movaps	56(up), %xmm3
	movaps	40(up), %xmm2
	movaps	24(up), %xmm1
	movaps	8(up), %xmm0
	movaps	-8(up), %xmm4
	palignr($8, %xmm2, %xmm3)
	movdqa	%xmm3, 48(rp)
	palignr($8, %xmm1, %xmm2)
	movdqa	%xmm2, 32(rp)
	palignr($8, %xmm0, %xmm1)
	movdqa	%xmm1, 16(rp)
	palignr($8, %xmm4, %xmm0)
	lea	64(up), up
	movdqa	%xmm0, (rp)
	lea	64(rp), rp

1:	test	$4, R8(n)
	jz	1f
	movaps	24(up), %xmm1
	movaps	8(up), %xmm0
	palignr($8, %xmm0, %xmm1)
	movaps	-8(up), %xmm3
	movdqa	%xmm1, 16(rp)
	palignr($8, %xmm3, %xmm0)
	lea	32(up), up
	movdqa	%xmm0, (rp)
	lea	32(rp), rp

1:	test	$2, R8(n)
	jz	1f
	movdqa	8(up), %xmm0
	movdqa	-8(up), %xmm3
	palignr($8, %xmm3, %xmm0)
	lea	16(up), up
	movdqa	%xmm0, (rp)
	lea	16(rp), rp

1:	test	$1, R8(n)
	jz	1f
	mov	(up), %r8
	mov	%r8, (rp)

1:	FUNC_EXIT()
	ret

C Basecase code.  Needed for good speed on small operands, not for
C correctness as the above code is currently written.
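C (Reached when n <= COPYI_SSE_THRESHOLD, i.e. n <= 7 with the default
C threshold above.)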

L(bc):	lea	-8(rp), rp
	sub	$4, R32(n)
	jc	L(end)

	ALIGN(16)
L(top):	mov	(up), %r8
	mov	8(up), %r9
	lea	32(rp), rp
	mov	16(up), %r10
	mov	24(up), %r11
	lea	32(up), up
	mov	%r8, -24(rp)
	mov	%r9, -16(rp)
ifelse(eval(COPYI_SSE_THRESHOLD >= 8),1,
`	sub	$4, R32(n)')
	mov	%r10, -8(rp)
	mov	%r11, (rp)
ifelse(eval(COPYI_SSE_THRESHOLD >= 8),1,
`	jnc	L(top)')

L(end):	test	$1, R8(n)
	jz	1f
	mov	(up), %r8
	mov	%r8, 8(rp)
	lea	8(rp), rp
	lea	8(up), up
1:	test	$2, R8(n)
	jz	1f
	mov	(up), %r8
	mov	8(up), %r9
	mov	%r8, 8(rp)
	mov	%r9, 16(rp)
1:	FUNC_EXIT()
	ret
EPILOGUE()