dnl  AMD64 mpn_copyd optimised for CPUs with fast SSE copying and SSSE3.

dnl  Copyright 2012, 2015 Free Software Foundation, Inc.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb     cycles/limb     cycles/limb      good
C              aligned	      unaligned	      best seen	     for cpu?
C AMD K8,K9	 2.0		 illop		1.0/1.0		N
C AMD K10	 0.85		 illop				Y/N
C AMD bull	 0.70		 0.70				Y
C AMD pile	 0.68		 0.68				Y
C AMD steam
C AMD excavator
C AMD bobcat	 1.97		 8.24		1.5/1.5		N
C AMD jaguar	 0.77		 0.89		0.65/opt	N/Y
C Intel P4	 2.26		 illop				Y/N
C Intel core	 0.52		 0.80		opt/opt		Y
C Intel NHM	 0.52		 0.64		opt/opt		Y
C Intel SBR	 0.51		 0.51		opt/opt		Y
C Intel IBR	 0.50		 0.50		opt/opt		Y
C Intel HWL	 0.50		 0.51		opt/opt		Y
C Intel BWL	 0.55		 0.55		opt/opt		Y
C Intel atom	 1.16		 1.66		opt/opt		Y
C Intel SLM	 1.02		 1.04		opt/opt		Y
C VIA nano	 1.08		 1.06		opt/opt		Y

C We use only 16-byte operations, except for unaligned top-most and bottom-most
C limbs.  We use the SSSE3 palignr instruction when rp - up = 8 (mod 16).
C
C For operands of <= COPYD_SSE_THRESHOLD limbs, we use a plain 64-bit loop,
C taken from the x86_64 default code.
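C
C A plain C sketch of the operation performed, for reference only (it
C mirrors the generic mpn/generic/copyd.c; copying in decreasing order
C makes the copy safe for overlapping operands with rp > up):
C
C	void
C	mpn_copyd (mp_ptr rp, mp_srcptr up, mp_size_t n)
C	{
C	  mp_size_t i;
C	  for (i = n - 1; i >= 0; i--)
C	    rp[i] = up[i];
C	}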

C INPUT PARAMETERS
define(`rp', `%rdi')
define(`up', `%rsi')
define(`n',  `%rdx')

C There are three instructions for loading an aligned 128-bit quantity:
C movdqa, movaps, and movapd.  We use movaps, since it has the shortest
C encoding.
define(`movdqa', ``movaps'')

ifdef(`COPYD_SSE_THRESHOLD',`',`define(`COPYD_SSE_THRESHOLD', 7)')
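C The ifdef guard lets a tuned value be supplied from the outside;
C operands of at most this many limbs take the 64-bit basecase loop.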

ASM_START()
	TEXT
	ALIGN(64)
PROLOGUE(mpn_copyd)
	FUNC_ENTRY(3)

	lea	-8(up,n,8), up		C point up at the highest limb
	lea	-8(rp,n,8), rp		C point rp at the highest limb

	cmp	$COPYD_SSE_THRESHOLD, n
	jbe	L(bc)

	test	$8, R8(rp)		C is rp 16-byte aligned?
	jnz	L(rp_aligned)		C jump if rp aligned

	mov	(up), %rax		C copy one limb
	mov	%rax, (rp)
	lea	-8(up), up
	lea	-8(rp), rp
	dec	n

L(rp_aligned):
	test	$8, R8(up)		C is up alignment the same as rp's?
	jz	L(uent)			C jump if not, to the palignr code

ifelse(eval(COPYD_SSE_THRESHOLD >= 8),1,
`	sub	$8, n',
`	jmp	L(am)')
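C With COPYD_SSE_THRESHOLD >= 8, n >= 8 is guaranteed here, so the loop
C runs at least once and we can fall straight into it; with a smaller
C threshold we enter at L(am) and let the count check decide.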

	ALIGN(16)
L(atop):movdqa	-8(up), %xmm0
	movdqa	-24(up), %xmm1
	movdqa	-40(up), %xmm2
	movdqa	-56(up), %xmm3
	lea	-64(up), up
	movdqa	%xmm0, -8(rp)
	movdqa	%xmm1, -24(rp)
	movdqa	%xmm2, -40(rp)
	movdqa	%xmm3, -56(rp)
	lea	-64(rp), rp
L(am):	sub	$8, n
	jnc	L(atop)
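
C Handle the remaining 0-7 limbs: 4, 2, and finally 1 at a time.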

	test	$4, R8(n)
	jz	1f
	movdqa	-8(up), %xmm0
	movdqa	-24(up), %xmm1
	lea	-32(up), up
	movdqa	%xmm0, -8(rp)
	movdqa	%xmm1, -24(rp)
	lea	-32(rp), rp

1:	test	$2, R8(n)
	jz	1f
	movdqa	-8(up), %xmm0
	lea	-16(up), up
	movdqa	%xmm0, -8(rp)
	lea	-16(rp), rp

1:	test	$1, R8(n)
	jz	1f
	mov	(up), %r8
	mov	%r8, (rp)

1:	FUNC_EXIT()
	ret

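C The rp - up = 8 (mod 16) case.  All 16-byte loads and stores below are
C aligned; palignr($8, %xmmS, %xmmD) replaces xmmD with the old low limb
C of xmmD (as the high half) next to the high limb of xmmS (as the low
C half), i.e. the limb pair straddling a source alignment boundary,
C ready for an aligned store.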
L(uent):sub	$16, n
	movdqa	(up), %xmm0
	jc	L(uend)

	ALIGN(16)
L(utop):sub	$16, n
	movdqa	-16(up), %xmm1
	palignr($8, %xmm1, %xmm0)
	movdqa	%xmm0, -8(rp)
	movdqa	-32(up), %xmm2
	palignr($8, %xmm2, %xmm1)
	movdqa	%xmm1, -24(rp)
	movdqa	-48(up), %xmm3
	palignr($8, %xmm3, %xmm2)
	movdqa	%xmm2, -40(rp)
	movdqa	-64(up), %xmm0
	palignr($8, %xmm0, %xmm3)
	movdqa	%xmm3, -56(rp)
	movdqa	-80(up), %xmm1
	palignr($8, %xmm1, %xmm0)
	movdqa	%xmm0, -72(rp)
	movdqa	-96(up), %xmm2
	palignr($8, %xmm2, %xmm1)
	movdqa	%xmm1, -88(rp)
	movdqa	-112(up), %xmm3
	palignr($8, %xmm3, %xmm2)
	movdqa	%xmm2, -104(rp)
	movdqa	-128(up), %xmm0
	palignr($8, %xmm0, %xmm3)
	movdqa	%xmm3, -120(rp)
	lea	-128(up), up
	lea	-128(rp), rp
	jnc	L(utop)

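C Wind down: handle the remaining n mod 16 limbs, 8, 4, 2, and 1 at a
C time.  n has gone negative, but its low four bits still hold the
C residue, so the bit tests below remain valid.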
L(uend):test	$8, R8(n)
	jz	1f
	movdqa	-16(up), %xmm1
	palignr($8, %xmm1, %xmm0)
	movdqa	%xmm0, -8(rp)
	movdqa	-32(up), %xmm0
	palignr($8, %xmm0, %xmm1)
	movdqa	%xmm1, -24(rp)
	movdqa	-48(up), %xmm1
	palignr($8, %xmm1, %xmm0)
	movdqa	%xmm0, -40(rp)
	movdqa	-64(up), %xmm0
	palignr($8, %xmm0, %xmm1)
	movdqa	%xmm1, -56(rp)
	lea	-64(up), up
	lea	-64(rp), rp

1:	test	$4, R8(n)
	jz	1f
	movdqa	-16(up), %xmm1
	palignr($8, %xmm1, %xmm0)
	movdqa	%xmm0, -8(rp)
	movdqa	-32(up), %xmm0
	palignr($8, %xmm0, %xmm1)
	movdqa	%xmm1, -24(rp)
	lea	-32(up), up
	lea	-32(rp), rp

1:	test	$2, R8(n)
	jz	1f
	movdqa	-16(up), %xmm1
	palignr($8, %xmm1, %xmm0)
	movdqa	%xmm0, -8(rp)
	lea	-16(up), up
	lea	-16(rp), rp

1:	test	$1, R8(n)
	jz	1f
	mov	(up), %r8
	mov	%r8, (rp)

1:	FUNC_EXIT()
	ret

C Basecase code.  Needed for good speed on small operands, not for
C correctness as the code above is currently written.
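C It copies four limbs per iteration; the remaining 0-3 limbs are
C handled by the bit tests at L(end).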

L(bc):	sub	$4, R32(n)
	jc	L(end)

	ALIGN(16)
L(top):	mov	(up), %r8
	mov	-8(up), %r9
	lea	-32(rp), rp
	mov	-16(up), %r10
	mov	-24(up), %r11
	lea	-32(up), up
	mov	%r8, 32(rp)
	mov	%r9, 24(rp)
ifelse(eval(COPYD_SSE_THRESHOLD >= 8),1,
`	sub	$4, R32(n)')
	mov	%r10, 16(rp)
	mov	%r11, 8(rp)
ifelse(eval(COPYD_SSE_THRESHOLD >= 8),1,
`	jnc	L(top)')
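
C The loop-closing sub/jnc above are emitted only when the threshold
C allows the basecase to see n >= 8; with the default threshold of 7 the
C unrolled body can run at most once, so no loop is needed.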

L(end):	test	$1, R8(n)
	jz	1f
	mov	(up), %r8
	mov	%r8, (rp)
	lea	-8(rp), rp
	lea	-8(up), up
1:	test	$2, R8(n)
	jz	1f
	mov	(up), %r8
	mov	-8(up), %r9
	mov	%r8, (rp)
	mov	%r9, -8(rp)
1:	FUNC_EXIT()
	ret
EPILOGUE()