dnl  AMD64 mpn_copyi optimised for CPUs with fast SSE.

dnl  Copyright 2003, 2005, 2007, 2011, 2012, 2015 Free Software Foundation,
dnl  Inc.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb     cycles/limb     cycles/limb      good
C              aligned	      unaligned	      best seen	     for cpu?
C AMD K8,K9
C AMD K10	 0.85		 1.64				Y/N
C AMD bull	 1.4		 1.4				N
C AMD pile	 0.77		 0.93				N
C AMD steam	 ?		 ?
C AMD excavator	 ?		 ?
C AMD bobcat
C AMD jaguar	 0.65		 1.02		opt/0.93	Y/N
C Intel P4	 2.3		 2.3				Y
C Intel core	 1.0		 1.0		0.52/0.64	N
C Intel NHM	 0.5		 0.67				Y
C Intel SBR	 0.51		 0.75		opt/0.54	Y/N
C Intel IBR	 0.50		 0.57		opt/0.54	Y
C Intel HWL	 0.50		 0.57		opt/0.51	Y
C Intel BWL	 0.55		 0.62		opt/0.55	Y
C Intel atom
C Intel SLM	 1.02		 1.27		opt/1.07	Y/N
C VIA nano	 1.16		 5.16				Y/N

C We try to do as many 16-byte operations as possible.  The top-most and
C bottom-most writes might need 8-byte operations.  We can always write using
C aligned 16-byte operations; we read with both aligned and unaligned 16-byte
C operations.
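C The code assumes rp is at least 8-byte (limb) aligned, so testing just bit
C 3 of rp and doing a single movsq is enough to reach 16-byte alignment for
C the stores.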

C Instead of having separate loops for aligned and unaligned reads, we always
C read with MOVDQU.  This seems to work well except on core2, where performance
C doubles when an aligned source is read with MOVDQA.  It is unclear how best
C to handle the unaligned case there.
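C
C For reference, the routine implements the equivalent of the following C
C loop (a sketch only, not part of the build; mp_ptr, mp_srcptr and
C mp_size_t are the usual GMP limb-pointer and size types):
C
C	void
C	mpn_copyi (mp_ptr rp, mp_srcptr up, mp_size_t n)
C	{
C	  mp_size_t i;
C	  for (i = 0; i < n; i++)
C	    rp[i] = up[i];	/* copy limbs from low addresses upwards */
C	}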

C INPUT PARAMETERS
define(`rp', `%rdi')
define(`up', `%rsi')
define(`n',  `%rdx')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

dnl define(`movdqu', lddqu)

ASM_START()
	TEXT
	ALIGN(64)
PROLOGUE(mpn_copyi)
	FUNC_ENTRY(3)

	cmp	$3, n			C NB: bc code below assumes this limit
	jc	L(bc)

	test	$8, R8(rp)		C is rp 16-byte aligned?
	jz	L(ali)			C jump if rp aligned
	movsq				C copy single limb
	dec	n

L(ali):	sub	$16, n
	jc	L(sma)

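C In the DOS64 (Windows x64) calling convention %xmm6 and %xmm7 are
C callee-saved registers, so spill them before the main loop clobbers them.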
IFDOS(`	add	$-56, %rsp	')
IFDOS(`	movdqa	%xmm6, (%rsp)	')
IFDOS(`	movdqa	%xmm7, 16(%rsp)	')

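C Main loop: copy 16 limbs (128 bytes) per iteration, using eight possibly
C unaligned 16-byte loads and eight aligned 16-byte stores.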
	ALIGN(16)
L(top):	movdqu	(up), %xmm0
	movdqu	16(up), %xmm1
	movdqu	32(up), %xmm2
	movdqu	48(up), %xmm3
	movdqu	64(up), %xmm4
	movdqu	80(up), %xmm5
	movdqu	96(up), %xmm6
	movdqu	112(up), %xmm7
	lea	128(up), up
	movdqa	%xmm0, (rp)
	movdqa	%xmm1, 16(rp)
	movdqa	%xmm2, 32(rp)
	movdqa	%xmm3, 48(rp)
	movdqa	%xmm4, 64(rp)
	movdqa	%xmm5, 80(rp)
	movdqa	%xmm6, 96(rp)
	movdqa	%xmm7, 112(rp)
	lea	128(rp), rp
	sub	$16, n
	jnc	L(top)

IFDOS(`	movdqa	(%rsp), %xmm6	')
IFDOS(`	movdqa	16(%rsp), %xmm7	')
IFDOS(`	add	$56, %rsp	')

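C Wind-down: the low four bits of n still hold the number of limbs left to
C copy (0-15); handle them by testing bit 3 down to bit 0.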
L(sma):	test	$8, R8(n)
	jz	1f
	movdqu	(up), %xmm0
	movdqu	16(up), %xmm1
	movdqu	32(up), %xmm2
	movdqu	48(up), %xmm3
	lea	64(up), up
	movdqa	%xmm0, (rp)
	movdqa	%xmm1, 16(rp)
	movdqa	%xmm2, 32(rp)
	movdqa	%xmm3, 48(rp)
	lea	64(rp), rp
1:
	test	$4, R8(n)
	jz	1f
	movdqu	(up), %xmm0
	movdqu	16(up), %xmm1
	lea	32(up), up
	movdqa	%xmm0, (rp)
	movdqa	%xmm1, 16(rp)
	lea	32(rp), rp
1:
	test	$2, R8(n)
	jz	1f
	movdqu	(up), %xmm0
	lea	16(up), up
	movdqa	%xmm0, (rp)
	lea	16(rp), rp
	ALIGN(16)
1:
L(end):	test	$1, R8(n)
	jz	1f
	mov	(up), %r8
	mov	%r8, (rp)
1:
	FUNC_EXIT()
	ret

C Basecase code.  Needed for good small-operand speed, not for correctness as
C the above code is currently written.  The commented-out lines need to be
C reinstated if this code is to be used for n > 3, and then the post-loop
C offsets need fixing.

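C With the n < 3 limit enforced above, this path copies 0, 1 or 2 limbs;
C counts of 0 and 1 are finished off via the L(end) tail above.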
L(bc):	sub	$2, n
	jc	L(end)
	ALIGN(16)
1:	mov	(up), %rax
	mov	8(up), %rcx
dnl	lea	16(up), up
	mov	%rax, (rp)
	mov	%rcx, 8(rp)
dnl	lea	16(rp), rp
dnl	sub	$2, n
dnl	jnc	1b

	test	$1, R8(n)
	jz	L(ret)
	mov	16(up), %rax
	mov	%rax, 16(rp)
L(ret):	FUNC_EXIT()
	ret
EPILOGUE()
