dnl  AMD64 mpn_copyd optimised for CPUs with fast AVX.

dnl  Copyright 2003, 2005, 2007, 2011-2013, 2015 Free Software Foundation, Inc.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb     cycles/limb     cycles/limb      good
C              aligned	      unaligned	      best seen	     for cpu?
C AMD K8,K9	n/a
C AMD K10	n/a
C AMD bull	n/a
C AMD pile	 4.87		 4.87				N
C AMD steam	 ?		 ?
C AMD bobcat	n/a
C AMD jaguar	n/a
C Intel P4	n/a
C Intel core	n/a
C Intel NHM	n/a
C Intel SBR	 0.50		 0.91				N
C Intel IBR	 0.50		 0.65				N
C Intel HWL	 0.25		 0.30				Y
C Intel BWL	 0.28		 0.37				Y
C Intel atom	n/a
C VIA nano	n/a

C We try to do as many 32-byte operations as possible.  The top-most and
C bottom-most writes might need 8-byte operations.  For the bulk copying, we
C write using aligned 32-byte operations, but we read with both aligned and
C unaligned 32-byte operations.
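C
C For reference, the operation performed is that of the following plain C
C sketch (an illustration of the semantics only, not GMP's generic code;
C mp_ptr, mp_srcptr and mp_size_t are the types declared in gmp.h):
C
C	void
C	mpn_copyd (mp_ptr rp, mp_srcptr up, mp_size_t n)
C	{
C	  mp_size_t i;
C	  /* Copy from the top limb downwards, which keeps the copy correct
C	     for overlapping operands where rp is above up.  */
C	  for (i = n - 1; i >= 0; i--)
C	    rp[i] = up[i];
C	}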

define(`rp', `%rdi')
define(`up', `%rsi')
define(`n',  `%rdx')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

dnl define(`vmovdqu', vlddqu)

ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_copyd)
	FUNC_ENTRY(3)

	lea	-32(rp,n,8), rp
	lea	-32(up,n,8), up

	cmp	$7, n			C basecase needed for correctness
	jbe	L(bc)

	test	$8, R8(rp)		C is rp 16-byte aligned?
	jz	L(a2)			C jump if rp aligned
	mov	24(up), %rax
	lea	-8(up), up
	mov	%rax, 24(rp)
	lea	-8(rp), rp
	dec	n
L(a2):	test	$16, R8(rp)		C is rp 32-byte aligned?
	jz	L(a3)			C jump if rp aligned
	vmovdqu	16(up), %xmm0
	lea	-16(up), up
	vmovdqa	%xmm0, 16(rp)
	lea	-16(rp), rp
	sub	$2, n
L(a3):	sub	$16, n
	jc	L(sma)

	ALIGN(16)
L(top):	vmovdqu	(up), %ymm0
	vmovdqu	-32(up), %ymm1
	vmovdqu	-64(up), %ymm2
	vmovdqu	-96(up), %ymm3
	lea	-128(up), up
	vmovdqa	%ymm0, (rp)
	vmovdqa	%ymm1, -32(rp)
	vmovdqa	%ymm2, -64(rp)
	vmovdqa	%ymm3, -96(rp)
	lea	-128(rp), rp
L(ali):	sub	$16, n
	jnc	L(top)

L(sma):	test	$8, R8(n)
	jz	1f
	vmovdqu	(up), %ymm0
	vmovdqu	-32(up), %ymm1
	lea	-64(up), up
	vmovdqa	%ymm0, (rp)
	vmovdqa	%ymm1, -32(rp)
	lea	-64(rp), rp
1:
	test	$4, R8(n)
	jz	1f
	vmovdqu	(up), %ymm0
	lea	-32(up), up
	vmovdqa	%ymm0, (rp)
	lea	-32(rp), rp
1:
	test	$2, R8(n)
	jz	1f
	vmovdqu	16(up), %xmm0
	lea	-16(up), up
	vmovdqa	%xmm0, 16(rp)
	lea	-16(rp), rp
1:
	test	$1, R8(n)
	jz	1f
	mov	24(up), %r8
	mov	%r8, 24(rp)
1:
	FUNC_EXIT()
	ret
	ALIGN(16)
L(bc):	test	$4, R8(n)
	jz	1f
	mov	24(up), %rax
	mov	16(up), %rcx
	mov	8(up), %r8
	mov	(up), %r9
	lea	-32(up), up
	mov	%rax, 24(rp)
	mov	%rcx, 16(rp)
	mov	%r8, 8(rp)
	mov	%r9, (rp)
	lea	-32(rp), rp
1:
	test	$2, R8(n)
	jz	1f
	mov	24(up), %rax
	mov	16(up), %rcx
	lea	-16(up), up
	mov	%rax, 24(rp)
	mov	%rcx, 16(rp)
	lea	-16(rp), rp
1:
	test	$1, R8(n)
	jz	1f
	mov	24(up), %rax
	mov	%rax, 24(rp)
1:
	FUNC_EXIT()
	ret
EPILOGUE()