1dnl  AMD64 mpn_copyi optimised for CPUs with fast AVX.
2
3dnl  Copyright 2003, 2005, 2007, 2011-2013, 2015 Free Software Foundation, Inc.
4
dnl  Contributed to the GNU project by Torbjörn Granlund.
6
7dnl  This file is part of the GNU MP Library.
8dnl
9dnl  The GNU MP Library is free software; you can redistribute it and/or modify
10dnl  it under the terms of either:
11dnl
12dnl    * the GNU Lesser General Public License as published by the Free
13dnl      Software Foundation; either version 3 of the License, or (at your
14dnl      option) any later version.
15dnl
16dnl  or
17dnl
18dnl    * the GNU General Public License as published by the Free Software
19dnl      Foundation; either version 2 of the License, or (at your option) any
20dnl      later version.
21dnl
22dnl  or both in parallel, as here.
23dnl
24dnl  The GNU MP Library is distributed in the hope that it will be useful, but
25dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
27dnl  for more details.
28dnl
29dnl  You should have received copies of the GNU General Public License and the
30dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
31dnl  see https://www.gnu.org/licenses/.
32
33include(`../config.m4')
34
35C	     cycles/limb     cycles/limb     cycles/limb      good
36C              aligned	      unaligned	      best seen	     for cpu?
37C AMD K8,K9	n/a
38C AMD K10	n/a
39C AMD bull	n/a
40C AMD pile	 4.87		 4.87				N
41C AMD steam	 ?		 ?
42C AMD bobcat	n/a
43C AMD jaguar	n/a
44C Intel P4	n/a
45C Intel core	n/a
46C Intel NHM	n/a
47C Intel SBR	 0.50		 0.91				N
48C Intel IBR	 0.50		 0.65				N
49C Intel HWL	 0.25		 0.30				Y
50C Intel BWL	 0.28		 0.37				Y
51C Intel atom	n/a
52C VIA nano	n/a
53
54C We try to do as many 32-byte operations as possible.  The top-most and
55C bottom-most writes might need 8-byte operations.  For the bulk copying, we
56C write using aligned 32-byte operations, but we read with both aligned and
57C unaligned 32-byte operations.
58
dnl  Register allocation (SysV AMD64 argument registers; DOS64 entry is
dnl  remapped by FUNC_ENTRY below):
dnl    rp  destination limb pointer  (arg 1)
dnl    up  source limb pointer       (arg 2)
dnl    n   limb count                (arg 3)
define(`rp', `%rdi')
define(`up', `%rsi')
define(`n',  `%rdx')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

dnl  Debug/experiment hook: read with vlddqu instead of vmovdqu.
dnl define(`vmovdqu', vlddqu)
67
ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_copyi)
C void mpn_copyi (mp_ptr rp, mp_srcptr up, mp_size_t n)
C Copy n 64-bit limbs from up to rp, ascending (low limbs first).
C Reads may be unaligned (vmovdqu); bulk 32-byte writes are made
C aligned (vmovdqa) by first peeling off up to 3 leading limbs.
C Clobbers: rax, r8, r9, rcx, xmm0, ymm0-ymm3, flags.
C NOTE(review): ymm registers are used but no vzeroupper is issued
C before return — presumably accepted AVX/SSE-transition cost in this
C era of GMP; confirm against current GMP conventions.
	FUNC_ENTRY(3)

	cmp	$7, n
	jbe	L(bc)			C n <= 7: plain 64-bit moves only

C Peel leading limbs until rp is 32-byte aligned (n >= 8 here, so the
C up-to-3-limb peel cannot exhaust the operand).
	test	$8, R8(rp)		C is rp 16-byte aligned?
	jz	L(a2)			C jump if rp aligned
	mov	(up), %rax
	lea	8(up), up
	mov	%rax, (rp)
	lea	8(rp), rp
	dec	n
L(a2):	test	$16, R8(rp)		C is rp 32-byte aligned?
	jz	L(a3)			C jump if rp aligned
	vmovdqu	(up), %xmm0
	lea	16(up), up
	vmovdqa	%xmm0, (rp)
	lea	16(rp), rp
	sub	$2, n
L(a3):	sub	$16, n			C carry set iff fewer than 16 limbs left
	jc	L(sma)

C Main loop: 16 limbs (128 bytes) per iteration, 4 x 32-byte
C unaligned reads, 4 x 32-byte aligned writes.
	ALIGN(16)
L(top):	vmovdqu	(up), %ymm0
	vmovdqu	32(up), %ymm1
	vmovdqu	64(up), %ymm2
	vmovdqu	96(up), %ymm3
	lea	128(up), up
	vmovdqa	%ymm0, (rp)
	vmovdqa	%ymm1, 32(rp)
	vmovdqa	%ymm2, 64(rp)
	vmovdqa	%ymm3, 96(rp)
	lea	128(rp), rp
L(ali):	sub	$16, n
	jnc	L(top)

C Tail: n is now (true remainder - 16), but subtracting 16 leaves the
C low 4 bits intact, so testing bits 8/4/2/1 of n dispatches on the
C 0..15 remaining limbs.
L(sma):	test	$8, R8(n)
	jz	1f
	vmovdqu	(up), %ymm0
	vmovdqu	32(up), %ymm1
	lea	64(up), up
	vmovdqa	%ymm0, (rp)
	vmovdqa	%ymm1, 32(rp)
	lea	64(rp), rp
1:
	test	$4, R8(n)
	jz	1f
	vmovdqu	(up), %ymm0
	lea	32(up), up
	vmovdqa	%ymm0, (rp)
	lea	32(rp), rp
1:
	test	$2, R8(n)
	jz	1f
	vmovdqu	(up), %xmm0
	lea	16(up), up
	vmovdqa	%xmm0, (rp)
	lea	16(rp), rp
1:
L(end):	test	$1, R8(n)
	jz	1f
	mov	(up), %r8
	mov	%r8, (rp)
1:
	FUNC_EXIT()
	ret

C Basecase for n <= 7: dispatch on bits 4/2/1 of n with straight
C 64-bit moves; no alignment work needed.
	ALIGN(16)
L(bc):	test	$4, R8(n)
	jz	1f
	mov	(up), %rax
	mov	8(up), %rcx
	mov	16(up), %r8
	mov	24(up), %r9
	lea	32(up), up
	mov	%rax, (rp)
	mov	%rcx, 8(rp)
	mov	%r8, 16(rp)
	mov	%r9, 24(rp)
	lea	32(rp), rp
1:
	test	$2, R8(n)
	jz	1f
	mov	(up), %rax
	mov	8(up), %rcx
	lea	16(up), up
	mov	%rax, (rp)
	mov	%rcx, 8(rp)
	lea	16(rp), rp
1:
	test	$1, R8(n)
	jz	1f
	mov	(up), %rax
	mov	%rax, (rp)
1:
	FUNC_EXIT()
	ret
EPILOGUE()
170