1dnl  Intel Pentium mpn_copyi -- copy limb vector, incrementing.
2
3dnl  Copyright 1996, 2001, 2002, 2006 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33
34C P5: 1.25 cycles/limb
35
36
37C void mpn_copyi (mp_ptr dst, mp_srcptr src, mp_size_t size);
38C
39C Destination prefetching is done to avoid repeated write-throughs on lines
40C not already in L1.
41C
42C At least one of the src or dst pointers needs to be incremented rather than
43C using indexing, so that there's somewhere to put the loop control without
44C an AGI (address generation interlock).  Incrementing one and not two lets
45C us keep loop overhead to 2 cycles.  Incrementing the src pointer, rather
46C than dst, avoids an AGI on the %ecx subtracts in the finishup code.
47C
48C The block of finishup code is almost as big as the main loop itself, which
49C is unfortunate, but it's faster that way than with say rep movsl, by about
50C 10 cycles for instance on P55.
51C
52C There's nothing to be gained from MMX on P55, since it can do only one
53C movq load (or store) per cycle, so the throughput would be the same as the
54C code here (and even then only if src and dst have the same alignment mod
55C 8).
56
57defframe(PARAM_SIZE,12)
58defframe(PARAM_SRC, 8)
59defframe(PARAM_DST, 4)
60
61	TEXT
62	ALIGN(8)
63PROLOGUE(mpn_copyi)
64deflit(`FRAME',0)
65
66	movl	PARAM_SIZE, %ecx	C size, in limbs
67	movl	PARAM_DST, %edx
68
69	pushl	%ebx	FRAME_pushl()
70	pushl	%esi	FRAME_pushl()
71
72	leal	(%edx,%ecx,4), %edx	C &dst[size], one past the last limb
73	xorl	$-1, %ecx		C -size-1
74
75	movl	PARAM_SRC, %esi
76	addl	$8, %ecx		C -size+7
77
78	jns	L(end)			C size <= 7, no full block of 8 limbs
79
80	movl	-28(%edx,%ecx,4), %eax	C fetch destination cache line, dst[0]
81	nop
82
83L(top):
84	C eax	scratch
85	C ebx	scratch
86	C ecx	counter, 7 minus limbs remaining, negative
87	C edx	&dst[size], fixed
88	C esi	src, incrementing
89	C edi
90	C ebp
91
92	movl	(%edx,%ecx,4), %eax	C fetch destination cache line, 8th limb of this block
93	addl	$8, %ecx		C flags from here are tested by the js below
94
95	movl	(%esi), %eax		C read words pairwise
96	movl	4(%esi), %ebx
97	movl	%eax, -60(%edx,%ecx,4)	C store words pairwise
98	movl	%ebx, -56(%edx,%ecx,4)
99
100	movl	8(%esi), %eax
101	movl	12(%esi), %ebx
102	movl	%eax, -52(%edx,%ecx,4)
103	movl	%ebx, -48(%edx,%ecx,4)
104
105	movl	16(%esi), %eax
106	movl	20(%esi), %ebx
107	movl	%eax, -44(%edx,%ecx,4)
108	movl	%ebx, -40(%edx,%ecx,4)
109
110	movl	24(%esi), %eax
111	movl	28(%esi), %ebx
112	movl	%eax, -36(%edx,%ecx,4)
113	movl	%ebx, -32(%edx,%ecx,4)
114
115	leal	32(%esi), %esi		C src += 8 limbs, lea keeps the addl flags
116	js	L(top)			C loop while 8 or more limbs remain
117
118
119L(end):
120	C ecx	0 to 7, representing respectively 7 to 0 limbs remaining
121	C esi	src, next location to read
122	C edx	&dst[size], dst end (stores index back from here via ecx)
123
124	subl	$4, %ecx
125	jns	L(no4)			C fewer than 4 limbs remaining
126
127	movl	(%esi), %eax
128	movl	4(%esi), %ebx
129	movl	%eax, -12(%edx,%ecx,4)
130	movl	%ebx, -8(%edx,%ecx,4)
131
132	movl	8(%esi), %eax
133	movl	12(%esi), %ebx
134	movl	%eax, -4(%edx,%ecx,4)
135	movl	%ebx, (%edx,%ecx,4)
136
137	addl	$16, %esi
138	addl	$4, %ecx		C ecx 0 to 3, meaning 3 to 0 limbs remaining
139L(no4):
140
141	subl	$2, %ecx
142	jns	L(no2)			C fewer than 2 limbs remaining
143
144	movl	(%esi), %eax
145	movl	4(%esi), %ebx
146	movl	%eax, -4(%edx,%ecx,4)
147	movl	%ebx, (%edx,%ecx,4)
148
149	addl	$8, %esi
150	addl	$2, %ecx		C ecx 0 or 1, meaning 1 or 0 limbs remaining
151L(no2):
152
153	jnz	L(done)			C ZF from the subl or addl above, ecx==0 means one limb left
154
155	movl	(%esi), %eax
156	movl	%eax, -4(%edx,%ecx,4)	C risk of cache bank clash here
157
158L(done):
159	popl	%esi
160	popl	%ebx
161
162	ret
163
164EPILOGUE()
165