dnl  Intel Pentium mpn_copyi -- copy limb vector, incrementing.

dnl  Copyright 1996, 2001, 2002, 2006 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C P5: 1.25 cycles/limb


C void mpn_copyi (mp_ptr dst, mp_srcptr src, mp_size_t size);
C
C Copy size limbs (4 bytes each) from src to dst, starting at the lowest
C address and working upwards.  Eight limbs are moved per pass of the
C main loop, then a tail of 0 to 7 limbs is moved in steps of 4, 2 and 1.
C A size of zero falls straight through to the exit without touching
C memory.  Registers ebx and esi are saved on the stack and restored
C before returning; eax, ecx and edx are overwritten.
C
C Destination prefetching is done to avoid repeated write-throughs on lines
C not already in L1.
C
C At least one of the src or dst pointers needs to be incremented rather
C than using indexing, so that there's somewhere to put the loop control
C without an AGI.  Incrementing one and not two lets us keep loop overhead
C to 2 cycles.  Choosing src as the incremented pointer avoids an AGI on
C the %ecx subtracts in the finishup code.
C
C The block of finishup code is almost as big as the main loop itself, which
C is unfortunate, but it's faster that way than with, say, rep movsl, by
C about 10 cycles for instance on P55.
C
C There's nothing to be gained from MMX on P55, since it can do only one
C movq load (or store) per cycle, so the throughput would be the same as the
C code here (and even then only if src and dst have the same alignment mod
C 8).

defframe(PARAM_SIZE,12)
defframe(PARAM_SRC, 8)
defframe(PARAM_DST, 4)

	TEXT
	ALIGN(8)
PROLOGUE(mpn_copyi)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %ecx
	movl	PARAM_DST, %edx

	pushl	%ebx	FRAME_pushl()
	pushl	%esi	FRAME_pushl()

	leal	(%edx,%ecx,4), %edx	C &dst[size], one past the last limb
	xorl	$-1, %ecx		C -size-1

	movl	PARAM_SRC, %esi
	addl	$8, %ecx		C -size+7

	jns	L(end)			C size <= 7, no full group of 8

	movl	-28(%edx,%ecx,4), %eax	C fetch destination cache line, dst[0]
	C NOTE(review): the purpose of this nop is not stated in the source,
	C presumably instruction pairing or loop alignment -- confirm.
	nop

L(top):
	C eax	scratch
	C ebx	scratch
	C ecx	counter, 7 minus limbs remaining, negative
	C edx	&dst[size], fixed, stores index back from it using ecx
	C esi	src, incrementing
	C edi
	C ebp

	C The load below only touches the last limb of the group of 8 about
	C to be written, the value is discarded (eax is reloaded next).
	movl	(%edx,%ecx,4), %eax	C fetch destination cache line
	addl	$8, %ecx		C sets the sign flag tested by js below

	C With ecx already advanced, displacement -60 addresses the first
	C limb of the current group and -32 the last.
	movl	(%esi), %eax		C read words pairwise
	movl	4(%esi), %ebx
	movl	%eax, -60(%edx,%ecx,4)	C store words pairwise
	movl	%ebx, -56(%edx,%ecx,4)

	movl	8(%esi), %eax
	movl	12(%esi), %ebx
	movl	%eax, -52(%edx,%ecx,4)
	movl	%ebx, -48(%edx,%ecx,4)

	movl	16(%esi), %eax
	movl	20(%esi), %ebx
	movl	%eax, -44(%edx,%ecx,4)
	movl	%ebx, -40(%edx,%ecx,4)

	movl	24(%esi), %eax
	movl	28(%esi), %ebx
	movl	%eax, -36(%edx,%ecx,4)
	movl	%ebx, -32(%edx,%ecx,4)

	C movl and leal do not modify flags, so js still sees the sign of
	C the addl at the head of the loop: negative means at least 8 more
	C limbs remain.
	leal	32(%esi), %esi
	js	L(top)


L(end):
	C ecx	0 to 7, representing respectively 7 to 0 limbs remaining
	C esi	src, next limb to read
	C edx	&dst[size], unchanged, stores index back from it using ecx

	subl	$4, %ecx		C 3 minus limbs remaining
	jns	L(no4)			C fewer than 4 limbs left

	movl	(%esi), %eax
	movl	4(%esi), %ebx
	movl	%eax, -12(%edx,%ecx,4)
	movl	%ebx, -8(%edx,%ecx,4)

	movl	8(%esi), %eax
	movl	12(%esi), %ebx
	movl	%eax, -4(%edx,%ecx,4)
	movl	%ebx, (%edx,%ecx,4)

	addl	$16, %esi
	addl	$4, %ecx		C again 3 minus limbs remaining
L(no4):

	subl	$2, %ecx		C 1 minus limbs remaining
	jns	L(no2)			C fewer than 2 limbs left

	movl	(%esi), %eax
	movl	4(%esi), %ebx
	movl	%eax, -4(%edx,%ecx,4)
	movl	%ebx, (%edx,%ecx,4)

	C The ecx update must come last so that the flags seen at L(no2)
	C describe ecx rather than esi.
	addl	$8, %esi
	addl	$2, %ecx		C again 1 minus limbs remaining
L(no2):

	C Flags come from the subl (branch taken) or the addl (fall through)
	C on ecx just above: zero means exactly one limb is left.
	jnz	L(done)

	movl	(%esi), %eax
	movl	%eax, -4(%edx,%ecx,4)	C risk of cache bank clash here

L(done):
	popl	%esi
	popl	%ebx

	ret

EPILOGUE()
