dnl  Intel Pentium mpn_copyd -- copy limb vector, decrementing.

dnl  Copyright 1996, 2001, 2002, 2006 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C P5: 1.25 cycles/limb


C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size);
C
C Copy size limbs from src to dst, reading and storing the limbs in order
C of decreasing address, highest limb first.
C NOTE(review): presumably the decrementing order exists so overlapping
C operands with dst above src are handled - confirm against the library
C manual.
C
C Arguments are taken from the stack at offsets 4, 8 and 12 from the stack
C pointer on entry (see the defframe lines below).  Nothing is returned.
C esi and edi are pushed on entry and popped on exit; eax, ecx and edx are
C overwritten.  A size of zero fails every tail test below and stores
C nothing.
C
C Structure: a main loop unrolled to 8 limbs per pass, followed by tail
C steps of 4, 2 and 1 limbs for the 0 to 7 limbs left over.
C
C See comments in copyi.asm.

defframe(PARAM_SIZE,12)
defframe(PARAM_SRC, 8)
defframe(PARAM_DST, 4)

	TEXT
	ALIGN(8)
PROLOGUE(mpn_copyd)
deflit(`FRAME',0)

	movl	PARAM_SRC, %eax
	movl	PARAM_SIZE, %ecx

	C Each push is paired with FRAME_pushl so the PARAM_DST reference
	C below can still be written relative to the entry frame.
	pushl	%esi	FRAME_pushl()
	pushl	%edi	FRAME_pushl()

	leal	-4(%eax,%ecx,4), %eax		C &src[size-1]
	movl	PARAM_DST, %edx

	subl	$7, %ecx			C size-7
	jle	L(end)				C size <= 7: tail code only

	movl	28-4(%edx,%ecx,4), %esi		C prefetch cache, dst[size-1]
	nop					C NOTE(review): presumably scheduling/alignment padding - confirm in copyi.asm

L(top):
	C eax	src, decrementing
	C ebx
	C ecx	counter, limbs
	C edx	dst
	C esi	scratch
	C edi	scratch
	C ebp
	C
	C On entry to each pass, with r limbs still to copy (r >= 8):
	C   ecx = r-7, eax = &src[r-1], edx = dst (never modified).
	C The load below touches dst[r-8], the lowest limb this pass stores;
	C the value loaded is discarded.
	C After the subl, ecx = r-15, so displacements 56 down to 28 address
	C dst[r-1] down to dst[r-8].

	movl	28-32(%edx,%ecx,4), %esi	C prefetch dst cache line
	subl	$8, %ecx			C sets the flags tested by jg below

	movl	(%eax), %esi			C read words pairwise
	movl	-4(%eax), %edi
	movl	%esi, 56(%edx,%ecx,4)		C store words pairwise
	movl	%edi, 52(%edx,%ecx,4)

	movl	-8(%eax), %esi
	movl	-12(%eax), %edi
	movl	%esi, 48(%edx,%ecx,4)
	movl	%edi, 44(%edx,%ecx,4)

	movl	-16(%eax), %esi
	movl	-20(%eax), %edi
	movl	%esi, 40(%edx,%ecx,4)
	movl	%edi, 36(%edx,%ecx,4)

	movl	-24(%eax), %esi
	movl	-28(%eax), %edi
	movl	%esi, 32(%edx,%ecx,4)
	movl	%edi, 28(%edx,%ecx,4)

	C leal, not subl: the flags from "subl $8, %ecx" must reach the jg
	C intact.  None of the movl instructions in between alter them either.
	leal	-32(%eax), %eax
	jg	L(top)				C loop while ecx > 0, ie. 8 or more limbs remain


L(end):
	C ecx	-7 to 0, representing respectively 0 to 7 limbs remaining
	C eax	src end
	C edx	dst, next location to store
	C
	C With r limbs remaining, ecx = r-7 and eax = &src[r-1].

	addl	$4, %ecx			C ecx = r-3, positive when r >= 4
	jle	L(no4)

	C Displacements 8, 4, 0, -4 address dst[r-1] down to dst[r-4].
	movl	(%eax), %esi
	movl	-4(%eax), %edi
	movl	%esi, 8(%edx,%ecx,4)
	movl	%edi, 4(%edx,%ecx,4)

	movl	-8(%eax), %esi
	movl	-12(%eax), %edi
	movl	%esi, (%edx,%ecx,4)
	movl	%edi, -4(%edx,%ecx,4)

	subl	$16, %eax			C src moved down 4 limbs
	subl	$4, %ecx			C ecx = r-3 again, for the reduced r
L(no4):
	C Either way in, ecx = r-3 with r now 0 to 3.

	addl	$2, %ecx			C ecx = r-1, positive when r >= 2
	jle	L(no2)

	C Displacements 0 and -4 address dst[r-1] and dst[r-2].
	movl	(%eax), %esi
	movl	-4(%eax), %edi
	movl	%esi, (%edx,%ecx,4)
	movl	%edi, -4(%edx,%ecx,4)

	subl	$8, %eax			C src moved down 2 limbs
	subl	$2, %ecx			C ecx = r-1 again, for the reduced r
L(no2):
	C ecx = r-1 with r now 0 or 1.  The flags come from "addl $2" when
	C the jle above was taken, otherwise from "subl $2"; in both cases
	C zero means exactly one limb is left.

	jnz	L(done)

	C Last limb: eax = &src[0], edx = &dst[0].  ecx is free to reuse.
	movl	(%eax), %ecx
	movl	%ecx, (%edx)	C risk of cache bank clash here

L(done):
	popl	%edi
	popl	%esi

	ret

EPILOGUE()
