dnl  AMD K7 mpn_copyd -- copy limb vector, decrementing.

dnl  Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C    alignment dst/src, A=0mod8 N=4mod8
C       A/A   A/N   N/A   N/N
C K7    0.75  1.0   1.0   0.75


C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size);
C
C The various comments in mpn/x86/k7/copyi.asm apply here too.

dnl  Names for the three arguments.  The routine reads size, src and dst
dnl  through these, using the offsets 12, 8 and 4 given here.
defframe(PARAM_SIZE,12)
defframe(PARAM_SRC, 8)
defframe(PARAM_DST, 4)
deflit(`FRAME',0)

dnl  parameter space reused: each slot is only written after the argument
dnl  it held has been loaded into a register
define(SAVE_EBX,`PARAM_SIZE')
define(SAVE_ESI,`PARAM_SRC')

dnl  minimum 5 since the unrolled code can't handle less than 5: it may
dnl  copy one limb first to fix alignment, and the L(top) loop then always
dnl  copies a further four limbs before testing its counter
deflit(UNROLL_THRESHOLD, 5)

	TEXT
	ALIGN(32)
PROLOGUE(mpn_copyd)
	C Copy size limbs from src to dst, starting at the highest limb and
	C working down to limb 0.  Limbs are 4 bytes.  Nothing is returned.
	C
	C In:       PARAM_DST, PARAM_SRC, PARAM_SIZE
	C Kept:     ebx (held in the PARAM_SIZE slot for the whole routine),
	C           esi (held briefly in the PARAM_SRC slot)
	C Changed:  eax, ecx, edx, flags; also mm0 and mm1 on the unrolled path
	C
	C Sizes below UNROLL_THRESHOLD go through a plain one limb per pass
	C loop which uses no MMX.  Larger sizes use 8-byte movq copies and
	C execute emms before returning.

	movl	PARAM_SIZE, %ecx
	movl	%ebx, SAVE_EBX		C size slot is free now it is in ecx

	movl	PARAM_SRC, %eax
	movl	PARAM_DST, %edx

	cmpl	$UNROLL_THRESHOLD, %ecx
	jae	L(unroll)

	orl	%ecx, %ecx		C size==0 means nothing to copy
	jz	L(simple_done)

L(simple):
	C eax	src
	C ebx	scratch
	C ecx	counter
	C edx	dst
	C
	C this loop is 2 cycles/limb
	C
	C ecx counts size down to 1, and limb ecx-1 is copied on each pass

	movl	-4(%eax,%ecx,4), %ebx
	movl	%ebx, -4(%edx,%ecx,4)
	decl	%ecx
	jnz	L(simple)

L(simple_done):
	movl	SAVE_EBX, %ebx
	ret


L(unroll):
	C size >= UNROLL_THRESHOLD
	C
	C esi is borrowed as a second scratch register for the alignment
	C check only, and is reloaded before anything else happens.

	movl	%esi, SAVE_ESI		C src slot is free, src is in eax
	leal	(%eax,%ecx,4), %ebx	C src + 4*size
	leal	(%edx,%ecx,4), %esi	C dst + 4*size

	andl	%esi, %ebx		C bit 2 survives only if set in both
	movl	SAVE_ESI, %esi
	subl	$4, %ecx		C size-4

	testl	$4, %ebx   C testl to pad code closer to 16 bytes for L(top)
	jz	L(aligned)

	C both src and dst unaligned, process one limb to align them
	C (offset 12 with ecx=size-4 addresses limb size-1, the top limb)
	movl	12(%eax,%ecx,4), %ebx
	movl	%ebx, 12(%edx,%ecx,4)
	decl	%ecx
L(aligned):


	ALIGN(16)
L(top):
	C eax	src
	C ebx
	C ecx	counter, limbs
	C edx	dst
	C
	C Each pass copies the four limbs ecx to ecx+3 as two 8-byte moves.
	C ecx is reduced between the loads and the stores, so the stores
	C carry an extra 16 in their displacement to reach the same limbs.
	C jns acts on the flags left by subl.

	movq	8(%eax,%ecx,4), %mm0
	movq	(%eax,%ecx,4), %mm1
	subl	$4, %ecx
	movq	%mm0, 16+8(%edx,%ecx,4)
	movq	%mm1, 16(%edx,%ecx,4)
	jns	L(top)


	C now %ecx is -4 to -1 representing respectively 0 to 3 limbs remaining
	C (the low two bits of ecx are that remaining count)

	testb	$2, %cl
	jz	L(finish_not_two)

	C two or three limbs remain, copy the upper two of them
	movq	8(%eax,%ecx,4), %mm0
	movq	%mm0, 8(%edx,%ecx,4)
L(finish_not_two):

	testb	$1, %cl
	jz	L(done)

	C an odd limb remains, and it is always limb 0
	movl	(%eax), %ebx
	movl	%ebx, (%edx)

L(done):
	movl	SAVE_EBX, %ebx
	emms
	ret


EPILOGUE()
134