copyd.asm revision 1.1.1.1
1dnl  AMD K6-2 mpn_copyd -- copy limb vector, decrementing.
2
3dnl  Copyright 2001, 2002 Free Software Foundation, Inc.
4dnl
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or
8dnl  modify it under the terms of the GNU Lesser General Public License as
9dnl  published by the Free Software Foundation; either version 3 of the
10dnl  License, or (at your option) any later version.
11dnl
12dnl  The GNU MP Library is distributed in the hope that it will be useful,
13dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
14dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15dnl  Lesser General Public License for more details.
16dnl
17dnl  You should have received a copy of the GNU Lesser General Public License
18dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
20include(`../config.m4')
21
22
23C K6-2: 1.0 cycles/limb
24
25
26C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size);
27C
28C The loop here is no faster than a rep movsl at 1.0 c/l, but it avoids a 30
29C cycle startup time, which amounts for instance to a 2x speedup at 15
30C limbs.
31C
32C If dst is 4mod8 the loop would be 1.17 c/l, but that's avoided by
33C processing one limb separately to make it aligned.  This and a final odd
34C limb are handled in a branch-free fashion, ending up re-copying if the
35C special case isn't needed.
36C
37C Alternatives:
38C
39C There used to be a big unrolled version of this, running at 0.56 c/l if
40C the destination was aligned, but that seemed rather excessive for the
41C relative importance of copyd.
42C
43C If the destination alignment is ignored and just left to run at 1.17 c/l
44C some code size and a fixed few cycles can be saved.  Considering how few
45C uses copyd finds perhaps that should be favoured.  The current code has
46C the attraction of being no slower than a basic rep movsl though.
47
48defframe(PARAM_SIZE,12)	C size, third argument of the C prototype (offset presumably from entry %esp -- confirm against defframe in config.m4)
49defframe(PARAM_SRC, 8)	C src, second argument
50defframe(PARAM_DST, 4)	C dst, first argument
51
52dnl  re-using parameter space: once size has been loaded into a register its slot holds the caller's %ebx
53define(SAVE_EBX,`PARAM_SIZE')
54
55	TEXT
56	ALIGN(16)
57
58PROLOGUE(mpn_copyd)
C Copy `size' 4-byte limbs from src to dst, working from the high limb down
C to limb 0.
C
C Register roles in the body:
C   eax  src pointer
C   edx  dst pointer
C   ecx  size-1 during setup, then the loop counter
C   ebx  scratch for the dst alignment test; the caller's value is parked
C        in SAVE_EBX (the PARAM_SIZE slot) and reloaded before returning
C   mm0  the data in flight
C
C size==0 returns straight away; size==1 and size==2 never enter the loop.
C For size>=3 the high limb is copied on its own first, then pairs of limbs
C are moved with movq, and finally limb 0 is (re)copied at L(one_more).
59deflit(`FRAME',0)
60
61	movl	PARAM_SIZE, %ecx	C ecx = size
62	movl	%ebx, SAVE_EBX		C size slot is free now that size is in ecx
63
64	movl	PARAM_SRC, %eax
65	movl	PARAM_DST, %edx
66
67	subl	$1, %ecx		C ecx = size-1; better code alignment than decl (and subl sets carry for the jb)
68	jb	L(zero)			C borrow: size was 0, nothing to copy, ebx and MMX untouched
69
70	jz	L(one_more)		C size was 1: only limb 0 to copy
71	leal	4(%edx,%ecx,4), %ebx	C ebx = &dst[size], one past the high limb
72
73Zdisp(	movd,	0,(%eax,%ecx,4), %mm0)	C high limb, src[size-1]
74Zdisp(	movd,	%mm0, 0,(%edx,%ecx,4))	C to dst[size-1]; Zdisp for good code alignment
75
76	cmpl	$1, %ecx
77	je	L(one_more)		C size was 2: high limb done, limb 0 remains
78
79	shrl	$2, %ebx
80	andl	$1, %ebx		C bit 2 of &dst[size]: 1 if dst[size-2] unaligned (not a multiple of 8)
81
82	subl	%ebx, %ecx		C if unaligned start one limb lower, so every movq store below is 8-byte aligned
83	nop				C code alignment
84
85L(top):
86	C eax	src
87	C ebx	(alignment flag, no longer needed)
88	C ecx	counter, index of the upper limb of the pair about to be copied
89	C edx	dst
90
91	movq	-4(%eax,%ecx,4), %mm0	C load src[ecx-1] and src[ecx]
92	subl	$2, %ecx
93
94	movq	%mm0, 4(%edx,%ecx,4)	C store the pair at dst[ecx+1], dst[ecx+2] (new ecx); flags from subl are still live
95	ja	L(top)			C continue while the decremented counter is > 0 (no borrow, not zero)
96
97
98L(one_more):
99	movd	(%eax), %mm0		C limb 0; stored a second time if the last loop pair already covered it
100	movd	%mm0, (%edx)
101
102	movl	SAVE_EBX, %ebx		C restore the caller's ebx
103	emms_or_femms			C macro defined outside this file; presumably emits emms or femms to leave MMX state
104L(zero):
105	ret
106
107EPILOGUE()
108