1dnl  AMD K6-2 mpn_copyd -- copy limb vector, decrementing.
2
3dnl  Copyright 2001, 2002 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33
34C K6-2: 1.0 cycles/limb
35
36
37C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size);
38C
39C The loop here is no faster than a rep movsl at 1.0 c/l, but it avoids a 30
40C cycle startup time, which amounts, for instance, to a 2x speedup at 15
41C limbs.
42C
43C If dst is 4mod8 the loop would be 1.17 c/l, but that's avoided by
44C processing one limb separately to make it aligned.  This and a final odd
45C limb are handled in a branch-free fashion, ending up re-copying if the
46C special case isn't needed.
47C
48C Alternatives:
49C
50C There used to be a big unrolled version of this, running at 0.56 c/l if
51C the destination was aligned, but that seemed rather excessive for the
52C relative importance of copyd.
53C
54C If the destination alignment is ignored and just left to run at 1.17 c/l,
55C some code size and a fixed few cycles can be saved.  Considering how few
56C uses copyd finds, perhaps that should be favoured.  The current code has
57C the attraction of being no slower than a basic rep movsl, though.
58
C Stack slots of the three mpn_copyd arguments (dst, src, size).  defframe
C is defined outside this file; its second argument is presumably the byte
C offset of the slot from the stack pointer at function entry.
59defframe(PARAM_SIZE,12)
60defframe(PARAM_SRC, 8)
61defframe(PARAM_DST, 4)
62
63dnl  re-using parameter space
dnl  SAVE_EBX names the same slot as PARAM_SIZE.  The function reads size
dnl  into ecx before it stores ebx there, and reloads ebx from it on exit.
64define(SAVE_EBX,`PARAM_SIZE')
65
C TEXT and ALIGN are defined outside this file; presumably they select the
C code section and align the function entry to 16 bytes.
66	TEXT
67	ALIGN(16)
68
C mpn_copyd (dst, src, size): copy size limbs from src to dst, working from
C the highest limb down to limb 0.
C
C Outline:
C   size==0  return at once; ebx has not been modified and no MMX register
C            has been used at that point
C   size==1  copy limb 0 at one_more
C   size>=2  copy the high limb on its own; size==2 then finishes at one_more
C   size>=3  copy two limbs per movq, starting one limb lower when needed so
C            that the movq stores go to addresses with bit 2 clear, then
C            finish with limb 0 at one_more
C
C Some limbs can be written twice (the high limb again by the first movq,
C limb 0 again at one_more).  As the header comment notes, this is what
C keeps the special cases branch-free.
69PROLOGUE(mpn_copyd)
70deflit(`FRAME',0)
71
72	movl	PARAM_SIZE, %ecx
73	movl	%ebx, SAVE_EBX		C size is in ecx now, so its slot can hold ebx
74
75	movl	PARAM_SRC, %eax
76	movl	PARAM_DST, %edx
77
	C ecx = size-1.  subl sets carry on borrow (decl would not), and the
	C jb below relies on that to detect size==0.
78	subl	$1, %ecx		C better code alignment than decl
79	jb	L(zero)
80
81	jz	L(one_more)		C size==1, flags are still those of the subl
82	leal	4(%edx,%ecx,4), %ebx	C ebx = dst + 4*size, just past the top limb
83
84Zdisp(	movd,	0,(%eax,%ecx,4), %mm0)	C high limb
85Zdisp(	movd,	%mm0, 0,(%edx,%ecx,4))	C Zdisp for good code alignment
86
87	cmpl	$1, %ecx
88	je	L(one_more)		C size==2, only limb 0 is left
89
	C ebx = bit 2 of dst+4*size.  dst[size-2] lies 8 bytes below that
	C address, so its address has the same bit 2.
90	shrl	$2, %ebx
91	andl	$1, %ebx		C 1 if dst[size-2] unaligned
92
	C ecx = size-1 if dst[size-2] is aligned, else size-2.  The first movq
	C stores to dst[ecx-1] and dst[ecx], so in the aligned case it writes
	C the high limb a second time.
93	subl	%ebx, %ecx
94	nop				C code alignment
95
96L(top):
97	C eax	src
98	C ebx
99	C ecx	counter
100	C edx	dst
101
102	movq	-4(%eax,%ecx,4), %mm0	C load src[ecx-1] and src[ecx]
103	subl	$2, %ecx
104
	C With ecx now 2 lower, this store covers the same two limb indices
	C as the load.  movq leaves the flags alone, so ja tests the subl:
	C the loop repeats while the counter is still above zero.
105	movq	%mm0, 4(%edx,%ecx,4)
106	ja	L(top)
107
108
	C Limb 0.  Reached directly for size 1 and 2.  After the loop it is
	C either the one limb still to do (counter ended at 0) or a repeat of
	C a limb the last movq already stored (counter ended at -1).
109L(one_more):
110	movd	(%eax), %mm0
111	movd	%mm0, (%edx)
112
113	movl	SAVE_EBX, %ebx		C restore the ebx saved on entry
	C emms_or_femms is defined outside this file; presumably it emits
	C emms or femms to leave MMX state before returning.
114	emms_or_femms
115L(zero):
116	ret
117
118EPILOGUE()
119