1dnl  Pentium-4 mpn_copyd -- copy limb vector, decrementing.
2dnl
3
4dnl  Copyright 1999, 2000, 2001 Free Software Foundation, Inc.
5dnl
6dnl  This file is part of the GNU MP Library.
7dnl
8dnl  The GNU MP Library is free software; you can redistribute it and/or
9dnl  modify it under the terms of the GNU Lesser General Public License as
10dnl  published by the Free Software Foundation; either version 3 of the
11dnl  License, or (at your option) any later version.
12dnl
13dnl  The GNU MP Library is distributed in the hope that it will be useful,
14dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
15dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16dnl  Lesser General Public License for more details.
17dnl
18dnl  You should have received a copy of the GNU Lesser General Public License
19dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
20
21
22dnl  The std/rep/movsl/cld is very slow for small blocks on pentium4.  Its
23dnl  startup time seems to be about 165 cycles.  It then needs 2.6 c/l.
24dnl  We therefore use an open-coded 2 c/l copying loop.
25
26dnl  Ultimately, we may want to use 64-bit movq or 128-bit movdqu in some
27dnl  nifty unrolled arrangement.  Clearly, that could reach much higher
28dnl  speeds, at least for large blocks.
29
30include(`../config.m4')
31
32
dnl  Incoming arguments, listed by ascending frame offset.  The numbers are
dnl  the offsets handed to defframe (presumably bytes from the stack pointer
dnl  at entry, adjusted by FRAME -- confirm against the defframe definition
dnl  pulled in through config.m4).
defframe(PARAM_DST,  4)
defframe(PARAM_SRC,  8)
defframe(PARAM_SIZE, 12)

	TEXT
	ALIGN(8)
39
PROLOGUE(mpn_copyd)
deflit(`FRAME',0)

dnl  Copy PARAM_SIZE limbs of 4 bytes each from PARAM_SRC to PARAM_DST,
dnl  taking the highest-indexed limb first and finishing with limb 0.
dnl  A size of zero copies nothing.
dnl
dnl  Register roles:
dnl    eax   source base pointer
dnl    edx   destination base pointer
dnl    ecx   number of the limb being moved, runs from size-1 down to -1
dnl    ebx   the limb in transit
dnl
dnl  ebx is kept intact for the caller without any push or pop: once the
dnl  size has been read into ecx, the PARAM_SIZE stack slot is reused to
dnl  hold the old ebx until the function is about to return.

	movl	PARAM_DST, %edx
	movl	PARAM_SRC, %eax
	movl	PARAM_SIZE, %ecx

dnl  The size is now in ecx, so its stack slot can hold the saved ebx.
	movl	%ebx, PARAM_SIZE

dnl  Turn the count into the number of the top limb.  A negative result
dnl  means there was nothing to copy.
	addl	$-1, %ecx
	js	L(done)

dnl  One limb per pass.  The sign flag left by the addl doubles as the loop
dnl  test, so the pass for limb 0 is the last one.
L(top):
	movl	(%eax,%ecx,4), %ebx
	movl	%ebx, (%edx,%ecx,4)
	addl	$-1, %ecx
	jns	L(top)

L(done):
dnl  Bring back the ebx value parked in the PARAM_SIZE slot.
	movl	PARAM_SIZE, %ebx
	ret

EPILOGUE()
62