1dnl  Intel Pentium-4 mpn_addlsh1_n -- mpn x+2*y.
2
3dnl  Copyright 2001, 2002, 2003, 2004, 2006 Free Software Foundation, Inc.
4dnl
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or
8dnl  modify it under the terms of the GNU Lesser General Public License as
9dnl  published by the Free Software Foundation; either version 3 of the
10dnl  License, or (at your option) any later version.
11dnl
12dnl  The GNU MP Library is distributed in the hope that it will be useful,
13dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
14dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15dnl  Lesser General Public License for more details.
16dnl
17dnl  You should have received a copy of the GNU Lesser General Public License
18dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
20include(`../config.m4')
21
22
23C          cycles/limb (approx)
24C          dst!=src1,2  dst==src1  dst==src2
25C P4 m2:      4.5         ?7.25      ?6.75
26C P4 m3:      5.3         ?	     ?
27
28C mp_limb_t mpn_addlsh1_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
29C                          mp_size_t size);
30C
31C The slightly strange combination of indexing and pointer incrementing
32C that's used seems to work best.  Not sure why, but %ecx,4 with src1 and/or
33C src2 is a slowdown.
34C
35C The dependent chain is simply the paddq of x+2*y to the previous carry,
36C then psrlq to get the new carry.  That makes 4 c/l the target speed, which
37C is almost achieved for separate src/dst, but when src==dst the
38C write-combining anomalies slow it down.
39
40defframe(PARAM_SIZE, 16)
41defframe(PARAM_SRC2, 12)
42defframe(PARAM_SRC1, 8)
43defframe(PARAM_DST,  4)
44	C Slots follow the prototype order: dst=4, src1=8, src2=12, size=16 (presumably %esp-relative at entry; defframe is defined elsewhere).
45dnl  re-use parameter space: %ebx is saved in the src1 argument slot, so src1 must be loaded before SAVE_EBX is written
46define(SAVE_EBX,`PARAM_SRC1')
47
48	TEXT
49	ALIGN(8)
50
51PROLOGUE(mpn_addlsh1_n)
52deflit(`FRAME',0)
53	C dst[] = src1[] + 2*src2[] over size limbs; the carry-out (0, 1 or 2) is returned in %eax.
54	movl	PARAM_SRC1, %eax	C src1; read before its slot is reused below
55	movl	%ebx, SAVE_EBX		C SAVE_EBX is the PARAM_SRC1 slot
56
57	movl	PARAM_SRC2, %ebx	C src2
58	pxor	%mm0, %mm0		C initial carry
59
60	movl	PARAM_DST, %edx		C dst
61
62	movl	PARAM_SIZE, %ecx	C size, in limbs
63
64	leal	(%edx,%ecx,4), %edx	C dst end
65	negl	%ecx			C -size
66	C The loop tests %ecx only after a limb has been stored, so size must be >= 1.
67L(top):
68	C eax	src1, advancing one limb per iteration
69	C ebx	src2, advancing one limb per iteration
70	C ecx	counter, limbs, negative
71	C edx	dst end
72	C mm0	previous limb's 64-bit sum (zero on entry), carry in the high half
73
74	movd	(%eax), %mm1		C x = src1 limb
75	movd	(%ebx), %mm2		C y = src2 limb
76	psrlq	$32, %mm0		C carry in from the previous sum
77	leal	4(%eax), %eax		C src1++
78	leal	4(%ebx), %ebx		C src2++
79
80	paddq	%mm2, %mm1		C x + y
81	paddq	%mm2, %mm1		C x + 2*y
82
83	paddq	%mm1, %mm0		C x + 2*y + carry
84
85	movd	%mm0, (%edx,%ecx,4)	C store low 32 bits as the dst limb
86	addl	$1, %ecx		C step counter towards zero
87	jnz	L(top)			C loop until all limbs are done
88
89
90	psrlq	$32, %mm0		C carry out of the most significant limb
91	movl	SAVE_EBX, %ebx		C restore the %ebx saved at entry
92	movd	%mm0, %eax		C return value = carry
93	emms				C leave MMX state before returning
94	ret
95
96EPILOGUE()
97