1dnl  Intel Pentium-4 mpn_add_n -- mpn addition.
2
3dnl  Copyright 2001, 2002 Free Software Foundation, Inc.
4dnl
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or
8dnl  modify it under the terms of the GNU Lesser General Public License as
9dnl  published by the Free Software Foundation; either version 3 of the
10dnl  License, or (at your option) any later version.
11dnl
12dnl  The GNU MP Library is distributed in the hope that it will be useful,
13dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
14dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15dnl  Lesser General Public License for more details.
16dnl
17dnl  You should have received a copy of the GNU Lesser General Public License
18dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
20include(`../config.m4')
21
22
23C P4 Willamette, Northwood: 4.0 cycles/limb if dst!=src1 and dst!=src2
24C			    6.0 cycles/limb if dst==src1 or dst==src2
25C P4 Prescott:		    >= 5 cycles/limb
26
27C mp_limb_t mpn_add_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
28C                      mp_size_t size);
29C mp_limb_t mpn_add_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
30C                       mp_size_t size, mp_limb_t carry);
31C
32C The 4 c/l achieved here isn't particularly good, but is better than 9 c/l
33C for a basic adc loop.
34
C Stack-argument slots, one per argument of the C prototypes above.  The
C offsets step by 4 in argument order: dst, src1, src2, size, and finally
C carry, which only mpn_add_nc reads.
C NOTE(review): defframe comes from the included config.m4 and is not
C visible here; presumably each offset is in bytes relative to %esp at
C function entry, adjusted by FRAME -- confirm.
35defframe(PARAM_CARRY,20)
36defframe(PARAM_SIZE, 16)
37defframe(PARAM_SRC2, 12)
38defframe(PARAM_SRC1, 8)
39defframe(PARAM_DST,  4)
40
41dnl  re-use parameter space
C The caller's %ebx is parked in the src1 argument slot rather than being
C pushed.  This is safe only because the code loads src1 into %eax before
C it stores %ebx into this slot.
42define(SAVE_EBX,`PARAM_SRC1')
43
44	TEXT
45	ALIGN(8)
46
C mpn_add_nc -- addition with a caller-supplied carry-in.
C
C This entry point only loads the carry argument into mm0 and then jumps
C to the start_nc label inside mpn_add_n, so both functions share one
C loop and one exit path.  There is no ret here: the return to the caller
C happens through the ret at the end of mpn_add_n.
C
C NOTE(review): FRAME is set to 0 presumably because nothing has been
C pushed, so the PARAM_ slots resolve to their entry-time offsets --
C confirm against the defframe definition.
47PROLOGUE(mpn_add_nc)
48deflit(`FRAME',0)
49
	C mm0 = carry-in.  movd loads 32 bits and zeroes the upper half of
	C mm0, which is the state the shared loop expects for its carry.
50	movd	PARAM_CARRY, %mm0
	C Continue in mpn_add_n just past its own pxor, which would
	C otherwise clear the carry loaded above.
51	jmp	L(start_nc)
52
53EPILOGUE()
54
55	ALIGN(8)
C mpn_add_n -- add the limb vectors src1 and src2, store the sum at dst.
C
C In:    stack arguments dst, src1, src2, size (see the PARAM_ slots)
C Out:   eax = carry out of the most significant limb
C Uses:  eax, ecx, edx as scratch; mm0, mm1, mm2; flags.  ebx is used as
C        the src2 pointer but is saved on entry and restored before ret.
C
C Limbs are 32 bits wide here: every access is a movd with an index scale
C of 4.  Each limb pair is added in a 64-bit MMX register so the carry
C appears in bit 32 and no adc is needed.
C
C NOTE(review): the loop tests its counter only after the body has run, so
C size is assumed to be at least 1 -- confirm callers guarantee this.
56PROLOGUE(mpn_add_n)
57deflit(`FRAME',0)
58
	C No carry-in for the plain entry point.
59	pxor	%mm0, %mm0
60
	C Shared entry: mpn_add_nc jumps here with mm0 = its carry argument.
61L(start_nc):
	C src1 must be fetched before ebx is saved, because SAVE_EBX is the
	C same stack slot as PARAM_SRC1.
62	movl	PARAM_SRC1, %eax
63	movl	%ebx, SAVE_EBX
64	movl	PARAM_SRC2, %ebx
65	movl	PARAM_DST, %edx
66	movl	PARAM_SIZE, %ecx
67
	C Point all three registers just past the end of their vectors and
	C negate the count, so one index register running from -size up to 0
	C addresses every operand and doubles as the loop counter.
68	leal	(%eax,%ecx,4), %eax	C src1 end
69	leal	(%ebx,%ecx,4), %ebx	C src2 end
70	leal	(%edx,%ecx,4), %edx	C dst end
71	negl	%ecx			C -size
72
73L(top):
74	C eax	src1 end
75	C ebx	src2 end
76	C ecx	counter, limbs, negative
77	C edx	dst end
78	C mm0	carry bit
79
	C mm1 = src1 limb + src2 limb, as a 64-bit sum of two zero-extended
	C 32-bit values.
80	movd	(%eax,%ecx,4), %mm1
81	movd	(%ebx,%ecx,4), %mm2
82	paddq	%mm2, %mm1
83
	C Add the limb sum to the running carry, then store the low 32 bits
	C as the result limb.
84	paddq	%mm1, %mm0
85	movd	%mm0, (%edx,%ecx,4)
86
	C Shift the upper half down: mm0 now holds only the carry for the
	C next limb.
87	psrlq	$32, %mm0
88
	C Advance to the next limb; the index reaches zero after the last one.
89	addl	$1, %ecx
90	jnz	L(top)
91
92
	C Return the final carry, restore the caller's ebx from the reused
	C argument slot, and leave MMX state with emms before returning.
93	movd	%mm0, %eax
94	movl	SAVE_EBX, %ebx
95	emms
96	ret
97
98EPILOGUE()
99