dnl  Intel Pentium-4 mpn_sub_n -- mpn subtraction.

dnl  Copyright 2001, 2002 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

dnl  NOTE(review): presumably supplies the m4 macros used in this file
dnl  (none of them are defined here) -- confirm against the build setup.
include(`../config.m4')


C P4 Willamette, Northwood: 4.0 cycles/limb if dst!=src1 and dst!=src2
C			    6.0 cycles/limb if dst==src1 or dst==src2
C P4 Prescott:		    >= 5 cycles/limb


C mp_limb_t mpn_sub_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
C                      mp_size_t size);
C mp_limb_t mpn_sub_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
C                       mp_size_t size, mp_limb_t carry);
C
C The main loop code is 2x unrolled so that the carry bit can alternate
C between mm0 and mm1.

dnl  Names for the incoming stack arguments.  PARAM_CARRY is read only by
dnl  the mpn_sub_nc entry point.
defframe(PARAM_CARRY,20)
defframe(PARAM_SIZE, 16)
defframe(PARAM_SRC2, 12)
defframe(PARAM_SRC1, 8)
defframe(PARAM_DST,  4)

dnl  re-use parameter space
dnl  The PARAM_SRC1 slot doubles as the save area for ebx; the code stores
dnl  ebx there only after PARAM_SRC1 has been loaded into eax.
define(SAVE_EBX,`PARAM_SRC1')

45	TEXT
46	ALIGN(8)
47
48PROLOGUE(mpn_sub_nc)
49deflit(`FRAME',0)
50
51	movd	PARAM_CARRY, %mm0
52	jmp	L(start_nc)
53
54EPILOGUE()
55
56	ALIGN(8)
57PROLOGUE(mpn_sub_n)
58deflit(`FRAME',0)
59	pxor	%mm0, %mm0
60L(start_nc):
61	movl	PARAM_SRC1, %eax
62	movl	%ebx, SAVE_EBX
63	movl	PARAM_SRC2, %ebx
64	movl	PARAM_DST, %edx
65	movl	PARAM_SIZE, %ecx
66
67	leal	(%eax,%ecx,4), %eax	C src1 end
68	leal	(%ebx,%ecx,4), %ebx	C src2 end
69	leal	(%edx,%ecx,4), %edx	C dst end
70	negl	%ecx			C -size
71
72L(top):
73	C eax	src1 end
74	C ebx	src2 end
75	C ecx	counter, limbs, negative
76	C edx	dst end
77	C mm0	carry bit
78
79	movd	(%eax,%ecx,4), %mm1
80	movd	(%ebx,%ecx,4), %mm2
81	psubq	%mm2, %mm1
82
83	psubq	%mm0, %mm1
84	movd	%mm1, (%edx,%ecx,4)
85
86	psrlq	$63, %mm1
87
88	addl	$1, %ecx
89	jz	L(done_mm1)
90
91	movd	(%eax,%ecx,4), %mm0
92	movd	(%ebx,%ecx,4), %mm2
93	psubq	%mm2, %mm0
94
95	psubq	%mm1, %mm0
96	movd	%mm0, (%edx,%ecx,4)
97
98	psrlq	$63, %mm0
99
100	addl	$1, %ecx
101	jnz	L(top)
102
103
104	movd	%mm0, %eax
105	movl	SAVE_EBX, %ebx
106	emms
107	ret
108
109L(done_mm1):
110	movd	%mm1, %eax
111	movl	SAVE_EBX, %ebx
112	emms
113	ret
114
115EPILOGUE()
116