1dnl  AMD K7 mpn_add_n/mpn_sub_n -- mpn add or subtract.
2
3dnl  Copyright 1999, 2000, 2001, 2002, 2003 Free Software Foundation, Inc.
4dnl
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or
8dnl  modify it under the terms of the GNU Lesser General Public License as
9dnl  published by the Free Software Foundation; either version 3 of the
10dnl  License, or (at your option) any later version.
11dnl
12dnl  The GNU MP Library is distributed in the hope that it will be useful,
13dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
14dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15dnl  Lesser General Public License for more details.
16dnl
17dnl  You should have received a copy of the GNU Lesser General Public License
18dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
20include(`../config.m4')
21
22
23C K7: 1.64 cycles/limb (at 16 limbs/loop).
24
25
26
dnl  UNROLL_COUNT is the number of limbs handled by one complete pass of the
dnl  unrolled loop at L(top).  The loop body is generated CHUNK_COUNT (2)
dnl  limbs at a time, so the value must be even.
dnl  NOTE(review): UNROLL_LOG2, UNROLL_MASK and UNROLL_BYTES used further down
dnl  are not defined in this file; presumably they are derived from
dnl  UNROLL_COUNT by the included m4 definitions -- confirm there.
dnl
27dnl  K7: UNROLL_COUNT cycles/limb
28dnl           8           1.9
29dnl          16           1.64
30dnl          32           1.7
31dnl          64           2.0
32dnl  Maximum possible with the current code is 64.
33
34deflit(UNROLL_COUNT, 16)
35
36
dnl  Select the operation from whichever of OPERATION_add_n or
dnl  OPERATION_sub_n is defined when this file is processed:
dnl    M4_inst         carry propagating instruction applied to every limb
dnl    M4_function_n   name of the entry point without a carry-in
dnl    M4_function_nc  name of the entry point with a carry-in
dnl    M4_description  word used in the descriptive comment further down
dnl  m4_error is invoked when neither symbol is defined.
37ifdef(`OPERATION_add_n', `
38	define(M4_inst,        adcl)
39	define(M4_function_n,  mpn_add_n)
40	define(M4_function_nc, mpn_add_nc)
41	define(M4_description, add)
42',`ifdef(`OPERATION_sub_n', `
43	define(M4_inst,        sbbl)
44	define(M4_function_n,  mpn_sub_n)
45	define(M4_function_nc, mpn_sub_nc)
46	define(M4_description, subtract)
47',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n
48')')')
49
dnl  NOTE(review): MULFUNC_PROLOGUE is defined outside this file; it presumably
dnl  announces the entry points this one source can provide -- confirm.
50MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
51
52
53C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
54C                         mp_size_t size);
55C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
56C	                   mp_size_t size, mp_limb_t carry);
57C
58C Calculate src1,size M4_description src2,size, and store the result in
59C dst,size.  The return value is the carry bit from the top of the result (1
60C or 0).
61C
62C The _nc version accepts 1 or 0 for an initial carry into the low limb of
63C the calculation.  Note values other than 1 or 0 here will lead to garbage
64C results.
65C
66C This code runs at 1.64 cycles/limb, which might be the best possible with
67C plain integer operations.  Each limb is 2 loads and 1 store, any 2 of
68C which can be done each cycle, giving a theoretical lower bound of 1.5 c/l.
69
70dnl  Must have UNROLL_THRESHOLD >= 2, since the unrolled loop can't handle 1.
dnl  Sizes below UNROLL_THRESHOLD use the simple loop, sizes at or above it
dnl  use the unrolled loop.  The PIC and non-PIC builds currently share the
dnl  same value.
dnl  NOTE(review): the two branches are presumably kept apart so each build can
dnl  be tuned on its own -- confirm before merging them.
71ifdef(`PIC',`
72deflit(UNROLL_THRESHOLD, 8)
73',`
74deflit(UNROLL_THRESHOLD, 8)
75')
76
dnl  Incoming stack arguments, as offsets from esp at function entry (the
dnl  return address sits at offset 0).  They follow the C prototype order dst,
dnl  src1, src2, size, with carry only read by the nc entry point.
77defframe(PARAM_CARRY,20)
78defframe(PARAM_SIZE, 16)
79defframe(PARAM_SRC2, 12)
80defframe(PARAM_SRC1, 8)
81defframe(PARAM_DST,  4)
82
dnl  Register save slots, located in the STACK_SPACE bytes that the code
dnl  allocates below the entry esp.
83defframe(SAVE_EBP, -4)
84defframe(SAVE_ESI, -8)
85defframe(SAVE_EBX, -12)
86defframe(SAVE_EDI, -16)
87deflit(STACK_SPACE, 16)
88
89	TEXT
90	ALIGN(32)
dnl  NOTE(review): FRAME presumably tells defframe how far esp has moved since
dnl  entry; both macros are defined outside this file -- confirm there.
91deflit(`FRAME',0)
92
93PROLOGUE(M4_function_nc)
	C Entry with a carry-in argument.  Loads the carry parameter into
	C eax, where L(start) expects the initial carry, and joins the code
	C of M4_function_n.  The carry must be 1 or 0.
94	movl	PARAM_CARRY, %eax
95	jmp	L(start)
96EPILOGUE()
97
98PROLOGUE(M4_function_n)
	C Entry without a carry-in argument: eax is cleared and execution
	C falls into L(start), which M4_function_nc also jumps to with eax
	C holding its carry parameter.
	C
	C In:   PARAM_DST, PARAM_SRC1, PARAM_SRC2, PARAM_SIZE on the stack
	C Out:  eax = carry (or borrow) out of the top limb, 1 or 0
	C Uses: ecx and edx freely; ebx, esi, edi and ebp are saved in the
	C       frame and restored on each path that touches them.  The
	C       unrolled path also overwrites the PARAM_SIZE stack slot.
	C
	C Sizes below UNROLL_THRESHOLD go through the one limb per pass
	C loop at L(simple), everything else through the unrolled loop
	C reached from L(unroll).  Both loops are bottom-tested, so size
	C must be at least 1.
99
100	xorl	%eax, %eax	C carry
101L(start):
	C eax	carry-in, 1 or 0
102	movl	PARAM_SIZE, %ecx
103	subl	$STACK_SPACE, %esp
104deflit(`FRAME',STACK_SPACE)
105
106	movl	%edi, SAVE_EDI
107	movl	%ebx, SAVE_EBX
	C The flags from this compare are consumed by the jae below, the
	C movl instructions in between leave them alone.
108	cmpl	$UNROLL_THRESHOLD, %ecx
109
110	movl	PARAM_SRC2, %edx
111	movl	PARAM_SRC1, %ebx
112	jae	L(unroll)
113
	C Small size.  Point all three pointers at the end of their
	C operands and run a negative index up to zero.
114	movl	PARAM_DST, %edi
115	leal	(%ebx,%ecx,4), %ebx
116	leal	(%edx,%ecx,4), %edx
117
118	leal	(%edi,%ecx,4), %edi
119	negl	%ecx
120	shrl	%eax		C carry-in bit into the carry flag
121
122	C This loop is in a single 16 byte code block already, so no
123	C alignment necessary.
124L(simple):
125	C eax	scratch
126	C ebx	src1 (end of operand)
127	C ecx	counter, negative, counting up to zero
128	C edx	src2 (end of operand)
129	C esi
130	C edi	dst (end of operand)
131	C ebp
132
133	movl	(%ebx,%ecx,4), %eax
134	M4_inst	(%edx,%ecx,4), %eax
135	movl	%eax, (%edi,%ecx,4)
	C incl leaves the carry flag alone, so the carry from M4_inst
	C passes on to the next limb.
136	incl	%ecx
137	jnz	L(simple)
138
	C movl rather than xorl, to keep the final carry for the setc.
139	movl	$0, %eax
140	movl	SAVE_EDI, %edi
141
142	movl	SAVE_EBX, %ebx
143	setc	%al
144	addl	$STACK_SPACE, %esp
145
146	ret
147
148
149C -----------------------------------------------------------------------------
150	C This is at 0x55, close enough to aligned.
151L(unroll):
	C eax	carry-in, 1 or 0
	C ebx	src1
	C ecx	size
	C edx	src2
	C
	C The loop works on pairs of limbs.  An odd size leaves one limb
	C over, which is handled after the loop.  The first pass through
	C the loop is entered part way in, by a computed jump, so that the
	C remaining passes are all complete ones.  This assumes UNROLL_MASK
	C is UNROLL_COUNT-1 and UNROLL_LOG2 is its log2, both of which are
	C defined outside this file.
152deflit(`FRAME',STACK_SPACE)
153	movl	%ebp, SAVE_EBP
154	andl	$-2, %ecx		C size low bit masked out
155	andl	$1, PARAM_SIZE		C size low bit kept
156
157	movl	%ecx, %edi
158	decl	%ecx
159	movl	PARAM_DST, %ebp
160
161	shrl	$UNROLL_LOG2, %ecx	C number of loop passes, less one
162	negl	%edi
163	movl	%esi, SAVE_ESI
164
165	andl	$UNROLL_MASK, %edi	C limbs to skip in the first pass
166
167ifdef(`PIC',`
	C The call pushes the address of L(here), which L(pic_calc) uses
	C to form the run-time address of the entry point in esi.
168	call	L(pic_calc)
169L(here):
170',`
171	leal	L(entry) (%edi,%edi,8), %esi	C 9 code bytes per limb
172')
173	negl	%edi
174	shrl	%eax		C carry-in bit into the carry flag, eax now zero
175
	C Step the pointers back by the skipped limbs, so the displacements
	C at the entry point address limb 0.  With UNROLL_BYTES of 256 the
	C pointers are also biased by 128, which keeps every displacement
	C in the loop within a signed byte.  leal and jmp leave the carry
	C flag intact.
176	leal	ifelse(UNROLL_BYTES,256,128) (%ebx,%edi,4), %ebx
177	leal	ifelse(UNROLL_BYTES,256,128) (%edx,%edi,4), %edx
178	leal	ifelse(UNROLL_BYTES,256,128) (%ebp,%edi,4), %edi
179
180	jmp	*%esi
181
182
183ifdef(`PIC',`
184L(pic_calc):
185	C See mpn/x86/README about old gas bugs
	C esi = L(entry) + 9*edi, using the return address on the stack as
	C the run-time address of L(here).
186	leal	(%edi,%edi,8), %esi
187	addl	$L(entry)-L(here), %esi
188	addl	(%esp), %esi
189	ret_internal
190')
191
192
193C -----------------------------------------------------------------------------
194	ALIGN(32)
195L(top):
196	C eax	zero
197	C ebx	src1
198	C ecx	counter
199	C edx	src2
200	C esi	scratch (was computed jump)
201	C edi	dst
202	C ebp	scratch
203
	C edx is advanced here rather than at the bottom, so it is not
	C stepped on the initial computed entry and is one UNROLL_BYTES
	C step behind ebx and edi once the loop exits.
204	leal	UNROLL_BYTES(%edx), %edx
205
206L(entry):
207deflit(CHUNK_COUNT, 2)
208forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, `
209	deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
210	deflit(`disp1', eval(disp0 + 4))
211
212Zdisp(	movl,	disp0,(%ebx), %esi)
213	movl	disp1(%ebx), %ebp
214Zdisp(	M4_inst,disp0,(%edx), %esi)
215Zdisp(	movl,	%esi, disp0,(%edi))
216	M4_inst	disp1(%edx), %ebp
217	movl	%ebp, disp1(%edi)
218')
219
	C decl and leal leave the carry flag alone.  ecx goes negative
	C after the last pass.
220	decl	%ecx
221	leal	UNROLL_BYTES(%ebx), %ebx
222	leal	UNROLL_BYTES(%edi), %edi
223	jns	L(top)
224
225
	C Tail.  PARAM_SIZE now holds only the low bit of the size, so esi
	C goes negative for an even size and the odd limb code is skipped.
	C movl and decl keep the carry flag from the loop.
226	movl	PARAM_SIZE, %esi
227	movl	SAVE_EBP, %ebp
228	movl	$0, %eax
229
230	decl	%esi
231	js	L(even)
232
	C One limb left over.  ebx and edi still carry the 128 byte bias
	C applied before the loop when UNROLL_BYTES is 256, and edx is in
	C addition one UNROLL_BYTES step behind, so the displacements here
	C compensate for both.  For any other UNROLL_BYTES they reduce to
	C 0, UNROLL_BYTES and 0 respectively.
233	movl	ifelse(UNROLL_BYTES,256,-128)(%ebx), %ecx
234	M4_inst	eval(UNROLL_BYTES ifelse(UNROLL_BYTES,256,-128))(%edx), %ecx
235	movl	%ecx, ifelse(UNROLL_BYTES,256,-128)(%edi)
236L(even):
237
238	movl	SAVE_EDI, %edi
239	movl	SAVE_EBX, %ebx
240	setc	%al		C eax was zeroed above, so eax = carry out
241
242	movl	SAVE_ESI, %esi
243	addl	$STACK_SPACE, %esp
244
245	ret
246
247EPILOGUE()
248