1dnl  AMD K6 mpn_add/sub_n -- mpn addition or subtraction.
2
3dnl  Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
4dnl
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or
8dnl  modify it under the terms of the GNU Lesser General Public License as
9dnl  published by the Free Software Foundation; either version 3 of the
10dnl  License, or (at your option) any later version.
11dnl
12dnl  The GNU MP Library is distributed in the hope that it will be useful,
13dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
14dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15dnl  Lesser General Public License for more details.
16dnl
17dnl  You should have received a copy of the GNU Lesser General Public License
18dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
20include(`../config.m4')
21
22
23C K6: normal 3.25 cycles/limb, in-place 2.75 cycles/limb.
24
25
C Operation selection, done at m4 time.  Depending on which of
C OPERATION_add_n or OPERATION_sub_n is defined, M4_inst becomes adcl or
C sbbl, M4_function_n and M4_function_nc become the mpn_add_n/mpn_add_nc or
C mpn_sub_n/mpn_sub_nc names, and M4_description supplies the verb used in
C the interface comment.  With neither symbol defined, m4_error is invoked.
26ifdef(`OPERATION_add_n', `
27	define(M4_inst,        adcl)
28	define(M4_function_n,  mpn_add_n)
29	define(M4_function_nc, mpn_add_nc)
30	define(M4_description, add)
31',`ifdef(`OPERATION_sub_n', `
32	define(M4_inst,        sbbl)
33	define(M4_function_n,  mpn_sub_n)
34	define(M4_function_nc, mpn_sub_nc)
35	define(M4_description, subtract)
36',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n
37')')')
38
C NOTE(review): MULFUNC_PROLOGUE is defined outside this file; presumably it
C declares the set of functions this one source can be built into -- confirm
C against the included m4 definitions.
39MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
40
41
42C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
43C                          mp_size_t size);
44C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
45C	                      mp_size_t size, mp_limb_t carry);
46C
47C Calculate src1,size M4_description src2,size, and store the result in
48C dst,size.  The return value is the carry bit from the top of the result
49C (1 or 0).
50C
51C The _nc version accepts 1 or 0 for an initial carry into the low limb of
52C the calculation.  Note values other than 1 or 0 here will lead to garbage
53C results.
54C
55C Instruction decoding limits a normal dst=src1+src2 operation to 3 c/l, and
56C an in-place dst+=src to 2.5 c/l.  The unrolled loops have 1 cycle/loop of
57C loop control, which with 4 limbs/loop means an extra 0.25 c/l.
58
C Stack offsets of the arguments, relative to %esp.  FRAME starts at 0, which
C puts dst at 4(%esp), the first slot above the return address.
C NOTE(review): FRAME_pushl is defined outside this file; presumably it adds
C 4 to FRAME so that the PARAM_ macros stay correct after each pushl --
C confirm against the included m4 definitions.
59define(PARAM_CARRY, `FRAME+20(%esp)')
60define(PARAM_SIZE,  `FRAME+16(%esp)')
61define(PARAM_SRC2,  `FRAME+12(%esp)')
62define(PARAM_SRC1,  `FRAME+8(%esp)')
63define(PARAM_DST,   `FRAME+4(%esp)')
64deflit(`FRAME',0)
65
66dnl  minimum 5 because the unrolled code can't handle less
dnl  (the in-place loop runs size-1 rounded down to a multiple of 4 limbs,
dnl  and that count must be nonzero for its loop instruction to terminate
dnl  as intended)
67deflit(UNROLL_THRESHOLD, 5)
68
69	TEXT
70	ALIGN(32)
71
C M4_function_nc entry point.  Loads the carry argument into eax and joins
C M4_function_n at L(start), which lies just after the xorl that would
C otherwise zero eax.  Nothing has been pushed yet, so FRAME is still 0 and
C PARAM_CARRY addresses 20(%esp).
72PROLOGUE(M4_function_nc)
73	movl	PARAM_CARRY, %eax
74	jmp	L(start)
75EPILOGUE()
76
77
C M4_function_n entry point, shared with M4_function_nc through L(start).
C eax carries the initial carry: zero when entered here, or the carry
C argument when entered from M4_function_nc.  Sizes below UNROLL_THRESHOLD
C are handled by the one limb per pass loop at L(simple); anything larger
C branches to L(unroll).  ebx and edi are saved on the stack and restored
C before returning.  The return value in eax is the carry flag left by the
C final M4_inst, as 0 or 1.
78PROLOGUE(M4_function_n)
79	xorl	%eax, %eax
80L(start):
	C eax	initial carry
81	movl	PARAM_SIZE, %ecx
82	pushl	%ebx
83FRAME_pushl()
84
85	movl	PARAM_SRC1, %ebx
86	pushl	%edi
87FRAME_pushl()
88
89	movl	PARAM_SRC2, %edx
90	cmpl	$UNROLL_THRESHOLD, %ecx
91
	C movl leaves the flags alone, so jae still acts on the cmpl above
	C (unsigned size >= UNROLL_THRESHOLD).
92	movl	PARAM_DST, %edi
93	jae	L(unroll)
94
95
	C bit 0 of eax is shifted out into the carry flag
96	shrl	%eax		C initial carry flag
97
98	C offset 0x21 here, close enough to aligned
99L(simple):
100	C eax	scratch
101	C ebx	src1
102	C ecx	counter
103	C edx	src2
104	C esi
105	C edi	dst
106	C ebp
107	C
108	C The store to (%edi) could be done with a stosl; it'd be smaller
109	C code, but there's no speed gain and a cld would have to be added
110	C (per mpn/x86/README).
	C
	C The pointers are stepped with leal and the count with loop because
	C neither changes the carry flag, which has to survive from one
	C M4_inst to the next.
	C
	C NOTE(review): loop decrements ecx before testing it, so a size of
	C zero would not stop after zero limbs; size >= 1 is assumed here --
	C confirm callers guarantee it.
111
112	movl	(%ebx), %eax
113	leal	4(%ebx), %ebx
114
115	M4_inst	(%edx), %eax
116
117	movl	%eax, (%edi)
118	leal	4(%edi), %edi
119
120	leal	4(%edx), %edx
121	loop	L(simple)
122
123
	C eax is zeroed with movl rather than xorl, since the carry flag must
	C still be intact for the setc below.
124	movl	$0, %eax
125	popl	%edi
126
127	setc	%al
128
129	popl	%ebx
130	ret
131
132
133C -----------------------------------------------------------------------------
134L(unroll):
135	C eax	carry
136	C ebx	src1
137	C ecx	counter
138	C edx	src2
139	C esi
140	C edi	dst
141	C ebp
	C
	C Entered with size >= UNROLL_THRESHOLD.  Dispatches on operand
	C overlap: dst==src1 goes to L(inplace), and when built for addition
	C dst==src2 goes to L(inplace_reverse).  Otherwise execution falls
	C through to the three operand loop, four limbs per pass.
	C
	C esi is pushed without a FRAME_pushl; no PARAM_ macro is used beyond
	C this point.  pushl does not change the flags, so the je after it
	C still acts on the cmpl.
142
143	cmpl	%edi, %ebx
144	pushl	%esi
145
146	je	L(inplace)
147
148ifdef(`OPERATION_add_n',`
149	cmpl	%edi, %edx
150
151	je	L(inplace_reverse)
152')
153
	C esi = size mod 4, the limbs left over for the tail code.
	C ecx = size rounded down to a multiple of 4, the limbs for the loop.
154	movl	%ecx, %esi
155
156	andl	$-4, %ecx
157	andl	$3, %esi
158
	C Advance all three pointers past the limbs the loop will process,
	C then index them with a negated ecx that counts up to zero.
159	leal	(%ebx,%ecx,4), %ebx
160	leal	(%edx,%ecx,4), %edx
161	leal	(%edi,%ecx,4), %edi
162
163	negl	%ecx
	C bit 0 of eax becomes the initial carry flag
164	shrl	%eax
165
166	ALIGN(32)
167L(normal_top):
168	C eax	scratch
169	C ebx	src1
170	C ecx	counter, limbs, negative
171	C edx	src2
172	C esi
173	C edi	dst
174	C ebp
	C
	C ecx is advanced by 5 with leal, which is why the following
	C displacements are biased by -20 bytes, and loop then takes 1 back
	C off: a net step of 4 limbs per pass, ending when ecx reaches zero.
	C Neither leal nor loop changes the carry flag, so the M4_inst
	C operations chain across limbs and across passes.
175
176	movl	(%ebx,%ecx,4), %eax
177	leal	5(%ecx), %ecx
178	M4_inst	-20(%edx,%ecx,4), %eax
179	movl	%eax, -20(%edi,%ecx,4)
180
181	movl	4-20(%ebx,%ecx,4), %eax
182	M4_inst	4-20(%edx,%ecx,4), %eax
183	movl	%eax, 4-20(%edi,%ecx,4)
184
185	movl	8-20(%ebx,%ecx,4), %eax
186	M4_inst	8-20(%edx,%ecx,4), %eax
187	movl	%eax, 8-20(%edi,%ecx,4)
188
189	movl	12-20(%ebx,%ecx,4), %eax
190	M4_inst	12-20(%edx,%ecx,4), %eax
191	movl	%eax, 12-20(%edi,%ecx,4)
192
193	loop	L(normal_top)
194
195
	C ecx is zero here and esi holds 0 to 3 limbs still to do.  decl
	C sets the zero and sign flags for the branches but leaves the carry
	C flag as the loop left it: zero result means exactly one limb
	C remains, negative means none.
196	decl	%esi
197	jz	L(normal_finish_one)
198	js	L(normal_done)
199
200	C two or three more limbs
201
202	movl	(%ebx), %eax
203	M4_inst	(%edx), %eax
204	movl	%eax, (%edi)
205
206	movl	4(%ebx), %eax
207	M4_inst	4(%edx), %eax
208	decl	%esi
209	movl	%eax, 4(%edi)
210
	C the jz acts on the decl above; the movl between them does not
	C change the flags
211	jz	L(normal_done)
	C a third limb remains, at index 2
212	movl	$2, %ecx
213
214L(normal_finish_one):
	C ecx is 0 when reached by the jz straight after the loop, or 2 when
	C falling in from the code above
215	movl	(%ebx,%ecx,4), %eax
216	M4_inst	(%edx,%ecx,4), %eax
217	movl	%eax, (%edi,%ecx,4)
218
219L(normal_done):
	C Common exit, also a branch target of the in-place tail code.
	C eax is zeroed with movl rather than xorl so the carry flag is still
	C intact for setc, which produces the 0 or 1 return value.
220	popl	%esi
221	popl	%edi
222
223	movl	$0, %eax
224	popl	%ebx
225
226	setc	%al
227
228	ret
229
230
231C -----------------------------------------------------------------------------
232
C In-place operation, reached from L(unroll) when dst==src1.  When built for
C addition, dst==src2 enters at L(inplace_reverse), which makes src1 the
C source operand and then continues in the same code.
233ifdef(`OPERATION_add_n',`
234L(inplace_reverse):
235	C dst==src2
236
237	movl	%ebx, %edx
238')
239
240L(inplace):
241	C eax	initial carry
242	C ebx
243	C ecx	size
244	C edx	src
245	C esi
246	C edi	dst
247	C ebp
	C
	C M4_inst is applied straight to memory at dst, so ebx is no longer
	C needed as a pointer and is reused for source limbs.
	C
	C esi = (size-1) mod 4 and ecx = size-1 rounded down to a multiple of
	C 4.  The low src limb is fetched into ebx ahead of the loop, and each
	C pass of the loop ends by fetching the first limb for the next pass.
	C edx and edi are advanced past the looped limbs and indexed with a
	C negated ecx counting up to zero.
248
249	leal	-1(%ecx), %esi
250	decl	%ecx
251
252	andl	$-4, %ecx
253	andl	$3, %esi
254
255	movl	(%edx), %ebx		C src low limb
256	leal	(%edx,%ecx,4), %edx
257
258	leal	(%edi,%ecx,4), %edi
259	negl	%ecx
260
	C bit 0 of eax becomes the initial carry flag
261	shrl	%eax
262
263
264	ALIGN(32)
265L(inplace_top):
266	C eax	scratch
267	C ebx	next src limb
268	C ecx	counter, limbs, negative
269	C edx	src
270	C esi
271	C edi	dst
272	C ebp
	C
	C ecx is advanced by 5 with leal, hence the -20 byte bias on the
	C displacements after it, and loop takes 1 back off for a net step of
	C 4 limbs per pass.  Neither instruction changes the carry flag.
273
274	M4_inst	%ebx, (%edi,%ecx,4)
275
276	movl	4(%edx,%ecx,4), %eax
277	leal	5(%ecx), %ecx
278
279	M4_inst	%eax, 4-20(%edi,%ecx,4)
280
281	movl	8-20(%edx,%ecx,4), %eax
282	movl	12-20(%edx,%ecx,4), %ebx
283
284	M4_inst	%eax, 8-20(%edi,%ecx,4)
285	M4_inst	%ebx, 12-20(%edi,%ecx,4)
286
	C fetch the first limb for the next pass, or for the tail code after
	C the final pass
287	movl	16-20(%edx,%ecx,4), %ebx
288	loop	L(inplace_top)
289
290
291	C now %esi is 0 to 3 representing respectively 1 to 4 limbs more
	C ecx is zero, and ebx already holds the limb at (%edx)
292
293	M4_inst	%ebx, (%edi)
294
	C decl sets the zero and sign flags for the branches but leaves the
	C carry flag alone: zero result means one more limb, negative means
	C none
295	decl	%esi
296	jz	L(inplace_finish_one)
297	js	L(inplace_done)
298
299	C two or three more limbs
300
301	movl	4(%edx), %eax
302	movl	8(%edx), %ebx
303	M4_inst	%eax, 4(%edi)
304	M4_inst	%ebx, 8(%edi)
305
306	decl	%esi
307	movl	$2, %ecx
308
	C the jz acts on the decl above, the movl does not change the flags;
	C L(normal_done) has the same exit sequence as L(inplace_done)
309	jz	L(normal_done)
310
311L(inplace_finish_one):
	C ecx is 0 when reached by the jz straight after the loop, giving the
	C limb at offset 4, or 2 when falling in from above, giving the limb
	C at offset 12
312	movl	4(%edx,%ecx,4), %eax
313	M4_inst	%eax, 4(%edi,%ecx,4)
314
315L(inplace_done):
	C eax is zeroed with movl rather than xorl so the carry flag is still
	C intact for setc, which produces the 0 or 1 return value
316	popl	%esi
317	popl	%edi
318
319	movl	$0, %eax
320	popl	%ebx
321
322	setc	%al
323
324	ret
325
326EPILOGUE()
327