aorsmul_1.asm revision 1.1.1.2
1dnl  Intel P6 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple.
2
3dnl  Copyright 1999, 2000, 2001, 2002, 2005 Free Software Foundation, Inc.
4dnl
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or
8dnl  modify it under the terms of the GNU Lesser General Public License as
9dnl  published by the Free Software Foundation; either version 3 of the
10dnl  License, or (at your option) any later version.
11dnl
12dnl  The GNU MP Library is distributed in the hope that it will be useful,
13dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
14dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15dnl  Lesser General Public License for more details.
16dnl
17dnl  You should have received a copy of the GNU Lesser General Public License
18dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
20include(`../config.m4')
21
22
23C			    cycles/limb
24C P5
25C P6 model 0-8,10-12		 6.44
26C P6 model 9  (Banias)		 6.15
27C P6 model 13 (Dothan)		 6.11
28C P4 model 0  (Willamette)
29C P4 model 1  (?)
30C P4 model 2  (Northwood)
31C P4 model 3  (Prescott)
32C P4 model 4  (Nocona)
33C AMD K6
34C AMD K7
35C AMD K8
36
37
38dnl  P6 UNROLL_COUNT cycles/limb
39dnl          8           6.7
40dnl         16           6.35
41dnl         32           6.3
42dnl         64           6.3
43dnl  Maximum possible with the current code is 64.
dnl
dnl  UNROLL_COUNT is the number of limbs handled by one pass of the unrolled
dnl  loop at L(top): the forloop there emits UNROLL_COUNT/CHUNK_COUNT chunks
dnl  of CHUNK_COUNT (2) limbs each.
dnl
dnl  NOTE(review): UNROLL_LOG2, UNROLL_MASK and UNROLL_BYTES are used further
dnl  down but are not defined in this file; presumably the included m4
dnl  definitions derive them from UNROLL_COUNT -- confirm.  The code special
dnl  cases UNROLL_BYTES equal to 256 by biasing pointers and displacements by
dnl  128, which presumably is what makes 64 the largest usable count.
44
45deflit(UNROLL_COUNT, 16)
46
47
dnl  Select the add or the subtract flavour when the file is run through m4.
dnl  If OPERATION_addmul_1 is defined the addmul names are used, otherwise if
dnl  OPERATION_submul_1 is defined the submul names are used, otherwise
dnl  m4_error is invoked with the message below.
dnl
dnl    M4_inst         instruction that combines a product word with dst
dnl    M4_function_1   name given to the entry point without a carry argument
dnl    M4_function_1c  name given to the entry point with a carry argument
dnl    M4_description, M4_desc_retval
dnl                    wording; in this file they appear only in comments
48ifdef(`OPERATION_addmul_1', `
49	define(M4_inst,        addl)
50	define(M4_function_1,  mpn_addmul_1)
51	define(M4_function_1c, mpn_addmul_1c)
52	define(M4_description, add it to)
53	define(M4_desc_retval, carry)
54',`ifdef(`OPERATION_submul_1', `
55	define(M4_inst,        subl)
56	define(M4_function_1,  mpn_submul_1)
57	define(M4_function_1c, mpn_submul_1c)
58	define(M4_description, subtract it from)
59	define(M4_desc_retval, borrow)
60',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
61')')')
62
dnl  Lists the four function names this source can produce.
dnl  NOTE(review): MULFUNC_PROLOGUE is defined outside this file; presumably
dnl  the build reads this line to learn which entry points are available
dnl  here -- confirm.
63MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)
64
65
66C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
67C                            mp_limb_t mult);
68C mp_limb_t M4_function_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
69C                             mp_limb_t mult, mp_limb_t carry);
70C
71C Calculate src,size multiplied by mult and M4_description dst,size.
72C Return the M4_desc_retval limb from the top of the result.
73C
74C This code is pretty much the same as the K6 code.  The unrolled loop is
75C the same, but there are just a few scheduling tweaks in the setups and the
76C simple loop.
77C
78C A number of variations have been tried for the unrolled loop, with one or
79C two carries, and with loads scheduled earlier, but nothing faster than 6
80C cycles/limb has been found.
81
dnl  Sizes of at least UNROLL_THRESHOLD limbs take the unrolled path at
dnl  L(unroll); smaller sizes use the one limb per iteration L(simple) loop.
dnl  The PIC and non-PIC builds currently use the same value, 5.
dnl  NOTE(review): the ifdef is presumably kept so that the two builds can be
dnl  tuned separately -- confirm before collapsing it.
82ifdef(`PIC',`
83deflit(UNROLL_THRESHOLD, 5)
84',`
85deflit(UNROLL_THRESHOLD, 5)
86')
87
dnl  Stack offsets of the incoming arguments, in the order of the C
dnl  prototypes in the comment further up: dst, src, size, mult and, for the
dnl  entry point with a carry argument only, carry.
dnl  NOTE(review): defframe is defined outside this file; presumably it
dnl  combines each offset with the current FRAME value so that the names stay
dnl  valid after registers have been pushed -- confirm.
88defframe(PARAM_CARRY,     20)
89defframe(PARAM_MULTIPLIER,16)
90defframe(PARAM_SIZE,      12)
91defframe(PARAM_SRC,       8)
92defframe(PARAM_DST,       4)
93
94	TEXT
95	ALIGN(32)
96
C M4_function_1c: entry point taking an explicit initial carry limb.
C
C Saves %ebx, loads the carry argument into it and jumps to L(start_nc)
C inside M4_function_1, which expects the initial carry in %ebx.  All the
C remaining work, including the return, is done by that shared code.
C
C The deflit of FRAME records that one register has been pushed so far.
C NOTE(review): presumably FRAME is what keeps the PARAM_* operands pointing
C at the arguments once %esp has moved -- confirm against defframe.
97PROLOGUE(M4_function_1c)
98	pushl	%ebx
99deflit(`FRAME',4)
100	movl	PARAM_CARRY, %ebx	C ebx = initial carry from the caller
101	jmp	L(start_nc)		C continue in the shared code
102EPILOGUE()
103
C M4_function_1: entry point without a carry argument.  It starts with a
C zero carry in %ebx and from L(start_nc) on is shared with M4_function_1c.
C
C The setup saves %ebx, %esi, %edi and %ebp, loads size, src, dst and the
C multiplier into registers, and sends sizes of UNROLL_THRESHOLD limbs or
C more to L(unroll).  Smaller sizes fall into L(simple), which handles one
C limb per iteration and returns the final carry or borrow limb in %eax.
C
C The L(simple) body runs before the counter is tested, so size is assumed
C to be at least 1; a zero size is not checked for here.
104PROLOGUE(M4_function_1)
105	push	%ebx
106deflit(`FRAME',4)
107	xorl	%ebx, %ebx	C initial carry
108
109L(start_nc):
110	movl	PARAM_SIZE, %ecx
111	pushl	%esi
112deflit(`FRAME',8)
113
114	movl	PARAM_SRC, %esi
115	pushl	%edi
116deflit(`FRAME',12)
117
118	movl	PARAM_DST, %edi
119	pushl	%ebp
120deflit(`FRAME',16)
121	cmpl	$UNROLL_THRESHOLD, %ecx
122
123	movl	PARAM_MULTIPLIER, %ebp	C mov leaves the cmpl flags alone
124	jae	L(unroll)		C unsigned size >= UNROLL_THRESHOLD
125
126
127	C simple loop
128	C this is offset 0x22, so close enough to aligned
129L(simple):
130	C eax	scratch
131	C ebx	carry
132	C ecx	counter
133	C edx	scratch
134	C esi	src
135	C edi	dst
136	C ebp	multiplier
137
138	movl	(%esi), %eax
139	addl	$4, %edi		C dst advanced early, hence the -4 below
140
141	mull	%ebp			C edx:eax = src limb * multiplier
142
143	addl	%ebx, %eax		C add the incoming carry to the low word
144	adcl	$0, %edx		C and propagate into the high word
145
146	M4_inst	%eax, -4(%edi)		C add to or subtract from the dst limb
147	movl	%edx, %ebx		C mov keeps CF from M4_inst
148
149	adcl	$0, %ebx		C next carry = high word + that CF
150	decl	%ecx			C sets ZF for the jnz below
151
152	leal	4(%esi), %esi		C advance src; lea leaves ZF alone
153	jnz	L(simple)
154
155
156	popl	%ebp
157	popl	%edi
158
159	popl	%esi
160	movl	%ebx, %eax		C return the final carry limb
161
162	popl	%ebx
163	ret
164
165
166
167C------------------------------------------------------------------------------
168C VAR_JUMP holds the computed jump temporarily because there are not
169C enough registers when doing the mul for the initial two carry limbs.
170C
171C The add/adc for the initial carry in %ebx is necessary only for the
172C mpn_add/submul_1c entry points.  Duplicating the startup code to
173C eliminate this for the plain mpn_add/submul_1 doesn't seem like a good
174C idea.
175
176dnl  overlapping with parameters already fetched
dnl  VAR_COUNTER reuses the PARAM_SIZE stack slot and VAR_JUMP reuses the
dnl  PARAM_DST slot; size and dst have already been loaded into %ecx and %edi
dnl  by the time L(unroll) is reached.
177define(VAR_COUNTER,`PARAM_SIZE')
178define(VAR_JUMP,   `PARAM_DST')
179
C Setup for the unrolled loop, reached when size >= UNROLL_THRESHOLD.
C
C   VAR_COUNTER = (size-2) >> UNROLL_LOG2.  It is decremented once per pass
C                 at the bottom of the loop, which repeats while the result
C                 is not negative.
C   skip        = (-(size-1)) & UNROLL_MASK, the number of limb slots at the
C                 start of the first pass that are jumped over.
C   target      = L(entry) + 15*skip, formed as 16*skip - skip.
C
C %esi and %edi are moved back by skip limbs so that the displacements used
C at the entry slot address the first limbs, with %esi one limb ahead of
C %edi: the low src limb is multiplied here, before the jump, and its
C product (plus the initial carry) is carried into the loop in %ebx/%ecx.
C
C NOTE(review): UNROLL_LOG2, UNROLL_MASK and UNROLL_BYTES are not defined in
C this file; the description assumes UNROLL_MASK is UNROLL_COUNT-1 and
C UNROLL_BYTES is 4*UNROLL_COUNT -- confirm.
180	C this is offset 0x43, so close enough to aligned
181L(unroll):
182	C eax
183	C ebx	initial carry
184	C ecx	size
185	C edx
186	C esi	src
187	C edi	dst
188	C ebp	multiplier
189
190	movl	%ecx, %edx
191	decl	%ecx			C ecx = size-1
192
193	subl	$2, %edx		C edx = size-2
194	negl	%ecx			C ecx = -(size-1)
195
196	shrl	$UNROLL_LOG2, %edx	C edx = loop counter
197	andl	$UNROLL_MASK, %ecx	C ecx = skip
198
199	movl	%edx, VAR_COUNTER
200	movl	%ecx, %edx		C edx = skip
201
202	C 15 code bytes per limb
203ifdef(`PIC',`
204	call	L(pic_calc)		C sets edx = jump target and ecx = -skip
205L(here):
206',`
207	shll	$4, %edx		C edx = 16*skip
208	negl	%ecx			C ecx = -skip
209
210	leal	L(entry) (%edx,%ecx,1), %edx
211')
212	movl	(%esi), %eax		C src low limb
213
214	movl	%edx, VAR_JUMP		C park the target, mull overwrites edx
215	leal	ifelse(UNROLL_BYTES,256,128+) 4(%esi,%ecx,4), %esi
216
217	mull	%ebp			C edx:eax = low src limb * multiplier
218
219	addl	%ebx, %eax	C initial carry (from _1c)
220	adcl	$0, %edx
221
222	movl	%edx, %ebx	C high carry
223	leal	ifelse(UNROLL_BYTES,256,128) (%edi,%ecx,4), %edi
224
225	movl	VAR_JUMP, %edx		C edx = jump target again
226	testl	$1, %ecx		C odd skip?  -skip has the same low bit
227	movl	%eax, %ecx	C low carry
228
	C An odd skip enters at the second limb of a chunk, where the two
	C carry registers have the opposite roles, so exchange them.  The
	C movl above does not disturb the flags set by testl.
229	cmovnz(	%ebx, %ecx)	C high,low carry other way around
230	cmovnz(	%eax, %ebx)
231
232	jmp	*%edx
233
234
C L(pic_calc): PIC variant of the jump target calculation, reached by the
C call placed just before L(here) in the L(unroll) setup.
C
C On entry edx and ecx both hold skip.  On return ecx = -skip and
C edx = address of L(entry) + 15*skip, the same values the non-PIC leal
C produces.  The word at the top of the stack is the return address of that
C call, which is L(here), so adding it to the constant L(entry)-L(here)
C gives the run-time address of L(entry).
C
C NOTE(review): ret_internal is defined outside this file; presumably it is
C the return to use for a call made from within the same function --
C confirm.
235ifdef(`PIC',`
236L(pic_calc):
237	shll	$4, %edx		C edx = 16*skip
238	negl	%ecx			C ecx = -skip
239
240	C See mpn/x86/README about old gas bugs
241	leal	(%edx,%ecx,1), %edx	C edx = 15*skip
242	addl	$L(entry)-L(here), %edx
243
244	addl	(%esp), %edx		C add the return address L(here)
245
246	ret_internal
247')
248
249
250C -----------------------------------------------------------
C Unrolled loop and function exit.
C
C Each pass handles UNROLL_COUNT limbs as UNROLL_COUNT/CHUNK_COUNT chunks of
C two limbs.  The two carry registers swap roles from one limb to the next:
C one holds the word about to be applied to dst with M4_inst, the other
C collects the word for the following limb.
C
C The first pass is entered by the computed jump from the L(unroll) setup,
C which lands on L(entry) or part way into the chunk sequence, and so does
C not execute the addl at L(top).
C NOTE(review): Zdisp is defined outside this file; presumably it forces an
C explicit zero displacement so that the first chunk has the same code size
C as the others, as the computed jump requires -- confirm.
251	ALIGN(32)
252L(top):
253deflit(`FRAME',16)
254	C eax	scratch
255	C ebx	carry hi
256	C ecx	carry lo
257	C edx	scratch
258	C esi	src
259	C edi	dst
260	C ebp	multiplier
261	C
262	C VAR_COUNTER	loop counter
263	C
264	C 15 code bytes per limb
265
266	addl	$UNROLL_BYTES, %edi	C advance dst for this pass
267
268L(entry):
269deflit(CHUNK_COUNT,2)
270forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
271	deflit(`disp0', eval(i*4*CHUNK_COUNT ifelse(UNROLL_BYTES,256,-128)))
272	deflit(`disp1', eval(disp0 + 4))
273
	C first limb of the chunk: ecx is applied to dst, ebx accumulates
274Zdisp(	movl,	disp0,(%esi), %eax)
275	mull	%ebp			C edx:eax = src limb * multiplier
276Zdisp(	M4_inst,%ecx, disp0,(%edi))
277	adcl	%eax, %ebx		C ebx += low word + CF from M4_inst
278	movl	%edx, %ecx
279	adcl	$0, %ecx		C ecx = high word + carry out
280
	C second limb of the chunk: same steps with ebx and ecx exchanged
281	movl	disp1(%esi), %eax
282	mull	%ebp			C edx:eax = src limb * multiplier
283	M4_inst	%ebx, disp1(%edi)
284	adcl	%eax, %ecx		C ecx += low word + CF from M4_inst
285	movl	%edx, %ebx
286	adcl	$0, %ebx		C ebx = high word + carry out
287')
288
289	decl	VAR_COUNTER		C one more pass done
290	leal	UNROLL_BYTES(%esi), %esi	C advance src, decl flags kept
291
292	jns	L(top)			C repeat while the counter is >= 0
293
294
	C One word is still pending in ecx.  dst was not advanced after the
	C final pass, so the limb it belongs to is UNROLL_BYTES past the
	C start of that pass, less the 128 bias when that is in use.
295deflit(`disp0',	eval(UNROLL_BYTES ifelse(UNROLL_BYTES,256,-128)))
296
297	M4_inst	%ecx, disp0(%edi)
298	movl	%ebx, %eax		C high word becomes the return value
299
300	popl	%ebp			C mov and pop keep CF from M4_inst
301	popl	%edi
302
303	popl	%esi
304	popl	%ebx
305	adcl	$0, %eax		C plus the final carry or borrow
306
307	ret
308
309EPILOGUE()
310