1dnl  AMD K6 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple.
2
3dnl  Copyright 1999, 2000, 2001, 2002, 2003, 2005 Free Software Foundation,
4dnl  Inc.
5dnl
6dnl  This file is part of the GNU MP Library.
7dnl
8dnl  The GNU MP Library is free software; you can redistribute it and/or
9dnl  modify it under the terms of the GNU Lesser General Public License as
10dnl  published by the Free Software Foundation; either version 3 of the
11dnl  License, or (at your option) any later version.
12dnl
13dnl  The GNU MP Library is distributed in the hope that it will be useful,
14dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
15dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16dnl  Lesser General Public License for more details.
17dnl
18dnl  You should have received a copy of the GNU Lesser General Public License
19dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
20
21include(`../config.m4')
22
23
24C                           cycles/limb
25C P5:
C P6 model 0-8,10-12             5.94
27C P6 model 9  (Banias)
28C P6 model 13 (Dothan)           5.57
29C P4 model 0  (Willamette)
30C P4 model 1  (?)
31C P4 model 2  (Northwood)
32C P4 model 3  (Prescott)
33C P4 model 4  (Nocona)
34C K6:                           7.65-8.5 (data dependent)
35C K7:
36C K8:
37
38
dnl  K6:           large multipliers  small multipliers
dnl  UNROLL_COUNT    cycles/limb       cycles/limb
dnl        4             9.5              7.78
dnl        8             9.0              7.78
dnl       16             8.4              7.65
dnl       32             8.4              8.2
dnl
dnl  Maximum possible unrolling with the current code is 32.
dnl
dnl  Unrolling to 16 limbs/loop makes the unrolled loop fit exactly in a 256
dnl  byte block, which might explain the good speed at that unrolling.

dnl  Limbs processed per pass of the unrolled loop (must be a power of 2,
dnl  at most 32 per the note above).  16 is the measured sweet spot.
deflit(UNROLL_COUNT, 16)
52
53
dnl  One source file serves both entry-point pairs: the build defines
dnl  OPERATION_addmul_1 or OPERATION_submul_1, which selects the
dnl  destination update instruction (M4_inst = addl or subl) and the
dnl  function names emitted below.
ifdef(`OPERATION_addmul_1', `
	define(M4_inst,        addl)
	define(M4_function_1,  mpn_addmul_1)
	define(M4_function_1c, mpn_addmul_1c)
',`ifdef(`OPERATION_submul_1', `
	define(M4_inst,        subl)
	define(M4_function_1,  mpn_submul_1)
	define(M4_function_1c, mpn_submul_1c)
',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
')')')

MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)
66
67
68C mp_limb_t mpn_addmul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
69C                         mp_limb_t mult);
70C mp_limb_t mpn_addmul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
71C                          mp_limb_t mult, mp_limb_t carry);
72C mp_limb_t mpn_submul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
73C                         mp_limb_t mult);
74C mp_limb_t mpn_submul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
75C                          mp_limb_t mult, mp_limb_t carry);
76C
77C The jadcl0()s in the unrolled loop makes the speed data dependent.  Small
78C multipliers (most significant few bits clear) result in few carry bits and
79C speeds up to 7.65 cycles/limb are attained.  Large multipliers (most
80C significant few bits set) make the carry bits 50/50 and lead to something
81C more like 8.4 c/l.  With adcl's both of these would be 9.3 c/l.
82C
83C It's important that the gains for jadcl0 on small multipliers don't come
84C at the cost of slowing down other data.  Tests on uniformly distributed
85C random data, designed to confound branch prediction, show about a 7%
86C speed-up using jadcl0 over adcl (8.93 versus 9.57 cycles/limb, with all
87C overheads included).
88C
89C In the simple loop, jadcl0() measures slower than adcl (11.9-14.7 versus
90C 11.0 cycles/limb), and hence isn't used.
91C
92C In the simple loop, note that running ecx from negative to zero and using
93C it as an index in the two movs wouldn't help.  It would save one
94C instruction (2*addl+loop becoming incl+jnz), but there's nothing unpaired
95C that would be collapsed by this.
96C
97C Attempts at a simpler main loop, with less unrolling, haven't yielded much
98C success, generally running over 9 c/l.
99C
100C
101C jadcl0
102C ------
103C
104C jadcl0() being faster than adcl $0 seems to be an artifact of two things,
105C firstly the instruction decoding and secondly the fact that there's a
106C carry bit for the jadcl0 only on average about 1/4 of the time.
107C
108C The code in the unrolled loop decodes something like the following.
109C
110C                                         decode cycles
111C		mull	%ebp                    2
112C		M4_inst	%esi, disp(%edi)        1
113C		adcl	%eax, %ecx              2
114C		movl	%edx, %esi            \ 1
115C		jnc	1f                    /
116C		incl	%esi                  \ 1
117C	1:	movl	disp(%ebx), %eax      /
118C                                              ---
119C                                               7
120C
121C In a back-to-back style test this measures 7 with the jnc not taken, or 8
122C with it taken (both when correctly predicted).  This is opposite to the
123C measurements showing small multipliers running faster than large ones.
124C Don't really know why.
125C
126C It's not clear how much branch misprediction might be costing.  The K6
127C doco says it will be 1 to 4 cycles, but presumably it's near the low end
128C of that range to get the measured results.
129C
130C
131C In the code the two carries are more or less the preceding mul product and
132C the calculation is roughly
133C
134C	x*y + u*b+v
135C
136C where b=2^32 is the size of a limb, x*y is the two carry limbs, and u and
137C v are the two limbs it's added to (being the low of the next mul, and a
138C limb from the destination).
139C
140C To get a carry requires x*y+u*b+v >= b^2, which is u*b+v >= b^2-x*y, and
141C there are b^2-(b^2-x*y) = x*y many such values, giving a probability of
142C x*y/b^2.  If x, y, u and v are random and uniformly distributed between 0
143C and b-1, then the total probability can be summed over x and y,
144C
145C	 1    b-1 b-1 x*y    1    b*(b-1)   b*(b-1)
146C	--- * sum sum --- = --- * ------- * ------- = 1/4
147C       b^2   x=0 y=1 b^2   b^4      2         2
148C
149C Actually it's a very tiny bit less than 1/4 of course.  If y is fixed,
150C then the probability is 1/2*y/b thus varying linearly between 0 and 1/2.
151
152
dnl  Minimum size for which the unrolled loop is used; smaller operands
dnl  take the simple loop (see the cmpl/jae at the top of M4_function_1).
dnl  The threshold is higher under PIC, presumably because the computed
dnl  jump needs the extra L(pic_calc) call -- TODO confirm against timings.
ifdef(`PIC',`
deflit(UNROLL_THRESHOLD, 9)
',`
deflit(UNROLL_THRESHOLD, 6)
')

dnl  Stack frame offsets of the C parameters, relative to the entry %esp
dnl  (see defframe in config.m4).  PARAM_CARRY exists only for the
dnl  mpn_addmul_1c/mpn_submul_1c entry points.
defframe(PARAM_CARRY,     20)
defframe(PARAM_MULTIPLIER,16)
defframe(PARAM_SIZE,      12)
defframe(PARAM_SRC,       8)
defframe(PARAM_DST,       4)

	TEXT
	ALIGN(32)
167
PROLOGUE(M4_function_1c)
	C Entry point taking an explicit initial carry (the "_1c" variant).
	C It only loads the carry parameter into %esi and then joins the
	C common code at L(start_nc) inside M4_function_1 below, which
	C expects %esi pushed (FRAME=4) and %esi = initial carry.
	pushl	%esi
deflit(`FRAME',4)
	movl	PARAM_CARRY, %esi	C caller-supplied initial carry
	jmp	L(start_nc)
EPILOGUE()
174
PROLOGUE(M4_function_1)
	C Plain entry point: same as the _1c variant but with a zero
	C initial carry in %esi.  Fix: use size-suffixed pushl to match
	C every other push/pop in this file (AT&T convention here).
	pushl	%esi
deflit(`FRAME',4)
	xorl	%esi, %esi	C initial carry

L(start_nc):
	C Common startup for both entry points.
	movl	PARAM_SIZE, %ecx
	pushl	%ebx
deflit(`FRAME',8)

	movl	PARAM_SRC, %ebx
	pushl	%edi
deflit(`FRAME',12)

	cmpl	$UNROLL_THRESHOLD, %ecx
	movl	PARAM_DST, %edi

	pushl	%ebp
deflit(`FRAME',16)
	jae	L(unroll)


	C simple loop

	movl	PARAM_MULTIPLIER, %ebp

L(simple):
	C eax	scratch
	C ebx	src
	C ecx	counter
	C edx	scratch
	C esi	carry
	C edi	dst
	C ebp	multiplier

	movl	(%ebx), %eax
	addl	$4, %ebx

	mull	%ebp

	addl	$4, %edi
	addl	%esi, %eax	C add previous carry to low product

	adcl	$0, %edx

	M4_inst	%eax, -4(%edi)	C add/sub into destination limb

	adcl	$0, %edx	C pick up carry/borrow from M4_inst

	movl	%edx, %esi	C carry for next limb
	loop	L(simple)


	popl	%ebp
	popl	%edi

	popl	%ebx
	movl	%esi, %eax	C return final carry limb

	popl	%esi
	ret



C -----------------------------------------------------------------------------
C The unrolled loop uses a "two carry limbs" scheme.  At the top of the loop
C the carries are ecx=lo, esi=hi, then they swap for each limb processed.
C For the computed jump an odd size means they start one way around, an even
C size the other.
C
C VAR_JUMP holds the computed jump temporarily because there's not enough
C registers at the point of doing the mul for the initial two carry limbs.
C
C The add/adc for the initial carry in %esi is necessary only for the
C mpn_addmul/submul_1c entry points.  Duplicating the startup code to
C eliminate this for the plain mpn_add/submul_1 doesn't seem like a good
C idea.

dnl  overlapping with parameters already fetched
define(VAR_COUNTER, `PARAM_SIZE')
define(VAR_JUMP,    `PARAM_DST')

L(unroll):
	C eax
	C ebx	src
	C ecx	size
	C edx
	C esi	initial carry
	C edi	dst
	C ebp

	movl	%ecx, %edx
	decl	%ecx

	subl	$2, %edx	C VAR_COUNTER = (size-2)>>UNROLL_LOG2
	negl	%ecx		C ecx = (1-size) & UNROLL_MASK, the limbs
				C handled before the first full pass

	shrl	$UNROLL_LOG2, %edx
	andl	$UNROLL_MASK, %ecx

	movl	%edx, VAR_COUNTER
	movl	%ecx, %edx

	shll	$4, %edx	C edx = 16*ecx; combined below with -1*ecx
	negl	%ecx		C to give 15 bytes per limb skipped

	C 15 code bytes per limb
ifdef(`PIC',`
	call	L(pic_calc)
L(here):
',`
	leal	L(entry) (%edx,%ecx,1), %edx
')
	movl	(%ebx), %eax		C src low limb

	movl	PARAM_MULTIPLIER, %ebp
	movl	%edx, VAR_JUMP

	mull	%ebp

	addl	%esi, %eax	C initial carry (from _1c)
	jadcl0(	%edx)


	leal	4(%ebx,%ecx,4), %ebx
	movl	%edx, %esi	C high carry

	movl	VAR_JUMP, %edx
	leal	(%edi,%ecx,4), %edi

	testl	$1, %ecx
	movl	%eax, %ecx	C low carry

	jz	L(noswap)
	movl	%esi, %ecx	C high,low carry other way around

	movl	%eax, %esi
L(noswap):

	jmp	*%edx		C computed jump into the unrolled loop


ifdef(`PIC',`
L(pic_calc):
	C See mpn/x86/README about old gas bugs
	leal	(%edx,%ecx,1), %edx
	addl	$L(entry)-L(here), %edx
	addl	(%esp), %edx	C add return address = L(here) for PIC
	ret_internal
')


C -----------------------------------------------------------
	ALIGN(32)
L(top):
deflit(`FRAME',16)
	C eax	scratch
	C ebx	src
	C ecx	carry lo
	C edx	scratch
	C esi	carry hi
	C edi	dst
	C ebp	multiplier
	C
	C 15 code bytes per limb

	leal	UNROLL_BYTES(%edi), %edi

L(entry):
forloop(`i', 0, UNROLL_COUNT/2-1, `
	deflit(`disp0', eval(2*i*4))
	deflit(`disp1', eval(disp0 + 4))

Zdisp(	movl,	disp0,(%ebx), %eax)
	mull	%ebp
Zdisp(	M4_inst,%ecx, disp0,(%edi))
	adcl	%eax, %esi
	movl	%edx, %ecx
	jadcl0(	%ecx)

	movl	disp1(%ebx), %eax
	mull	%ebp
	M4_inst	%esi, disp1(%edi)
	adcl	%eax, %ecx
	movl	%edx, %esi
	jadcl0(	%esi)
')

	decl	VAR_COUNTER

	leal	UNROLL_BYTES(%ebx), %ebx
	jns	L(top)


	C Final tail: apply the remaining low carry to the limb past the
	C last full block, and return the high carry (plus any final
	C carry/borrow from that M4_inst) in %eax.
	popl	%ebp
	M4_inst	%ecx, UNROLL_BYTES(%edi)

	popl	%edi
	movl	%esi, %eax

	popl	%ebx
	jadcl0(	%eax)

	popl	%esi
	ret

EPILOGUE()
382