1dnl  AMD K6 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple.
2
3dnl  Copyright 1999-2003, 2005 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33
34C			    cycles/limb
35C P5
36C P6 model 0-8,10-12		 5.94
37C P6 model 9  (Banias)		 5.51
38C P6 model 13 (Dothan)		 5.57
39C P4 model 0  (Willamette)
40C P4 model 1  (?)
41C P4 model 2  (Northwood)
42C P4 model 3  (Prescott)
43C P4 model 4  (Nocona)
44C AMD K6			7.65-8.5 (data dependent)
45C AMD K7
46C AMD K8
47
48
49dnl  K6:           large multipliers  small multipliers
50dnl  UNROLL_COUNT    cycles/limb       cycles/limb
51dnl        4             9.5              7.78
52dnl        8             9.0              7.78
53dnl       16             8.4              7.65
54dnl       32             8.4              8.2
55dnl
56dnl  Maximum possible unrolling with the current code is 32.
57dnl
58dnl  Unrolling to 16 limbs/loop makes the unrolled loop fit exactly in a 256
59dnl  byte block, which might explain the good speed at that unrolling.
60
61deflit(UNROLL_COUNT, 16)
62
63
64ifdef(`OPERATION_addmul_1', `
65	define(M4_inst,        addl)
66	define(M4_function_1,  mpn_addmul_1)
67	define(M4_function_1c, mpn_addmul_1c)
68',`ifdef(`OPERATION_submul_1', `
69	define(M4_inst,        subl)
70	define(M4_function_1,  mpn_submul_1)
71	define(M4_function_1c, mpn_submul_1c)
72',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
73')')')
74
75MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)
76
77
78C mp_limb_t mpn_addmul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
79C                         mp_limb_t mult);
80C mp_limb_t mpn_addmul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
81C                          mp_limb_t mult, mp_limb_t carry);
82C mp_limb_t mpn_submul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
83C                         mp_limb_t mult);
84C mp_limb_t mpn_submul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
85C                          mp_limb_t mult, mp_limb_t carry);
86C
87C The jadcl0()s in the unrolled loop makes the speed data dependent.  Small
88C multipliers (most significant few bits clear) result in few carry bits and
89C speeds up to 7.65 cycles/limb are attained.  Large multipliers (most
90C significant few bits set) make the carry bits 50/50 and lead to something
91C more like 8.4 c/l.  With adcl's both of these would be 9.3 c/l.
92C
93C It's important that the gains for jadcl0 on small multipliers don't come
94C at the cost of slowing down other data.  Tests on uniformly distributed
95C random data, designed to confound branch prediction, show about a 7%
96C speed-up using jadcl0 over adcl (8.93 versus 9.57 cycles/limb, with all
97C overheads included).
98C
99C In the simple loop, jadcl0() measures slower than adcl (11.9-14.7 versus
100C 11.0 cycles/limb), and hence isn't used.
101C
102C In the simple loop, note that running ecx from negative to zero and using
103C it as an index in the two movs wouldn't help.  It would save one
104C instruction (2*addl+loop becoming incl+jnz), but there's nothing unpaired
105C that would be collapsed by this.
106C
107C Attempts at a simpler main loop, with less unrolling, haven't yielded much
108C success, generally running over 9 c/l.
109C
110C
111C jadcl0
112C ------
113C
114C jadcl0() being faster than adcl $0 seems to be an artifact of two things,
115C firstly the instruction decoding and secondly the fact that there's a
116C carry bit for the jadcl0 only on average about 1/4 of the time.
117C
118C The code in the unrolled loop decodes something like the following.
119C
120C                                         decode cycles
121C		mull	%ebp                    2
122C		M4_inst	%esi, disp(%edi)        1
123C		adcl	%eax, %ecx              2
124C		movl	%edx, %esi            \ 1
125C		jnc	1f                    /
126C		incl	%esi                  \ 1
127C	1:	movl	disp(%ebx), %eax      /
128C                                              ---
129C                                               7
130C
131C In a back-to-back style test this measures 7 with the jnc not taken, or 8
132C with it taken (both when correctly predicted).  This is opposite to the
133C measurements showing small multipliers running faster than large ones.
134C Don't really know why.
135C
136C It's not clear how much branch misprediction might be costing.  The K6
137C doco says it will be 1 to 4 cycles, but presumably it's near the low end
138C of that range to get the measured results.
139C
140C
141C In the code the two carries are more or less the preceding mul product and
142C the calculation is roughly
143C
144C	x*y + u*b+v
145C
146C where b=2^32 is the size of a limb, x*y is the two carry limbs, and u and
147C v are the two limbs it's added to (being the low of the next mul, and a
148C limb from the destination).
149C
150C To get a carry requires x*y+u*b+v >= b^2, which is u*b+v >= b^2-x*y, and
151C there are b^2-(b^2-x*y) = x*y many such values, giving a probability of
152C x*y/b^2.  If x, y, u and v are random and uniformly distributed between 0
153C and b-1, then the total probability can be summed over x and y,
154C
155C	 1    b-1 b-1 x*y    1    b*(b-1)   b*(b-1)
156C	--- * sum sum --- = --- * ------- * ------- = 1/4
157C       b^2   x=0 y=1 b^2   b^4      2         2
158C
159C Actually it's a very tiny bit less than 1/4 of course.  If y is fixed,
160C then the probability is 1/2*y/b thus varying linearly between 0 and 1/2.
161
162
163ifdef(`PIC',`
164deflit(UNROLL_THRESHOLD, 9)
165',`
166deflit(UNROLL_THRESHOLD, 6)
167')
168
169defframe(PARAM_CARRY,     20)
170defframe(PARAM_MULTIPLIER,16)
171defframe(PARAM_SIZE,      12)
172defframe(PARAM_SRC,       8)
173defframe(PARAM_DST,       4)
174
175	TEXT
176	ALIGN(32)
177
178PROLOGUE(M4_function_1c)
179	pushl	%esi
180deflit(`FRAME',4)
181	movl	PARAM_CARRY, %esi
182	jmp	L(start_nc)
183EPILOGUE()
184
185PROLOGUE(M4_function_1)
186	push	%esi
187deflit(`FRAME',4)
188	xorl	%esi, %esi	C initial carry
189
190L(start_nc):
191	movl	PARAM_SIZE, %ecx
192	pushl	%ebx
193deflit(`FRAME',8)
194
195	movl	PARAM_SRC, %ebx
196	pushl	%edi
197deflit(`FRAME',12)
198
199	cmpl	$UNROLL_THRESHOLD, %ecx
200	movl	PARAM_DST, %edi
201
202	pushl	%ebp
203deflit(`FRAME',16)
204	jae	L(unroll)
205
206
207	C simple loop
208
209	movl	PARAM_MULTIPLIER, %ebp
210
211L(simple):
212	C eax	scratch
213	C ebx	src
214	C ecx	counter
215	C edx	scratch
216	C esi	carry
217	C edi	dst
218	C ebp	multiplier
219
220	movl	(%ebx), %eax
221	addl	$4, %ebx
222
223	mull	%ebp
224
225	addl	$4, %edi
226	addl	%esi, %eax
227
228	adcl	$0, %edx
229
230	M4_inst	%eax, -4(%edi)
231
232	adcl	$0, %edx
233
234	movl	%edx, %esi
235	loop	L(simple)
236
237
238	popl	%ebp
239	popl	%edi
240
241	popl	%ebx
242	movl	%esi, %eax
243
244	popl	%esi
245	ret
246
247
248
249C -----------------------------------------------------------------------------
250C The unrolled loop uses a "two carry limbs" scheme.  At the top of the loop
251C the carries are ecx=lo, esi=hi, then they swap for each limb processed.
252C For the computed jump an odd size means they start one way around, an even
253C size the other.
254C
255C VAR_JUMP holds the computed jump temporarily because there's not enough
256C registers at the point of doing the mul for the initial two carry limbs.
257C
258C The add/adc for the initial carry in %esi is necessary only for the
259C mpn_addmul/submul_1c entry points.  Duplicating the startup code to
260C eliminate this for the plain mpn_add/submul_1 doesn't seem like a good
261C idea.
262
263dnl  overlapping with parameters already fetched
264define(VAR_COUNTER, `PARAM_SIZE')
265define(VAR_JUMP,    `PARAM_DST')
266
267L(unroll):
268	C eax
269	C ebx	src
270	C ecx	size
271	C edx
272	C esi	initial carry
273	C edi	dst
274	C ebp
275
276	movl	%ecx, %edx
277	decl	%ecx
278
279	subl	$2, %edx
280	negl	%ecx
281
282	shrl	$UNROLL_LOG2, %edx
283	andl	$UNROLL_MASK, %ecx
284
285	movl	%edx, VAR_COUNTER
286	movl	%ecx, %edx
287
288	shll	$4, %edx
289	negl	%ecx
290
291	C 15 code bytes per limb
292ifdef(`PIC',`
293	call	L(pic_calc)
294L(here):
295',`
296	leal	L(entry) (%edx,%ecx,1), %edx
297')
298	movl	(%ebx), %eax		C src low limb
299
300	movl	PARAM_MULTIPLIER, %ebp
301	movl	%edx, VAR_JUMP
302
303	mull	%ebp
304
305	addl	%esi, %eax	C initial carry (from _1c)
306	jadcl0(	%edx)
307
308
309	leal	4(%ebx,%ecx,4), %ebx
310	movl	%edx, %esi	C high carry
311
312	movl	VAR_JUMP, %edx
313	leal	(%edi,%ecx,4), %edi
314
315	testl	$1, %ecx
316	movl	%eax, %ecx	C low carry
317
318	jz	L(noswap)
319	movl	%esi, %ecx	C high,low carry other way around
320
321	movl	%eax, %esi
322L(noswap):
323
324	jmp	*%edx
325
326
327ifdef(`PIC',`
328L(pic_calc):
329	C See mpn/x86/README about old gas bugs
330	leal	(%edx,%ecx,1), %edx
331	addl	$L(entry)-L(here), %edx
332	addl	(%esp), %edx
333	ret_internal
334')
335
336
337C -----------------------------------------------------------
338	ALIGN(32)
339L(top):
340deflit(`FRAME',16)
341	C eax	scratch
342	C ebx	src
343	C ecx	carry lo
344	C edx	scratch
345	C esi	carry hi
346	C edi	dst
347	C ebp	multiplier
348	C
349	C 15 code bytes per limb
350
351	leal	UNROLL_BYTES(%edi), %edi
352
353L(entry):
354forloop(`i', 0, UNROLL_COUNT/2-1, `
355	deflit(`disp0', eval(2*i*4))
356	deflit(`disp1', eval(disp0 + 4))
357
358Zdisp(	movl,	disp0,(%ebx), %eax)
359	mull	%ebp
360Zdisp(	M4_inst,%ecx, disp0,(%edi))
361	adcl	%eax, %esi
362	movl	%edx, %ecx
363	jadcl0(	%ecx)
364
365	movl	disp1(%ebx), %eax
366	mull	%ebp
367	M4_inst	%esi, disp1(%edi)
368	adcl	%eax, %ecx
369	movl	%edx, %esi
370	jadcl0(	%esi)
371')
372
373	decl	VAR_COUNTER
374
375	leal	UNROLL_BYTES(%ebx), %ebx
376	jns	L(top)
377
378
379	popl	%ebp
380	M4_inst	%ecx, UNROLL_BYTES(%edi)
381
382	popl	%edi
383	movl	%esi, %eax
384
385	popl	%ebx
386	jadcl0(	%eax)
387
388	popl	%esi
389	ret
390
391EPILOGUE()
392