1dnl  AMD K6 mpn_mul_1 -- mpn by limb multiply.
2
3dnl  Copyright 1999, 2000, 2002, 2005 Free Software Foundation, Inc.
4dnl
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or
8dnl  modify it under the terms of the GNU Lesser General Public License as
9dnl  published by the Free Software Foundation; either version 3 of the
10dnl  License, or (at your option) any later version.
11dnl
12dnl  The GNU MP Library is distributed in the hope that it will be useful,
13dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
14dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15dnl  Lesser General Public License for more details.
16dnl
17dnl  You should have received a copy of the GNU Lesser General Public License
18dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
20include(`../config.m4')
21
22
23C                           cycles/limb
24C P5:
25C P6 model 0-8,10-12             5.5
26C P6 model 9  (Banias)
27C P6 model 13 (Dothan)           4.87
28C P4 model 0  (Willamette)
29C P4 model 1  (?)
30C P4 model 2  (Northwood)
31C P4 model 3  (Prescott)
32C P4 model 4  (Nocona)
33C K6:                            6.25
34C K7:
35C K8:
36
37
38C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
39C                      mp_limb_t multiplier);
40C mp_limb_t mpn_mul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
41C                       mp_limb_t multiplier, mp_limb_t carry);
42C
43C Multiply {src,size} by multiplier and store the low size limbs of the
44C result at {dst,size}.  Return the carry limb from the top of the result.
45C
46C mpn_mul_1c() also accepts an initial carry for the calculation; it is added
47C into the low limb of the result.
48
dnl  Names for the stack arguments.  The offsets follow the argument order of
dnl  the prototypes above: dst at 4, src at 8, size at 12, multiplier at 16
dnl  and, for mpn_mul_1c only, carry at 20.
dnl
dnl  NOTE(review): defframe is defined outside this file; presumably the
dnl  offsets are relative to the stack pointer at function entry and get
dnl  adjusted by the current FRAME value -- confirm in the macro definitions.
49defframe(PARAM_CARRY,     20)
50defframe(PARAM_MULTIPLIER,16)
51defframe(PARAM_SIZE,      12)
52defframe(PARAM_SRC,       8)
53defframe(PARAM_DST,       4)
54
dnl  Sizes below UNROLL_THRESHOLD are handled by the one-limb-per-iteration
dnl  L(simple) loop, sizes at or above it by the 4-limb unrolled code at
dnl  L(unroll).
dnl
55dnl  minimum 5 because the unrolled code can't handle less
dnl  (L(unroll) always runs one full pass of L(top), which is 4 limbs, and the
dnl  finishing code after the loop always processes at least 1 more limb).
56deflit(UNROLL_THRESHOLD, 5)
57
58	TEXT
59	ALIGN(32)
60
C mpn_mul_1c entry point.
C
C Saves esi, loads the caller-supplied initial carry into esi, then joins
C the code shared with mpn_mul_1 at L(start_nc).  From there on esi is the
C running carry; mpn_mul_1 reaches the same label with esi zeroed instead.
C
C FRAME is set to 4 to account for the single 4-byte push made so far.
C NOTE(review): presumably this is what keeps the PARAM_* references
C pointing at the right stack slots -- confirm against the defframe and
C FRAME definitions, which are outside this file.
61PROLOGUE(mpn_mul_1c)
62	pushl	%esi
63deflit(`FRAME',4)
64	movl	PARAM_CARRY, %esi
65	jmp	L(start_nc)
66EPILOGUE()
67
68
C mpn_mul_1 entry point, and the body shared with mpn_mul_1c (which jumps
C in at L(start_nc) with its carry argument already in esi).
C
C esi, ebx, edi and ebp are pushed on the way in and popped again before
C each of the two ret instructions.  The carry-out limb is returned in eax.
C
C Every limb is handled the same way: mull leaves the 64-bit product
C src[i]*multiplier in edx:eax, the running carry in esi is added to eax,
C and the new carry is edx plus the carry flag from that addition.
69PROLOGUE(mpn_mul_1)
70	push	%esi
71deflit(`FRAME',4)
72	xorl	%esi, %esi	C initial carry
73
74L(start_nc):
	C Common path: esi = incoming carry, esi already pushed.
	C The remaining pushes are interleaved with the parameter loads, each
	C push being followed by FRAME_pushl().
	C NOTE(review): FRAME_pushl is defined outside this file; presumably
	C it advances FRAME by 4 to match the push -- confirm.
75	mov	PARAM_SIZE, %ecx
76	push	%ebx
77FRAME_pushl()
78
79	movl	PARAM_SRC, %ebx
80	push	%edi
81FRAME_pushl()
82
83	movl	PARAM_DST, %edi
84	pushl	%ebp
85FRAME_pushl()
86
	C Unsigned compare of size against the threshold.  The movl between
	C the cmpl and the jae does not modify flags, so the jae still acts
	C on the result of the cmpl.
87	cmpl	$UNROLL_THRESHOLD, %ecx
88	movl	PARAM_MULTIPLIER, %ebp
89
90	jae	L(unroll)
91
92
93	C code offset 0x22 here, close enough to aligned
94L(simple):
95	C eax	scratch
96	C ebx	src
97	C ecx	counter
98	C edx	scratch
99	C esi	carry
100	C edi	dst
101	C ebp	multiplier
102	C
103	C this loop 8 cycles/limb
	C
	C One limb per iteration, src and dst pointers stepped by 4 bytes.
	C ecx starts as size and is counted down by the loop instruction.
	C NOTE(review): a size of 0 would make the loop instruction wrap ecx
	C rather than stop; presumably callers guarantee size >= 1 -- confirm.
104
105	movl	(%ebx), %eax
106	addl	$4, %ebx
107
108	mull	%ebp
109
	C esi is cleared with movl rather than xorl because movl leaves the
	C carry flag from the addl intact for the adcl that follows.
110	addl	%esi, %eax
111	movl	$0, %esi
112
113	adcl	%edx, %esi
114
115	movl	%eax, (%edi)
116	addl	$4, %edi
117
118	loop	L(simple)
119
120
	C Simple-path exit: restore registers in reverse push order and
	C return the final carry (esi) in eax.
121	popl	%ebp
122
123	popl	%edi
124	popl	%ebx
125
126	movl	%esi, %eax
127	popl	%esi
128
129	ret
130
131
132C -----------------------------------------------------------------------------
133C The code for each limb is 6 cycles, with instruction decoding being the
134C limiting factor.  At 4 limbs/loop and 1 cycle/loop of overhead it's 6.25
135C cycles/limb in total.
136C
137C The secret ingredient to get 6.25 is to start the loop with the mul and
138C have the load/store pair at the end.  Rotating the load/store to the top
139C is a 0.5 c/l slowdown.  (Some address generation effect probably.)
140C
141C The whole unrolled loop fits nicely in exactly 80 bytes.
142
143
144	ALIGN(16)	C already aligned to 16 here actually
145L(unroll):
	C Setup for the unrolled loop (size >= UNROLL_THRESHOLD here):
	C   eax = src[0], loaded before ebx is repointed
	C   ebx = &src[size-4], edi = &dst[size-4]
	C   ecx = -(size-4), so (%ebx,%ecx,4) initially addresses src[0]
146	movl	(%ebx), %eax
147	leal	-16(%ebx,%ecx,4), %ebx
148
149	leal	-16(%edi,%ecx,4), %edi
150	subl	$4, %ecx
151
152	negl	%ecx
153
154
155	ALIGN(16)	C one byte nop for this alignment
156L(top):
157	C eax	scratch
158	C ebx	&src[size-4]
159	C ecx	counter
160	C edx	scratch
161	C esi	carry
162	C edi	&dst[size-4]
163	C ebp	multiplier
	C
	C Four limbs per pass.  Each stanza multiplies the limb already in
	C eax, folds in the carry, stores the low word, and then loads the
	C next source limb into eax ready for the following mull.
	C ecx is negative while in the loop and is stepped up by 4 per pass.
164
165	mull	%ebp
166
167	addl	%esi, %eax
168	movl	$0, %esi
169
170	adcl	%edx, %esi
171
172	movl	%eax, (%edi,%ecx,4)
173	movl	4(%ebx,%ecx,4), %eax
174
175
176	mull	%ebp
177
178	addl	%esi, %eax
179	movl	$0, %esi
180
181	adcl	%edx, %esi
182
183	movl	%eax, 4(%edi,%ecx,4)
184	movl	8(%ebx,%ecx,4), %eax
185
186
187	mull	%ebp
188
189	addl	%esi, %eax
190	movl	$0, %esi
191
192	adcl	%edx, %esi
193
194	movl	%eax, 8(%edi,%ecx,4)
195	movl	12(%ebx,%ecx,4), %eax
196
197
198	mull	%ebp
199
200	addl	%esi, %eax
201	movl	$0, %esi
202
203	adcl	%edx, %esi
204
205	movl	%eax, 12(%edi,%ecx,4)
206	movl	16(%ebx,%ecx,4), %eax
207
208
	C Repeat while the counter is still negative.
209	addl	$4, %ecx
210	js	L(top)
211
212
213
214	C eax	next src limb
215	C ebx	&src[size-4]
216	C ecx	0 to 3 representing respectively 4 to 1 further limbs
217	C edx
218	C esi	carry
219	C edi	&dst[size-4]
	C
	C Finish: the low two bits of ecx select how many limbs are left.
	C Bit 1 clear (ecx 0 or 1) means at least 3 remain, so do two here.
220
221	testb	$2, %cl
222	jnz	L(finish_not_two)
223
224	mull	%ebp
225
226	addl	%esi, %eax
227	movl	$0, %esi
228
229	adcl	%edx, %esi
230
231	movl	%eax, (%edi,%ecx,4)
232	movl	4(%ebx,%ecx,4), %eax
233
234
235	mull	%ebp
236
237	addl	%esi, %eax
238	movl	$0, %esi
239
240	adcl	%edx, %esi
241
242	movl	%eax, 4(%edi,%ecx,4)
243	movl	8(%ebx,%ecx,4), %eax
244
245	addl	$2, %ecx
246L(finish_not_two):
	C ecx is now 2 or 3.  Bit 0 clear (ecx 2) means two limbs remain, so
	C do one here; it is dst[size-2], hence the fixed offset 8 from edi,
	C and the limb loaded afterwards is src[size-1] at offset 12 from ebx.
247
248
249	testb	$1, %cl
250	jnz	L(finish_not_one)
251
252	mull	%ebp
253
254	addl	%esi, %eax
255	movl	$0, %esi
256
257	adcl	%edx, %esi
258
259	movl	%eax, 8(%edi)
260	movl	12(%ebx), %eax
261L(finish_not_one):
	C Last limb, src[size-1] in eax.  The carry-out is formed directly
	C in edx instead of esi.  The register pops are interleaved with the
	C arithmetic: popl does not modify flags, so the adcl still sees the
	C carry flag from the addl, and ebp is popped only after the final
	C mull has used it.
262
263
264	mull	%ebp
265
266	addl	%esi, %eax
267	popl	%ebp
268
269	adcl	$0, %edx
270
271	movl	%eax, 12(%edi)
272	popl	%edi
273
274	popl	%ebx
275	movl	%edx, %eax
276
277	popl	%esi
278
279	ret
280
281EPILOGUE()
282