1dnl  AMD K6 mpn_mul_1 -- mpn by limb multiply.
2
3dnl  Copyright 1999, 2000, 2002, 2005 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33
34C			    cycles/limb
35C P5
36C P6 model 0-8,10-12		 5.5
37C P6 model 9  (Banias)
38C P6 model 13 (Dothan)		 4.87
39C P4 model 0  (Willamette)
40C P4 model 1  (?)
41C P4 model 2  (Northwood)
42C P4 model 3  (Prescott)
43C P4 model 4  (Nocona)
44C AMD K6			 6.25
45C AMD K7
46C AMD K8
47
48
49C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
50C                      mp_limb_t multiplier);
51C mp_limb_t mpn_mul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
52C                       mp_limb_t multiplier, mp_limb_t carry);
53C
54C Multiply src,size by mult and store the result in dst,size.
55C Return the carry limb from the top of the result.
56C
57C mpn_mul_1c() accepts an initial carry for the calculation; it is added into
58C the low limb of the result.
59
C Stack offsets of the arguments, in the order of the prototypes above: dst,
C src, size, multiplier, and for mpn_mul_1c only, carry.
C NOTE(review): defframe is defined outside this file; the offsets look like
C distances from the stack pointer at function entry (return address at 0),
C with FRAME tracking the later pushes -- confirm against the macro definition.
60defframe(PARAM_CARRY,     20)
61defframe(PARAM_MULTIPLIER,16)
62defframe(PARAM_SIZE,      12)
63defframe(PARAM_SRC,       8)
64defframe(PARAM_DST,       4)
65
66dnl  minimum 5 because the unrolled code can't handle less
dnl  (the unrolled path always runs one full 4-limb pass of the main loop and
dnl  then at least one more limb in the finish code)
67deflit(UNROLL_THRESHOLD, 5)
68
69	TEXT
70	ALIGN(32)
71
C mpn_mul_1c entry point: same multiply as mpn_mul_1, but the running carry
C starts from the caller-supplied carry argument instead of zero.
C
C Only the entry differs: esi is saved, loaded with PARAM_CARRY, and control
C joins the shared code at the start_nc label inside mpn_mul_1.  The pushed
C esi is restored by the popl of esi that precedes each ret in that code.
72PROLOGUE(mpn_mul_1c)
73	pushl	%esi
C one register pushed so far, same stack depth as mpn_mul_1 at start_nc
74deflit(`FRAME',4)
75	movl	PARAM_CARRY, %esi	C esi is the carry register throughout
76	jmp	L(start_nc)
77EPILOGUE()
78
79
C mpn_mul_1 entry point: multiply src,size by the multiplier starting from a
C zero carry, store the low limbs at dst and return the final carry limb in
C eax.
C
C esi, ebx, edi and ebp are pushed here and popped again before each ret.
C Sizes below UNROLL_THRESHOLD are handled by the simple loop, larger ones by
C the 4-way unrolled loop.
C
C NOTE(review): both paths process at least one limb (the simple loop
C decrements ecx before testing it), so size is presumably required to be
C at least 1 -- confirm against the callers.
80PROLOGUE(mpn_mul_1)
81	push	%esi
82deflit(`FRAME',4)
83	xorl	%esi, %esi	C initial carry
84
C mpn_mul_1c joins here with its carry already in esi and esi already pushed.
85L(start_nc):
86	mov	PARAM_SIZE, %ecx
87	push	%ebx
88FRAME_pushl()
89
90	movl	PARAM_SRC, %ebx
91	push	%edi
92FRAME_pushl()
93
94	movl	PARAM_DST, %edi
95	pushl	%ebp
96FRAME_pushl()
97
98	cmpl	$UNROLL_THRESHOLD, %ecx
99	movl	PARAM_MULTIPLIER, %ebp	C movl leaves the cmpl flags intact
100
101	jae	L(unroll)	C unsigned size >= UNROLL_THRESHOLD
102
103
104	C code offset 0x22 here, close enough to aligned
105L(simple):
106	C eax	scratch
107	C ebx	src
108	C ecx	counter
109	C edx	scratch
110	C esi	carry
111	C edi	dst
112	C ebp	multiplier
113	C
114	C this loop 8 cycles/limb
	C
	C Each pass forms edx:eax = src limb * multiplier, adds the incoming
	C carry to the low half and stores it, and leaves the high half plus
	C the carry-out of that add in esi for the next limb.
115
116	movl	(%ebx), %eax
117	addl	$4, %ebx
118
119	mull	%ebp
120
121	addl	%esi, %eax
122	movl	$0, %esi	C movl not xorl, the carry flag must survive
123
124	adcl	%edx, %esi	C esi = high limb + carry from the addl
125
126	movl	%eax, (%edi)
127	addl	$4, %edi
128
129	loop	L(simple)	C decrement ecx, repeat while non-zero
130
131
	C restore the saved registers in reverse push order, return carry
132	popl	%ebp
133
134	popl	%edi
135	popl	%ebx
136
137	movl	%esi, %eax	C return value is the final carry limb
138	popl	%esi
139
140	ret
141
142
143C -----------------------------------------------------------------------------
144C The code for each limb is 6 cycles, with instruction decoding being the
145C limiting factor.  At 4 limbs/loop and 1 cycle/loop of overhead it's 6.25
146C cycles/limb in total.
147C
148C The secret ingredient to get 6.25 is to start the loop with the mul and
149C have the load/store pair at the end.  Rotating the load/store to the top
150C is an 0.5 c/l slowdown.  (Some address generation effect probably.)
151C
152C The whole unrolled loop fits nicely in exactly 80 bytes.
C
C Setup: the first source limb is preloaded into eax, ebx and edi are moved
C to point at the last four limbs of src and dst, and ecx becomes the
C negative index 4-size, so that (edi,ecx,4) initially addresses dst[0].
153
154
155	ALIGN(16)	C already aligned to 16 here actually
156L(unroll):
157	movl	(%ebx), %eax	C preload src[0] for the first mull
158	leal	-16(%ebx,%ecx,4), %ebx	C ebx = &src[size-4]
159
160	leal	-16(%edi,%ecx,4), %edi	C edi = &dst[size-4]
161	subl	$4, %ecx
162
163	negl	%ecx	C ecx = -(size-4), counts up towards zero
164
165
166	ALIGN(16)	C one byte nop for this alignment
167L(top):
168	C eax	scratch
169	C ebx	&src[size-4]
170	C ecx	counter
171	C edx	scratch
172	C esi	carry
173	C edi	&dst[size-4]
174	C ebp	multiplier
	C
	C Four identical limb steps per pass.  Each step multiplies the limb
	C already in eax, folds in the carry, stores the low half, and then
	C loads the source limb for the following step.
175
176	mull	%ebp
177
178	addl	%esi, %eax
179	movl	$0, %esi	C flags untouched, carry from the addl stays live
180
181	adcl	%edx, %esi
182
183	movl	%eax, (%edi,%ecx,4)
184	movl	4(%ebx,%ecx,4), %eax
185
186
187	mull	%ebp
188
189	addl	%esi, %eax
190	movl	$0, %esi
191
192	adcl	%edx, %esi
193
194	movl	%eax, 4(%edi,%ecx,4)
195	movl	8(%ebx,%ecx,4), %eax
196
197
198	mull	%ebp
199
200	addl	%esi, %eax
201	movl	$0, %esi
202
203	adcl	%edx, %esi
204
205	movl	%eax, 8(%edi,%ecx,4)
206	movl	12(%ebx,%ecx,4), %eax
207
208
209	mull	%ebp
210
211	addl	%esi, %eax
212	movl	$0, %esi
213
214	adcl	%edx, %esi
215
216	movl	%eax, 12(%edi,%ecx,4)
217	movl	16(%ebx,%ecx,4), %eax	C next limb, for the next pass or the finish code
218
219
220	addl	$4, %ecx
221	js	L(top)	C loop while the index is still negative
222
223
224
225	C eax	next src limb
226	C ebx	&src[size-4]
227	C ecx	0 to 3 representing respectively 4 to 1 further limbs
228	C edx
229	C esi	carry
230	C edi	&dst[size-4]
	C
	C The remaining 1 to 4 limbs are done as an optional pair of limbs,
	C an optional single limb, and then the final limb, selected by the
	C low two bits of ecx.
231
232	testb	$2, %cl
233	jnz	L(finish_not_two)	C ecx is 2 or 3, only 2 or 1 limbs remain
234
235	mull	%ebp
236
237	addl	%esi, %eax
238	movl	$0, %esi
239
240	adcl	%edx, %esi
241
242	movl	%eax, (%edi,%ecx,4)
243	movl	4(%ebx,%ecx,4), %eax
244
245
246	mull	%ebp
247
248	addl	%esi, %eax
249	movl	$0, %esi
250
251	adcl	%edx, %esi
252
253	movl	%eax, 4(%edi,%ecx,4)
254	movl	8(%ebx,%ecx,4), %eax
255
256	addl	$2, %ecx	C two limbs done, ecx is now 2 or 3
257L(finish_not_two):
258
259
	C ecx is 2 or 3 here, so fixed offsets from ebx and edi are used below
260	testb	$1, %cl
261	jnz	L(finish_not_one)	C ecx is 3, only the last limb remains
262
263	mull	%ebp
264
265	addl	%esi, %eax
266	movl	$0, %esi
267
268	adcl	%edx, %esi
269
270	movl	%eax, 8(%edi)	C dst[size-2]
271	movl	12(%ebx), %eax	C src[size-1]
272L(finish_not_one):
273
274
	C last limb, the high half plus carry becomes the return value
275	mull	%ebp
276
277	addl	%esi, %eax
278	popl	%ebp	C popl leaves the flags alone, carry still live
279
280	adcl	$0, %edx	C edx = final carry limb
281
282	movl	%eax, 12(%edi)	C dst[size-1]
283	popl	%edi
284
285	popl	%ebx
286	movl	%edx, %eax	C return value
287
288	popl	%esi
289
290	ret
291
292EPILOGUE()
293