1dnl  Intel Pentium MMX mpn_mul_1 -- mpn by limb multiplication.
2
3dnl  Copyright 2000-2002 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33
34C    cycles/limb
35C P5:   12.0   for 32-bit multiplier
36C        7.0   for 16-bit multiplier
37
38
39C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
40C                      mp_limb_t multiplier);
41C
42C When the multiplier is 16 bits some special case MMX code is used.  Small
43C multipliers might arise reasonably often from mpz_mul_ui etc.  If the size
44C is odd there's roughly a 5 cycle penalty, so times for say size==7 and
45C size==8 end up being quite close.  If src isn't aligned to an 8 byte
46C boundary then one limb is processed separately with roughly a 5 cycle
47C penalty, so in that case it's say size==8 and size==9 which are close.
48C
49C Alternatives:
50C
51C MMX is not believed to be of any use for 32-bit multipliers, since for
52C instance the current method would just have to be more or less duplicated
53C for the high and low halves of the multiplier, and would probably
54C therefore run at about 14 cycles, which is slower than the plain integer
55C at 12.
56C
57C Adding the high and low MMX products using integer code seems best.  An
58C attempt at using paddd and carry bit propagation with pcmpgtd didn't give
59C any joy.  Perhaps something could be done keeping the values signed and
60C thereby avoiding adjustments to make pcmpgtd into an unsigned compare, or
61C perhaps not.
62C
63C Future:
64C
65C An mpn_mul_1c entrypoint would need a double carry out of the low result
66C limb in the 16-bit code, unless it could be assumed the carry fits in 16
67C bits, possibly as carry<multiplier, this being true of a big calculation
68C done piece by piece.  But let's worry about that if/when mul_1c is
69C actually used.
70
71defframe(PARAM_MULTIPLIER,16)
72defframe(PARAM_SIZE,      12)
73defframe(PARAM_SRC,       8)
74defframe(PARAM_DST,       4)
75
76	TEXT
77
78	ALIGN(8)
79PROLOGUE(mpn_mul_1)
80deflit(`FRAME',0)
81
82	movl	PARAM_SIZE, %ecx
83	movl	PARAM_SRC, %edx
84
85	cmpl	$1, %ecx
86	jne	L(two_or_more)
87
88	C one limb only
89
90	movl	PARAM_MULTIPLIER, %eax
91	movl	PARAM_DST, %ecx
92
93	mull	(%edx)
94
95	movl	%eax, (%ecx)
96	movl	%edx, %eax
97
98	ret
99
100
101L(two_or_more):
102	C eax	size
103	C ebx
104	C ecx	carry
105	C edx
106	C esi	src
107	C edi
108	C ebp
109
110	pushl	%esi		FRAME_pushl()
111	pushl	%edi		FRAME_pushl()
112
113	movl	%edx, %esi		C src
114	movl	PARAM_DST, %edi
115
116	movl	PARAM_MULTIPLIER, %eax
117	pushl	%ebx		FRAME_pushl()
118
119	leal	(%esi,%ecx,4), %esi	C src end
120	leal	(%edi,%ecx,4), %edi	C dst end
121
122	negl	%ecx			C -size
123
124	pushl	%ebp		FRAME_pushl()
125	cmpl	$65536, %eax
126
127	jb	L(small)
128
129
130L(big):
131	xorl	%ebx, %ebx		C carry limb
132	sarl	%ecx			C -size/2
133
134	jnc	L(top)			C with carry flag clear
135
136
137	C size was odd, process one limb separately
138
139	mull	4(%esi,%ecx,8)		C m * src[0]
140
141	movl	%eax, 4(%edi,%ecx,8)
142	incl	%ecx
143
144	orl	%edx, %ebx		C carry limb, and clear carry flag
145
146
147L(top):
148	C eax
149	C ebx	carry
150	C ecx	counter, negative
151	C edx
152	C esi	src end
153	C edi	dst end
154	C ebp	(scratch carry)
155
156	adcl	$0, %ebx
157	movl	(%esi,%ecx,8), %eax
158
159	mull	PARAM_MULTIPLIER
160
161	movl	%edx, %ebp
162	addl	%eax, %ebx
163
164	adcl	$0, %ebp
165	movl	4(%esi,%ecx,8), %eax
166
167	mull	PARAM_MULTIPLIER
168
169	movl	%ebx, (%edi,%ecx,8)
170	addl	%ebp, %eax
171
172	movl	%eax, 4(%edi,%ecx,8)
173	incl	%ecx
174
175	movl	%edx, %ebx
176	jnz	L(top)
177
178
179	adcl	$0, %ebx
180	popl	%ebp
181
182	movl	%ebx, %eax
183	popl	%ebx
184
185	popl	%edi
186	popl	%esi
187
188	ret
189
190
191L(small):
192	C Special case for 16-bit multiplier.
193	C
194	C eax	multiplier
195	C ebx
196	C ecx	-size
197	C edx	src
198	C esi	src end
199	C edi	dst end
200	C ebp	multiplier
201
202	C size<3 not supported here.  At size==3 we're already a couple of
203	C cycles faster, so there's no threshold as such, just use the MMX
204	C as soon as possible.
205
206	cmpl	$-3, %ecx
207	ja	L(big)
208
209	movd	%eax, %mm7		C m
210	pxor	%mm6, %mm6		C initial carry word
211
212	punpcklwd %mm7, %mm7		C m replicated 2 times
213	addl	$2, %ecx		C -size+2
214
215	punpckldq %mm7, %mm7		C m replicated 4 times
216	andl	$4, %edx		C test alignment, clear carry flag
217
218	movq	%mm7, %mm0		C m
219	jz	L(small_entry)
220
221
222	C Source is unaligned, process one limb separately.
223	C
224	C Plain integer code is used here, since it's smaller and is about
225	C the same 13 cycles as an mmx block would be.
226	C
227	C An "addl $1,%ecx" doesn't clear the carry flag when size==3, hence
228	C the use of separate incl and orl.
229
230	mull	-8(%esi,%ecx,4)		C m * src[0]
231
232	movl	%eax, -8(%edi,%ecx,4)	C dst[0]
233	incl	%ecx			C one limb processed
234
235	movd	%edx, %mm6		C initial carry
236
237	orl	%eax, %eax		C clear carry flag
238	jmp	L(small_entry)
239
240
241C The scheduling here is quite tricky, since so many instructions have
242C pairing restrictions.  In particular the js won't pair with a movd, and
243C can't be paired with an adc since it wants flags from the inc, so
244C instructions are rotated to the top of the loop to find somewhere useful
245C for it.
246C
247C Trouble has been taken to avoid overlapping successive loop iterations,
248C since that would greatly increase the size of the startup and finishup
249C code.  Actually there's probably not much advantage to be had from
250C overlapping anyway, since the difficulties are mostly with pairing, not
251C with latencies as such.
252C
253C In the comments x represents the src data and m the multiplier (16
254C bits, but replicated 4 times).
255C
256C The m signs calculated in %mm3 are a loop invariant and could be held in
257C say %mm5, but that would save only one instruction and hence be no faster.
258
259L(small_top):
260	C eax	l.low, then l.high
261	C ebx	(h.low)
262	C ecx	counter, -size+2 to 0 or 1
263	C edx	(h.high)
264	C esi	&src[size]
265	C edi	&dst[size]
266	C ebp
267	C
268	C %mm0	(high products)
269	C %mm1	(low products)
270	C %mm2	(adjust for m using x signs)
271	C %mm3	(adjust for x using m signs)
272	C %mm4
273	C %mm5
274	C %mm6	h.low, then carry
275	C %mm7	m replicated 4 times
276
277	movd	%mm6, %ebx		C h.low
278	psrlq	$32, %mm1		C l.high
279
280	movd	%mm0, %edx		C h.high
281	movq	%mm0, %mm6		C new c
282
283	adcl	%eax, %ebx
284	incl	%ecx
285
286	movd	%mm1, %eax		C l.high
287	movq	%mm7, %mm0
288
289	adcl	%eax, %edx
290	movl	%ebx, -16(%edi,%ecx,4)
291
292	movl	%edx, -12(%edi,%ecx,4)
293	psrlq	$32, %mm6		C c
294
295L(small_entry):
296	pmulhw	-8(%esi,%ecx,4), %mm0	C h = (x*m).high
297	movq	%mm7, %mm1
298
299	pmullw	-8(%esi,%ecx,4), %mm1	C l = (x*m).low
300	movq	%mm7, %mm3
301
302	movq	-8(%esi,%ecx,4), %mm2	C x
303	psraw	$15, %mm3		C m signs
304
305	pand	-8(%esi,%ecx,4), %mm3	C x selected by m signs
306	psraw	$15, %mm2		C x signs
307
308	paddw	%mm3, %mm0		C add x to h if m neg
309	pand	%mm7, %mm2		C m selected by x signs
310
311	paddw	%mm2, %mm0		C add m to h if x neg
312	incl	%ecx
313
314	movd	%mm1, %eax		C l.low
315	punpcklwd %mm0, %mm6		C c + h.low << 16
316
317	psrlq	$16, %mm0		C h.high
318	js	L(small_top)
319
320
321
322
323	movd	%mm6, %ebx		C h.low
324	psrlq	$32, %mm1		C l.high
325
326	adcl	%eax, %ebx
327	popl	%ebp		FRAME_popl()
328
329	movd	%mm0, %edx		C h.high
330	psrlq	$32, %mm0		C l.high
331
332	movd	%mm1, %eax		C l.high
333
334	adcl	%eax, %edx
335	movl	%ebx, -12(%edi,%ecx,4)
336
337	movd	%mm0, %eax		C c
338
339	adcl	$0, %eax
340	movl	%edx, -8(%edi,%ecx,4)
341
342	orl	%ecx, %ecx
343	jnz	L(small_done)		C final %ecx==1 means even, ==0 odd
344
345
346	C Size odd, one extra limb to process.
347	C Plain integer code is used here, since it's smaller and is about
348	C the same speed as another mmx block would be.
349
350	movl	%eax, %ecx
351	movl	PARAM_MULTIPLIER, %eax
352
353	mull	-4(%esi)
354
355	addl	%ecx, %eax
356
357	adcl	$0, %edx
358	movl	%eax, -4(%edi)
359
360	movl	%edx, %eax
361L(small_done):
362	popl	%ebx
363
364	popl	%edi
365	popl	%esi
366
367	emms
368
369	ret
370
371EPILOGUE()
372