1dnl  Intel Pentium MMX mpn_mul_1 -- mpn by limb multiplication.
2
3dnl  Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
4dnl
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or
8dnl  modify it under the terms of the GNU Lesser General Public License as
9dnl  published by the Free Software Foundation; either version 3 of the
10dnl  License, or (at your option) any later version.
11dnl
12dnl  The GNU MP Library is distributed in the hope that it will be useful,
13dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
14dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15dnl  Lesser General Public License for more details.
16dnl
17dnl  You should have received a copy of the GNU Lesser General Public License
18dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
20include(`../config.m4')
21
22
23C    cycles/limb
24C P5:   12.0   for 32-bit multiplier
25C        7.0   for 16-bit multiplier
26
27
28C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
29C                      mp_limb_t multiplier);
30C
31C When the multiplier is 16 bits some special case MMX code is used.  Small
32C multipliers might arise reasonably often from mpz_mul_ui etc.  If the size
33C is odd there's roughly a 5 cycle penalty, so times for say size==7 and
34C size==8 end up being quite close.  If src isn't aligned to an 8 byte
35C boundary then one limb is processed separately with roughly a 5 cycle
36C penalty, so in that case it's say size==8 and size==9 which are close.
37C
38C Alternatives:
39C
40C MMX is not believed to be of any use for 32-bit multipliers, since for
41C instance the current method would just have to be more or less duplicated
42C for the high and low halves of the multiplier, and would probably
43C therefore run at about 14 cycles, which is slower than the plain integer
44C at 12.
45C
46C Adding the high and low MMX products using integer code seems best.  An
47C attempt at using paddd and carry bit propagation with pcmpgtd didn't give
48C any joy.  Perhaps something could be done keeping the values signed and
49C thereby avoiding adjustments to make pcmpgtd into an unsigned compare, or
50C perhaps not.
51C
52C Future:
53C
54C An mpn_mul_1c entrypoint would need a double carry out of the low result
55C limb in the 16-bit code, unless it could be assumed the carry fits in 16
56C bits, possibly as carry<multiplier, this being true of a big calculation
57C done piece by piece.  But let's worry about that if/when mul_1c is
58C actually used.
59
60defframe(PARAM_MULTIPLIER,16)
61defframe(PARAM_SIZE,      12)
62defframe(PARAM_SRC,       8)
63defframe(PARAM_DST,       4)
64
65	TEXT
66
67	ALIGN(8)
68PROLOGUE(mpn_mul_1)
69deflit(`FRAME',0)
70
71	movl	PARAM_SIZE, %ecx
72	movl	PARAM_SRC, %edx
73
74	cmpl	$1, %ecx
75	jne	L(two_or_more)
76
77	C one limb only
78
79	movl	PARAM_MULTIPLIER, %eax
80	movl	PARAM_DST, %ecx
81
82	mull	(%edx)
83
84	movl	%eax, (%ecx)
85	movl	%edx, %eax
86
87	ret
88
89
90L(two_or_more):
91	C eax	size
92	C ebx
93	C ecx	carry
94	C edx
95	C esi	src
96	C edi
97	C ebp
98
99	pushl	%esi		FRAME_pushl()
100	pushl	%edi		FRAME_pushl()
101
102	movl	%edx, %esi		C src
103	movl	PARAM_DST, %edi
104
105	movl	PARAM_MULTIPLIER, %eax
106	pushl	%ebx		FRAME_pushl()
107
108	leal	(%esi,%ecx,4), %esi	C src end
109	leal	(%edi,%ecx,4), %edi	C dst end
110
111	negl	%ecx			C -size
112
113	pushl	%ebp		FRAME_pushl()
114	cmpl	$65536, %eax
115
116	jb	L(small)
117
118
119L(big):
120	xorl	%ebx, %ebx		C carry limb
121	sarl	%ecx			C -size/2
122
123	jnc	L(top)			C with carry flag clear
124
125
126	C size was odd, process one limb separately
127
128	mull	4(%esi,%ecx,8)		C m * src[0]
129
130	movl	%eax, 4(%edi,%ecx,8)
131	incl	%ecx
132
133	orl	%edx, %ebx		C carry limb, and clear carry flag
134
135
136L(top):
137	C eax
138	C ebx	carry
139	C ecx	counter, negative
140	C edx
141	C esi	src end
142	C edi	dst end
143	C ebp	(scratch carry)
144
145	adcl	$0, %ebx
146	movl	(%esi,%ecx,8), %eax
147
148	mull	PARAM_MULTIPLIER
149
150	movl	%edx, %ebp
151	addl	%eax, %ebx
152
153	adcl	$0, %ebp
154	movl	4(%esi,%ecx,8), %eax
155
156	mull	PARAM_MULTIPLIER
157
158	movl	%ebx, (%edi,%ecx,8)
159	addl	%ebp, %eax
160
161	movl	%eax, 4(%edi,%ecx,8)
162	incl	%ecx
163
164	movl	%edx, %ebx
165	jnz	L(top)
166
167
168	adcl	$0, %ebx
169	popl	%ebp
170
171	movl	%ebx, %eax
172	popl	%ebx
173
174	popl	%edi
175	popl	%esi
176
177	ret
178
179
180L(small):
181	C Special case for 16-bit multiplier.
182	C
183	C eax	multiplier
184	C ebx
185	C ecx	-size
186	C edx	src
187	C esi	src end
188	C edi	dst end
189	C ebp	multiplier
190
191	C size<3 not supported here.  At size==3 we're already a couple of
192	C cycles faster, so there's no threshold as such, just use the MMX
193	C as soon as possible.
194
195	cmpl	$-3, %ecx
196	ja	L(big)
197
198	movd	%eax, %mm7		C m
199	pxor	%mm6, %mm6		C initial carry word
200
201	punpcklwd %mm7, %mm7		C m replicated 2 times
202	addl	$2, %ecx		C -size+2
203
204	punpckldq %mm7, %mm7		C m replicated 4 times
205	andl	$4, %edx		C test alignment, clear carry flag
206
207	movq	%mm7, %mm0		C m
208	jz	L(small_entry)
209
210
211	C Source is unaligned, process one limb separately.
212	C
213	C Plain integer code is used here, since it's smaller and is about
214	C the same 13 cycles as an mmx block would be.
215	C
216	C An "addl $1,%ecx" doesn't clear the carry flag when size==3, hence
217	C the use of separate incl and orl.
218
219	mull	-8(%esi,%ecx,4)		C m * src[0]
220
221	movl	%eax, -8(%edi,%ecx,4)	C dst[0]
222	incl	%ecx			C one limb processed
223
224	movd	%edx, %mm6		C initial carry
225
226	orl	%eax, %eax		C clear carry flag
227	jmp	L(small_entry)
228
229
230C The scheduling here is quite tricky, since so many instructions have
231C pairing restrictions.  In particular the js won't pair with a movd, and
232C can't be paired with an adc since it wants flags from the inc, so
233C instructions are rotated to the top of the loop to find somewhere useful
234C for it.
235C
236C Trouble has been taken to avoid overlapping successive loop iterations,
237C since that would greatly increase the size of the startup and finishup
238C code.  Actually there's probably not much advantage to be had from
239C overlapping anyway, since the difficulties are mostly with pairing, not
240C with latencies as such.
241C
242C In the comments x represents the src data and m the multiplier (16
243C bits, but replicated 4 times).
244C
245C The m signs calculated in %mm3 are a loop invariant and could be held in
246C say %mm5, but that would save only one instruction and hence be no faster.
247
248L(small_top):
249	C eax	l.low, then l.high
250	C ebx	(h.low)
251	C ecx	counter, -size+2 to 0 or 1
252	C edx	(h.high)
253	C esi	&src[size]
254	C edi	&dst[size]
255	C ebp
256	C
257	C %mm0	(high products)
258	C %mm1	(low products)
259	C %mm2	(adjust for m using x signs)
260	C %mm3	(adjust for x using m signs)
261	C %mm4
262	C %mm5
263	C %mm6	h.low, then carry
264	C %mm7	m replicated 4 times
265
266	movd	%mm6, %ebx		C h.low
267	psrlq	$32, %mm1		C l.high
268
269	movd	%mm0, %edx		C h.high
270	movq	%mm0, %mm6		C new c
271
272	adcl	%eax, %ebx
273	incl	%ecx
274
275	movd	%mm1, %eax		C l.high
276	movq	%mm7, %mm0
277
278	adcl	%eax, %edx
279	movl	%ebx, -16(%edi,%ecx,4)
280
281	movl	%edx, -12(%edi,%ecx,4)
282	psrlq	$32, %mm6		C c
283
284L(small_entry):
285	pmulhw	-8(%esi,%ecx,4), %mm0	C h = (x*m).high
286	movq	%mm7, %mm1
287
288	pmullw	-8(%esi,%ecx,4), %mm1	C l = (x*m).low
289	movq	%mm7, %mm3
290
291	movq	-8(%esi,%ecx,4), %mm2	C x
292	psraw	$15, %mm3		C m signs
293
294	pand	-8(%esi,%ecx,4), %mm3	C x selected by m signs
295	psraw	$15, %mm2		C x signs
296
297	paddw	%mm3, %mm0		C add x to h if m neg
298	pand	%mm7, %mm2		C m selected by x signs
299
300	paddw	%mm2, %mm0		C add m to h if x neg
301	incl	%ecx
302
303	movd	%mm1, %eax		C l.low
304	punpcklwd %mm0, %mm6		C c + h.low << 16
305
306	psrlq	$16, %mm0		C h.high
307	js	L(small_top)
308
309
310
311
312	movd	%mm6, %ebx		C h.low
313	psrlq	$32, %mm1		C l.high
314
315	adcl	%eax, %ebx
316	popl	%ebp		FRAME_popl()
317
318	movd	%mm0, %edx		C h.high
319	psrlq	$32, %mm0		C l.high
320
321	movd	%mm1, %eax		C l.high
322
323	adcl	%eax, %edx
324	movl	%ebx, -12(%edi,%ecx,4)
325
326	movd	%mm0, %eax		C c
327
328	adcl	$0, %eax
329	movl	%edx, -8(%edi,%ecx,4)
330
331	orl	%ecx, %ecx
332	jnz	L(small_done)		C final %ecx==1 means even, ==0 odd
333
334
335	C Size odd, one extra limb to process.
336	C Plain integer code is used here, since it's smaller and is about
337	C the same speed as another mmx block would be.
338
339	movl	%eax, %ecx
340	movl	PARAM_MULTIPLIER, %eax
341
342	mull	-4(%esi)
343
344	addl	%ecx, %eax
345
346	adcl	$0, %edx
347	movl	%eax, -4(%edi)
348
349	movl	%edx, %eax
350L(small_done):
351	popl	%ebx
352
353	popl	%edi
354	popl	%esi
355
356	emms
357
358	ret
359
360EPILOGUE()
361