mul_basecase.asm revision 1.1.1.1
1dnl  x86 mpn_mul_basecase -- Multiply two limb vectors and store the result in
2dnl  a third limb vector.
3
4dnl  Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
5dnl
6dnl  Copyright 2011 Free Software Foundation, Inc.
7dnl
8dnl  This file is part of the GNU MP Library.
9dnl
10dnl  The GNU MP Library is free software; you can redistribute it and/or modify
11dnl  it under the terms of the GNU Lesser General Public License as published
12dnl  by the Free Software Foundation; either version 3 of the License, or (at
13dnl  your option) any later version.
14dnl
15dnl  The GNU MP Library is distributed in the hope that it will be useful, but
16dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
17dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
18dnl  License for more details.
19dnl
20dnl  You should have received a copy of the GNU Lesser General Public License
21dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
22
23include(`../config.m4')
24
25C TODO
26C  * Check if 'jmp N(%esp)' is well-predicted enough to allow us to combine the
27C    4 large loops into one; we could use it for the outer loop branch.
28C  * Optimise code outside of inner loops.
29C  * Write combined addmul_1 feed-in and wind-down code, and use it when
30C    iterating each outer loop.  ("Overlapping software pipelining")
31C  * Postpone push of ebx until we know vn > 1.  Perhaps use caller-saves regs
32C    for inlined mul_1, allowing us to postpone all pushes.
33C  * Perhaps write special code for vn <= un < M, for some small M.
34
35C void mpn_mul_basecase (mp_ptr rp,
36C                        mp_srcptr up, mp_size_t un,
37C                        mp_srcptr vp, mp_size_t vn);
38C
39
C Symbolic operand names used by the code below.  rp, up, un and vp are kept
C in registers.  vn is not: it names the stack slot 36(%esp), so it refers to
C the intended slot only while %esp is at the depth the function body
C establishes with its four register pushes.
40define(`rp',  `%edi')
41define(`up',  `%esi')
42define(`un',  `%ecx')
43define(`vp',  `%ebp')
44define(`vn',  `36(%esp)')
45
C Directives supplied by macros from config.m4 (presumably: switch to the
C code section and align the function entry to 16 bytes -- confirm against
C the macro definitions).
46	TEXT
47	ALIGN(16)
C mpn_mul_basecase -- multiply the un-limb vector at up by the vn-limb vector
C at vp and store the un+vn limb product at rp.  Limbs are 32 bits.
C
C Structure:
C  * First pass ("mul_1" style): rp[] = up[] * vp[0].  mm6 is a 64-bit
C    accumulator; after each limb its low half is stored and its high half is
C    shifted down to become the carry into the next limb.
C  * Each further limb of vp[] gets one outer-loop pass ("addmul_1" style)
C    that multiplies up[] by that limb and adds the result into rp[] one limb
C    position higher than the previous pass, then stores a new top limb.
C  * Both loops are unrolled 4x.  The code exists in four copies, selected
C    once by un mod 4; the copies differ only in where the unrolled bodies are
C    entered and in the constant address biases.
C
C Stack arguments are addressed relative to %esp after the four pushes below:
C 20=rp, 24=up, 28=un, 32=vp, 36=vn.  vn is decremented in place in its stack
C slot and serves as the outer-loop counter; un and up are reloaded from
C their slots at the start of every outer-loop pass.
C
C NOTE(review): up[0] and vp[0] are read unconditionally and vn is tested only
C after being decremented, so the code appears to require un >= 1 and
C vn >= 1; presumably callers also guarantee un >= vn -- confirm against the
C mpn interface documentation.
48PROLOGUE(mpn_mul_basecase)
49	push	%edi	C save the four registers overwritten below;
50	push	%esi	C they are restored in reverse order at L(done)
51	push	%ebx
52	push	%ebp
53	mov	20(%esp), rp	C load the first four stack arguments into registers
54	mov	24(%esp), up
55	mov	28(%esp), un
56	mov	32(%esp), vp	C the fifth argument (vn) stays in memory at 36(%esp)
57
58	movd	(up), %mm0	C mm0 = up[0]
59	movd	(vp), %mm7	C mm7 = vp[0], multiplier for the first pass
60	pmuludq	%mm7, %mm0	C mm0 = up[0] * vp[0], 64-bit product
61	pxor	%mm6, %mm6	C clear the accumulator
62
C Dispatch on un mod 4 to one of the four code copies.
63	mov	un, %eax
64	and	$3, %eax	C eax = un mod 4, ZF set if it is 0
65	jz	L(of0)	C un mod 4 == 0
66	cmp	$2, %eax
67	jc	L(of1)	C eax < 2 and nonzero: un mod 4 == 1
68	jz	L(of2)	C un mod 4 == 2
69
70C ================================================================
C un mod 4 == 3 (reached by falling through the dispatch above).
C First pass.  The loop is entered at L(m3) so that the first trip handles
C three limbs; every later trip starts at L(lm3) and handles four.
71	jmp	L(m3)
72	ALIGN(16)
73L(lm3):	movd	-4(up), %mm0
74	pmuludq	%mm7, %mm0
75	psrlq	$32, %mm6	C high half of accumulator becomes the carry
76	lea	16(rp), rp
77	paddq	%mm0, %mm6
78	movd	(up), %mm0
79	pmuludq	%mm7, %mm0
80	movd	%mm6, -4(rp)	C store low half as a result limb
81	psrlq	$32, %mm6
82L(m3):	paddq	%mm0, %mm6	C on entry mm0 holds up[0]*vp[0] and mm6 is zero
83	movd	4(up), %mm0
84	pmuludq	%mm7, %mm0
85	movd	%mm6, (rp)
86	psrlq	$32, %mm6
87	paddq	%mm0, %mm6
88	movd	8(up), %mm0
89	pmuludq	%mm7, %mm0
90	movd	%mm6, 4(rp)
91	psrlq	$32, %mm6
92	paddq	%mm0, %mm6
93	sub	$4, un	C sets the flags tested by ja below
94	movd	%mm6, 8(rp)	C movd and lea leave the flags from sub intact
95	lea	16(up), up
96	ja	L(lm3)	C loop while the subtraction left un > 0 with no borrow
97
98	psrlq	$32, %mm6
99	movd	%mm6, 12(rp)	C store the final carry as the top limb of this pass
100
101	decl	vn	C one limb of vp[] consumed (decrements the stack slot)
102	jz	L(done)	C vn was 1: the first pass is the whole product
103	lea	-8(rp), rp	C bias rp for the address arithmetic in L(ol3)
104
C Outer loop: one pass per remaining limb of vp[].
105L(ol3):	mov	28(%esp), un	C reload un from its argument slot
106	neg	un	C un = -un
107	lea	4(vp), vp	C step to the next limb of vp[]
108	movd	(vp), %mm7	C read next V limb
109	mov	24(%esp), up	C restart at the beginning of up[]
110	lea	16(rp,un,4), rp	C rp += 16 - 4*(limb count), un being negative here
111
112	movd	(up), %mm0
113	pmuludq	%mm7, %mm0	C mm0 = up[0] * v
114	sar	$2, un	C un = -ceil(count/4), counted up to zero by inc
115	movd	4(up), %mm1
116	movd	%mm0, %ebx	C ebx = low half of up[0] * v
117	pmuludq	%mm7, %mm1	C mm1 = up[1] * v
118	lea	-8(up), up	C bias up for the offsets used from L(a3) onwards
119	xor	%edx, %edx	C zero edx and CF
120	jmp	L(a3)	C enter mid-body: first trip does one in-loop add
121
C Inner loop, four limbs per full trip.  mm0 and mm1 alternately hold the
C 64-bit product of a limb of up[] with v (mm7); ebx and eax alternately
C receive its low half and edx its high half.  Pattern for each limb:
C   adc %edx, lo      lo += high half of previous product + CF
C   movd/psrlq        edx = high half of this product
C   adc $0, %edx      fold the carry of the adc above into edx
C   add lo, N(rp)     add into the result limb; its carry out feeds the
C                     next "adc %edx, lo"
C The MMX instructions, lea and inc do not modify CF, so the carry chain is
C preserved across them and across the loop branch.
122L(la3):	movd	4(up), %mm1
123	adc	$0, %edx
124	add	%eax, 12(rp)
125	movd	%mm0, %ebx
126	pmuludq	%mm7, %mm1
127	lea	16(rp), rp
128	psrlq	$32, %mm0
129	adc	%edx, %ebx
130	movd	%mm0, %edx
131	movd	%mm1, %eax
132	movd	8(up), %mm0
133	pmuludq	%mm7, %mm0
134	adc	$0, %edx
135	add	%ebx, (rp)
136	psrlq	$32, %mm1
137	adc	%edx, %eax
138	movd	%mm1, %edx
139	movd	%mm0, %ebx
140	movd	12(up), %mm1
141	pmuludq	%mm7, %mm1
142	adc	$0, %edx
143	add	%eax, 4(rp)
144L(a3):	psrlq	$32, %mm0
145	adc	%edx, %ebx
146	movd	%mm0, %edx
147	movd	%mm1, %eax
148	lea	16(up), up
149	movd	(up), %mm0
150	adc	$0, %edx
151	add	%ebx, 8(rp)
152	psrlq	$32, %mm1
153	adc	%edx, %eax
154	movd	%mm1, %edx
155	pmuludq	%mm7, %mm0
156	inc	un	C inc leaves CF untouched; ZF ends the loop
157	jnz	L(la3)
158
C Wind-down: the last two adds into rp[] and the store of the new top limb.
159	adc	un, %edx	C un is zero here
160	add	%eax, 12(rp)
161	movd	%mm0, %ebx	C low half of the last product
162	psrlq	$32, %mm0
163	adc	%edx, %ebx
164	movd	%mm0, %eax	C high half of the last product
165	adc	un, %eax	C add carry only (un == 0)
166	add	%ebx, 16(rp)
167	adc	un, %eax	C add the carry out of the memory add
168	mov	%eax, 20(rp)	C stored, not added: new most significant limb
169
170	decl	vn
171	jnz	L(ol3)	C more limbs of vp[] to process
172	jmp	L(done)
173
174C ================================================================
C un mod 4 == 0.
C First pass, entered at L(of0): every trip, including the first, handles
C four limbs.
175	ALIGN(16)
176L(lm0):	movd	(up), %mm0
177	pmuludq	%mm7, %mm0
178	psrlq	$32, %mm6	C high half of accumulator becomes the carry
179	lea	16(rp), rp
180L(of0):	paddq	%mm0, %mm6	C on entry mm0 holds up[0]*vp[0] and mm6 is zero
181	movd	4(up), %mm0
182	pmuludq	%mm7, %mm0
183	movd	%mm6, (rp)	C store low half as a result limb
184	psrlq	$32, %mm6
185	paddq	%mm0, %mm6
186	movd	8(up), %mm0
187	pmuludq	%mm7, %mm0
188	movd	%mm6, 4(rp)
189	psrlq	$32, %mm6
190	paddq	%mm0, %mm6
191	movd	12(up), %mm0
192	pmuludq	%mm7, %mm0
193	movd	%mm6, 8(rp)
194	psrlq	$32, %mm6
195	paddq	%mm0, %mm6
196	sub	$4, un	C sets the flags tested by ja below
197	movd	%mm6, 12(rp)
198	lea	16(up), up
199	ja	L(lm0)	C loop while limbs remain
200
201	psrlq	$32, %mm6
202	movd	%mm6, 16(rp)	C store the final carry as the top limb of this pass
203
204	decl	vn	C one limb of vp[] consumed (decrements the stack slot)
205	jz	L(done)	C vn was 1: the first pass is the whole product
206	lea	-4(rp), rp	C bias rp for the address arithmetic in L(ol0)
207
C Outer loop: one pass per remaining limb of vp[].
208L(ol0):	mov	28(%esp), un	C reload un from its argument slot
209	neg	un	C un = -un
210	lea	4(vp), vp	C step to the next limb of vp[]
211	movd	(vp), %mm7	C read next V limb
212	mov	24(%esp), up	C restart at the beginning of up[]
213	lea	20(rp,un,4), rp	C rp += 20 - 4*(limb count), un being negative here
214
215	movd	(up), %mm1
216	pmuludq	%mm7, %mm1	C mm1 = up[0] * v
217	sar	$2, un	C un = -(count/4), counted up to zero by inc
218	movd	4(up), %mm0
219	lea	-4(up), up	C bias up for the offsets used from L(a0) onwards
220	movd	%mm1, %eax	C eax = low half of up[0] * v
221	pmuludq	%mm7, %mm0	C mm0 = up[1] * v
222	xor	%edx, %edx	C zero edx and CF
223	jmp	L(a0)	C enter mid-body: first trip does two in-loop adds
224
C Inner loop: same unrolled body and carry handling as the other three
C copies; only the entry label differs.
225L(la0):	movd	4(up), %mm1
226	adc	$0, %edx
227	add	%eax, 12(rp)
228	movd	%mm0, %ebx
229	pmuludq	%mm7, %mm1
230	lea	16(rp), rp
231	psrlq	$32, %mm0
232	adc	%edx, %ebx
233	movd	%mm0, %edx
234	movd	%mm1, %eax
235	movd	8(up), %mm0
236	pmuludq	%mm7, %mm0
237	adc	$0, %edx
238	add	%ebx, (rp)
239L(a0):	psrlq	$32, %mm1
240	adc	%edx, %eax
241	movd	%mm1, %edx
242	movd	%mm0, %ebx
243	movd	12(up), %mm1
244	pmuludq	%mm7, %mm1
245	adc	$0, %edx
246	add	%eax, 4(rp)
247	psrlq	$32, %mm0
248	adc	%edx, %ebx
249	movd	%mm0, %edx
250	movd	%mm1, %eax
251	lea	16(up), up
252	movd	(up), %mm0
253	adc	$0, %edx
254	add	%ebx, 8(rp)
255	psrlq	$32, %mm1
256	adc	%edx, %eax
257	movd	%mm1, %edx
258	pmuludq	%mm7, %mm0
259	inc	un	C inc leaves CF untouched; ZF ends the loop
260	jnz	L(la0)
261
C Wind-down: the last two adds into rp[] and the store of the new top limb.
262	adc	un, %edx	C un is zero here
263	add	%eax, 12(rp)
264	movd	%mm0, %ebx	C low half of the last product
265	psrlq	$32, %mm0
266	adc	%edx, %ebx
267	movd	%mm0, %eax	C high half of the last product
268	adc	un, %eax	C add carry only (un == 0)
269	add	%ebx, 16(rp)
270	adc	un, %eax	C add the carry out of the memory add
271	mov	%eax, 20(rp)	C stored, not added: new most significant limb
272
273	decl	vn
274	jnz	L(ol0)	C more limbs of vp[] to process
275	jmp	L(done)
276
277C ================================================================
C un mod 4 == 1.
C First pass, entered at L(of1) so that the first trip handles one limb;
C every later trip starts at L(lm1) and handles four.
278	ALIGN(16)
279L(lm1):	movd	-12(up), %mm0
280	pmuludq	%mm7, %mm0
281	psrlq	$32, %mm6	C high half of accumulator becomes the carry
282	lea	16(rp), rp
283	paddq	%mm0, %mm6
284	movd	-8(up), %mm0
285	pmuludq	%mm7, %mm0
286	movd	%mm6, -12(rp)	C store low half as a result limb
287	psrlq	$32, %mm6
288	paddq	%mm0, %mm6
289	movd	-4(up), %mm0
290	pmuludq	%mm7, %mm0
291	movd	%mm6, -8(rp)
292	psrlq	$32, %mm6
293	paddq	%mm0, %mm6
294	movd	(up), %mm0
295	pmuludq	%mm7, %mm0
296	movd	%mm6, -4(rp)
297	psrlq	$32, %mm6
298L(of1):	paddq	%mm0, %mm6	C on entry mm0 holds up[0]*vp[0] and mm6 is zero
299	sub	$4, un	C sets the flags tested by ja below
300	movd	%mm6, (rp)
301	lea	16(up), up
302	ja	L(lm1)	C loop while limbs remain
303
304	psrlq	$32, %mm6
305	movd	%mm6, 4(rp)	C store the final carry as the top limb of this pass
306
307	decl	vn	C one limb of vp[] consumed (decrements the stack slot)
308	jz	L(done)	C vn was 1: the first pass is the whole product
309	lea	-16(rp), rp	C bias rp for the address arithmetic in L(ol1)
310
C Outer loop: one pass per remaining limb of vp[].
311L(ol1):	mov	28(%esp), un	C reload un from its argument slot
312	neg	un	C un = -un
313	lea	4(vp), vp	C step to the next limb of vp[]
314	movd	(vp), %mm7	C read next V limb
315	mov	24(%esp), up	C restart at the beginning of up[]
316	lea	24(rp,un,4), rp	C rp += 24 - 4*(limb count), un being negative here
317
318	movd	(up), %mm0
319	pmuludq	%mm7, %mm0	C mm0 = up[0] * v
320	sar	$2, un	C un = -ceil(count/4)
321	movd	%mm0, %ebx	C ebx = low half of up[0] * v
322	movd	4(up), %mm1
323	pmuludq	%mm7, %mm1	C mm1 = up[1] * v
324	xor	%edx, %edx	C zero edx and CF
325	inc	un	C pre-adjust the trip counter; inc keeps CF clear
326	jmp	L(a1)	C enter mid-body: first trip does three in-loop adds
327
C Inner loop: same unrolled body and carry handling as the other three
C copies; only the entry label differs.
328L(la1):	movd	4(up), %mm1
329	adc	$0, %edx
330	add	%eax, 12(rp)
331	movd	%mm0, %ebx
332	pmuludq	%mm7, %mm1
333	lea	16(rp), rp
334L(a1):	psrlq	$32, %mm0
335	adc	%edx, %ebx
336	movd	%mm0, %edx
337	movd	%mm1, %eax
338	movd	8(up), %mm0
339	pmuludq	%mm7, %mm0
340	adc	$0, %edx
341	add	%ebx, (rp)
342	psrlq	$32, %mm1
343	adc	%edx, %eax
344	movd	%mm1, %edx
345	movd	%mm0, %ebx
346	movd	12(up), %mm1
347	pmuludq	%mm7, %mm1
348	adc	$0, %edx
349	add	%eax, 4(rp)
350	psrlq	$32, %mm0
351	adc	%edx, %ebx
352	movd	%mm0, %edx
353	movd	%mm1, %eax
354	lea	16(up), up
355	movd	(up), %mm0
356	adc	$0, %edx
357	add	%ebx, 8(rp)
358	psrlq	$32, %mm1
359	adc	%edx, %eax
360	movd	%mm1, %edx
361	pmuludq	%mm7, %mm0
362	inc	un	C inc leaves CF untouched; ZF ends the loop
363	jnz	L(la1)
364
C Wind-down: the last two adds into rp[] and the store of the new top limb.
365	adc	un, %edx	C un is zero here
366	add	%eax, 12(rp)
367	movd	%mm0, %ebx	C low half of the last product
368	psrlq	$32, %mm0
369	adc	%edx, %ebx
370	movd	%mm0, %eax	C high half of the last product
371	adc	un, %eax	C add carry only (un == 0)
372	add	%ebx, 16(rp)
373	adc	un, %eax	C add the carry out of the memory add
374	mov	%eax, 20(rp)	C stored, not added: new most significant limb
375
376	decl	vn
377	jnz	L(ol1)	C more limbs of vp[] to process
378	jmp	L(done)
379
380C ================================================================
C un mod 4 == 2.
C First pass, entered at L(of2) so that the first trip handles two limbs;
C every later trip starts at L(lm2) and handles four.
381	ALIGN(16)
382L(lm2):	movd	-8(up), %mm0
383	pmuludq	%mm7, %mm0
384	psrlq	$32, %mm6	C high half of accumulator becomes the carry
385	lea	16(rp), rp
386	paddq	%mm0, %mm6
387	movd	-4(up), %mm0
388	pmuludq	%mm7, %mm0
389	movd	%mm6, -8(rp)	C store low half as a result limb
390	psrlq	$32, %mm6
391	paddq	%mm0, %mm6
392	movd	(up), %mm0
393	pmuludq	%mm7, %mm0
394	movd	%mm6, -4(rp)
395	psrlq	$32, %mm6
396L(of2):	paddq	%mm0, %mm6	C on entry mm0 holds up[0]*vp[0] and mm6 is zero
397	movd	4(up), %mm0
398	pmuludq	%mm7, %mm0
399	movd	%mm6, (rp)
400	psrlq	$32, %mm6
401	paddq	%mm0, %mm6
402	sub	$4, un	C sets the flags tested by ja below
403	movd	%mm6, 4(rp)
404	lea	16(up), up
405	ja	L(lm2)	C loop while limbs remain
406
407	psrlq	$32, %mm6
408	movd	%mm6, 8(rp)	C store the final carry as the top limb of this pass
409
410	decl	vn	C one limb of vp[] consumed (decrements the stack slot)
411	jz	L(done)	C vn was 1: the first pass is the whole product
412	lea	-12(rp), rp	C bias rp for the address arithmetic in L(ol2)
413
C Outer loop: one pass per remaining limb of vp[].
414L(ol2):	mov	28(%esp), un	C reload un from its argument slot
415	neg	un	C un = -un
416	lea	4(vp), vp	C step to the next limb of vp[]
417	movd	(vp), %mm7	C read next V limb
418	mov	24(%esp), up	C restart at the beginning of up[]
419	lea	12(rp,un,4), rp	C rp += 12 - 4*(limb count), un being negative here
420
421	movd	(up), %mm1
422	pmuludq	%mm7, %mm1	C mm1 = up[0] * v
423	sar	$2, un	C un = -ceil(count/4), counted up to zero by inc
424	movd	4(up), %mm0	C mm0 = up[1]; multiplied by v at L(lo2)
425	lea	4(up), up	C bias up for the offsets used from L(la2) onwards
426	movd	%mm1, %eax	C eax = low half of up[0] * v
427	xor	%edx, %edx	C zero edx and CF
428	jmp	L(lo2)	C enter just before the loop branch: no in-loop add on the first trip
429
C Inner loop: same unrolled body and carry handling as the other three
C copies; only the entry label differs.
430L(la2):	movd	4(up), %mm1
431	adc	$0, %edx
432	add	%eax, 12(rp)
433	movd	%mm0, %ebx
434	pmuludq	%mm7, %mm1
435	lea	16(rp), rp
436	psrlq	$32, %mm0
437	adc	%edx, %ebx
438	movd	%mm0, %edx
439	movd	%mm1, %eax
440	movd	8(up), %mm0
441	pmuludq	%mm7, %mm0
442	adc	$0, %edx
443	add	%ebx, (rp)
444	psrlq	$32, %mm1
445	adc	%edx, %eax
446	movd	%mm1, %edx
447	movd	%mm0, %ebx
448	movd	12(up), %mm1
449	pmuludq	%mm7, %mm1
450	adc	$0, %edx
451	add	%eax, 4(rp)
452	psrlq	$32, %mm0
453	adc	%edx, %ebx
454	movd	%mm0, %edx
455	movd	%mm1, %eax
456	lea	16(up), up
457	movd	(up), %mm0
458	adc	$0, %edx
459	add	%ebx, 8(rp)
460L(lo2):	psrlq	$32, %mm1
461	adc	%edx, %eax
462	movd	%mm1, %edx
463	pmuludq	%mm7, %mm0
464	inc	un	C inc leaves CF untouched; ZF ends the loop
465	jnz	L(la2)
466
C Wind-down: the last two adds into rp[] and the store of the new top limb.
467	adc	un, %edx	C un is zero here
468	add	%eax, 12(rp)
469	movd	%mm0, %ebx	C low half of the last product
470	psrlq	$32, %mm0
471	adc	%edx, %ebx
472	movd	%mm0, %eax	C high half of the last product
473	adc	un, %eax	C add carry only (un == 0)
474	add	%ebx, 16(rp)
475	adc	un, %eax	C add the carry out of the memory add
476	mov	%eax, 20(rp)	C stored, not added: new most significant limb
477
478	decl	vn
479	jnz	L(ol2)	C more limbs of vp[] to process
C The jump below is commented out: this copy falls through into L(done).
480C	jmp	L(done)
481
482C ================================================================
C Common exit for all four copies.
483L(done):
484	emms	C reset the MMX state before returning
485	pop	%ebp	C restore registers in reverse order of the pushes
486	pop	%ebx
487	pop	%esi
488	pop	%edi
489	ret
490EPILOGUE()
491