1dnl  x86 mpn_mul_basecase -- Multiply two limb vectors and store the result
2dnl  in a third limb vector.
3
4dnl  Copyright 1996-2002 Free Software Foundation, Inc.
5
6dnl  This file is part of the GNU MP Library.
7dnl
8dnl  The GNU MP Library is free software; you can redistribute it and/or modify
9dnl  it under the terms of either:
10dnl
11dnl    * the GNU Lesser General Public License as published by the Free
12dnl      Software Foundation; either version 3 of the License, or (at your
13dnl      option) any later version.
14dnl
15dnl  or
16dnl
17dnl    * the GNU General Public License as published by the Free Software
18dnl      Foundation; either version 2 of the License, or (at your option) any
19dnl      later version.
20dnl
21dnl  or both in parallel, as here.
22dnl
23dnl  The GNU MP Library is distributed in the hope that it will be useful, but
24dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
25dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
26dnl  for more details.
27dnl
28dnl  You should have received copies of the GNU General Public License and the
29dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
30dnl  see https://www.gnu.org/licenses/.
31
32include(`../config.m4')
33
34
35C     cycles/crossproduct
36C P5	  15
37C P6	   7.5
38C K6	  12.5
39C K7	   5.5
40C P4	  24
41
42
43C void mpn_mul_basecase (mp_ptr wp,
44C                        mp_srcptr xp, mp_size_t xsize,
45C                        mp_srcptr yp, mp_size_t ysize);
46C
C This was written in haste, since the Pentium-optimized code that was used
C for all x86 machines was slow for the Pentium II.  This code would benefit
C from some cleanup.
50C
C To shave off some percentage of the run-time, one should make 4 variants
C of the L(outer) loop, for the four different outcomes of xsize mod 4.  That
C would avoid L(oop0) altogether.  Code expansion would be > 4-fold for that
C part of the function, but since it is not very large, that would be
C acceptable.
56C
C The mul loop (at L(oopM)) might need some tweaking.  Its current speed is
C unknown.
59
C Stack frame for mpn_mul_basecase.
C
C Incoming arguments, in prototype order wp, xp, xsize, yp, ysize:
defframe(PARAM_YSIZE,20)
defframe(PARAM_YP,   16)
defframe(PARAM_XSIZE,12)
defframe(PARAM_XP,   8)
defframe(PARAM_WP,   4)

C Locals, allocated by the prologue:
C   VAR_MULTIPLIER   the y limb used as multiplier for the current row
C   VAR_COUNTER      number of y limbs still to be processed
C VAR_STACK_SPACE is the byte size of that local area.  The prologue and both
C epilogues use it, so allocation and release cannot drift apart.
defframe(VAR_MULTIPLIER, -4)
defframe(VAR_COUNTER,    -8)
deflit(VAR_STACK_SPACE,  8)

	TEXT
	ALIGN(8)

C mpn_mul_basecase (wp, xp, xsize, yp, ysize)
C
C Schoolbook multiplication of the xsize-limb vector at xp by the ysize-limb
C vector at yp.  The first row (x times yp[0]) is stored to wp[0..xsize];
C every further row multiplies x by the next y limb, adds the products into
C wp starting one limb higher than the previous row, and stores one new top
C limb.  In total xsize+ysize limbs of wp are written.
C
C Assumes xsize >= ysize >= 1; the xsize == 1 early exit depends on it.
C
C Register use:
C   %esi       read pointer into x
C   %edi       write pointer into w
C   %ebp       yp during the first row; second carry limb inside L(oopX)
C   %ebx       carry limb handed from one limb position to the next
C   %ecx       loop counter
C   %edx:%eax  product delivered by mull
C %esi, %ebp, %edi and %ebx are pushed on entry and popped before returning.

PROLOGUE(mpn_mul_basecase)
deflit(`FRAME',0)

	subl	$VAR_STACK_SPACE,%esp	C room for VAR_MULTIPLIER, VAR_COUNTER
	pushl	%esi
	pushl	%ebp
	pushl	%edi
deflit(`FRAME',eval(VAR_STACK_SPACE+12))

	movl	PARAM_XP,%esi
	movl	PARAM_WP,%edi
	movl	PARAM_YP,%ebp

	movl	(%esi),%eax		C load xp[0]
	mull	(%ebp)			C multiply by yp[0]
	movl	%eax,(%edi)		C store to wp[0]
	movl	PARAM_XSIZE,%ecx	C xsize
	decl	%ecx			C If xsize = 1, ysize = 1 too
	jz	L(done)			C 1x1: only the high limb is left

	C %ebx is needed as carry limb from here on; L(done) above is
	C reached before this push and therefore does not pop it.
	pushl	%ebx
FRAME_pushl()
	movl	%edx,%ebx		C high limb of xp[0]*yp[0] is the carry

	leal	4(%esi),%esi		C step to xp[1]
	leal	4(%edi),%edi		C step to wp[1]

	C First row: wp[j] = low(xp[j]*yp[0]) + carry, for j = 1..xsize-1.
	C %ecx = limbs of x still to do.
L(oopM):
	movl	(%esi),%eax		C load next limb at xp[j]
	leal	4(%esi),%esi
	mull	(%ebp)
	addl	%ebx,%eax		C low product + carry in
	movl	%edx,%ebx		C mov leaves CF from the addl intact
	adcl	$0,%ebx			C carry out = high product + CF
	movl	%eax,(%edi)
	leal	4(%edi),%edi
	decl	%ecx
	jnz	L(oopM)

	movl	%ebx,(%edi)		C most significant limb of product
	addl	$4,%edi			C increment wp
	C Both pointers have advanced by xsize limbs (wp by one more); step
	C back xsize limbs so %esi = xp and %edi = wp+1 limb for the next row.
	movl	PARAM_XSIZE,%eax
	shll	$2,%eax			C xsize in bytes
	subl	%eax,%edi
	subl	%eax,%esi

	movl	PARAM_YSIZE,%eax	C ysize
	decl	%eax
	jz	L(skip)			C ysize = 1: the first row was everything
	movl	%eax,VAR_COUNTER	C rows still to do = ysize-1

	C One iteration per remaining y limb: wp row += x * y limb.
L(outer):
	movl	PARAM_YP,%ebp		C yp
	addl	$4,%ebp			C make ebp point to next y limb
	movl	%ebp,PARAM_YP		C argument slot doubles as running yp
	movl	(%ebp),%eax		C copy y limb ...
	movl	%eax,VAR_MULTIPLIER	C ... to stack slot
	movl	PARAM_XSIZE,%ecx

	xorl	%ebx,%ebx		C no carry at the start of a row
	andl	$3,%ecx			C xsize mod 4 limbs go through L(oop0)
	jz	L(end0)

	C Leading xsize mod 4 limbs, one at a time.
L(oop0):
	movl	(%esi),%eax
	mull	VAR_MULTIPLIER
	leal	4(%esi),%esi
	addl	%ebx,%eax		C low product + carry in
	movl	$0,%ebx			C mov, not xor: CF must reach the adcl
	adcl	%ebx,%edx		C high product += CF
	addl	%eax,(%edi)		C accumulate into wp
	adcl	%edx,%ebx		C propagate carry into cylimb

	leal	4(%edi),%edi
	decl	%ecx
	jnz	L(oop0)

L(end0):
	movl	PARAM_XSIZE,%ecx
	shrl	$2,%ecx			C number of 4-limb groups
	jz	L(endX)

	C Remaining limbs, four per iteration.  %ebx and %ebp alternate: one
	C holds the limb about to be added into wp while the other collects
	C the previous high product plus the next low product.
	ALIGN(8)
L(oopX):
	movl	(%esi),%eax
	mull	VAR_MULTIPLIER
	addl	%eax,%ebx		C carry in + low product 0
	movl	$0,%ebp			C mov, not xor: keep CF
	adcl	%edx,%ebp		C high product 0 + CF

	movl	4(%esi),%eax
	mull	VAR_MULTIPLIER
	addl	%ebx,(%edi)
	adcl	%eax,%ebp	C new lo + cylimb
	movl	$0,%ebx
	adcl	%edx,%ebx

	movl	8(%esi),%eax
	mull	VAR_MULTIPLIER
	addl	%ebp,4(%edi)
	adcl	%eax,%ebx	C new lo + cylimb
	movl	$0,%ebp
	adcl	%edx,%ebp

	movl	12(%esi),%eax
	mull	VAR_MULTIPLIER
	addl	%ebx,8(%edi)
	adcl	%eax,%ebp	C new lo + cylimb
	movl	$0,%ebx
	adcl	%edx,%ebx

	addl	%ebp,12(%edi)
	adcl	$0,%ebx		C propagate carry into cylimb

	leal	16(%esi),%esi
	leal	16(%edi),%edi
	decl	%ecx
	jnz	L(oopX)

L(endX):
	movl	%ebx,(%edi)		C final carry becomes the new top limb
	addl	$4,%edi

	C we incremented wp and xp in the loop above; compensate
	C (net effect: %esi = xp again, %edi one limb further than this row)
	movl	PARAM_XSIZE,%eax
	shll	$2,%eax
	subl	%eax,%edi
	subl	%eax,%esi

	movl	VAR_COUNTER,%eax
	decl	%eax
	movl	%eax,VAR_COUNTER	C mov keeps ZF from the decl
	jnz	L(outer)

	C Normal exit: four saved registers plus the local area.
L(skip):
	popl	%ebx
	popl	%edi
	popl	%ebp
	popl	%esi
	addl	$VAR_STACK_SPACE,%esp	C release what the prologue reserved
	ret

	C 1x1 exit: %ebx was never pushed on this path.
L(done):
	movl	%edx,4(%edi)	   C store to wp[1]
	popl	%edi
	popl	%ebp
	popl	%esi
	addl	$VAR_STACK_SPACE,%esp	C release what the prologue reserved
	ret

EPILOGUE()
224