dnl  SPARC v9 32-bit mpn_mul_1 -- Multiply a limb vector with a limb and store
dnl  the result in a second limb vector.

dnl  Copyright 1998, 2000, 2001, 2003 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C Algorithm: We use two floating-point multiplies per limb product, with the
C invariant v operand split into two 16-bit pieces, and the u operand split
C into 32-bit pieces.  We convert the two 48-bit products and transfer them to
C the integer unit.

C		   cycles/limb
C UltraSPARC 1&2:     6.5
C UltraSPARC 3:	      ?

C Possible optimizations:
C   1. Combine 32-bit memory operations into 64-bit operations.  Since we're
C      memory bandwidth limited, this could save 1.5 cycles/limb.
C   2. Unroll the inner loop.  Since we already use alternate temporary areas,
C      it is very straightforward to unroll, using an exit branch midways.
C      Unrolling would allow deeper scheduling which could improve speed for L2
C      cache case.
C   3. For mpn_mul_1: Use more alternating temp areas.  The std'es and ldx'es
C      aren't sufficiently apart-scheduled with just two temp areas.
C   4. Specialize for particular v values.  If its upper 16 bits are zero, we
C      could save many operations.

C INPUT PARAMETERS
C rp	i0
C up	i1
C n	i2
C v	i3

C Size of the stack scratch frame used for integer<->fp transfers and the
C two alternating 16-byte product temp areas.
define(`FSIZE',224)

ASM_START()
PROLOGUE(mpn_mul_1)
C Register usage (grounded in the code below):
C   %o0 = rp    %o1 = up    %o2 = n (loop counter)    %o3 = v
C   %f6 = (double) low 16 bits of v,  %f8 = (double) high 16 bits of v
C   %f10/%f11 = 64-bit staging of up[i] (f10 kept zero, f11 loaded per limb)
C   %g1 = p0 (low product), %g2 = p16 (high product), %g3 = carry, %g4 = sum
C   %o5 = pointer into stack scratch; `xor %o5,16' alternates between the
C         two 16-byte temp areas so stores and loads can overlap.
	add	%sp, -FSIZE, %sp
	sethi	%hi(0xffff), %g1
	srl	%o3, 16, %g2		C %g2 = v >> 16
	or	%g1, %lo(0xffff), %g1
	and	%o3, %g1, %g1		C %g1 = v & 0xffff
	stx	%g1, [%sp+104]		C transfer both v halves to the FPU
	stx	%g2, [%sp+112]		C   via memory (no direct int->fp move)
	ldd	[%sp+104], %f6
	ldd	[%sp+112], %f8
	fxtod	%f6, %f6		C %f6 = (double) (v & 0xffff)
	fxtod	%f8, %f8		C %f8 = (double) (v >> 16)
	ld	[%sp+104], %f10		C zero f10

	mov	0, %g3			C cy = 0

define(`fanop', `fitod %f18, %f0')	C  A quasi nop running in the FA pipe

	add	%sp, 160, %o5		C point in scratch area
	and	%o5, -32, %o5		C align at 0 (mod 32) in scratch area

	subcc	%o2, 1, %o2
	ld	[%o1], %f11		C read up[i]
	add	%o1, 4, %o1		C up++
	bne,pt	%icc, .L_two_or_more
	fxtod	%f10, %f2		C (delay slot) %f2 = (double) up[i]

C n == 1: compute the single product and fall into the common tail at .L1.
	fmuld	%f2, %f8, %f16
	fmuld	%f2, %f6, %f4
	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+16]
	std	%f12, [%o5+24]
	ldx	[%o5+16], %g2		C p16
	ldx	[%o5+24], %g1		C p0
	b	.L1
	add	%o0, -16, %o0		C (delay slot) bias rp for tail stores

	.align	16
.L_two_or_more:
	subcc	%o2, 1, %o2
	ld	[%o1], %f11		C read up[i]
	fmuld	%f2, %f8, %f16
	fmuld	%f2, %f6, %f4
	add	%o1, 4, %o1		C up++
	bne,pt	%icc, .L_three_or_more
	fxtod	%f10, %f2

C n == 2: finish both products, enter tail at .L2.
	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+16]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+24]
	fmuld	%f2, %f6, %f4
	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+0]
	std	%f12, [%o5+8]
	ldx	[%o5+16], %g2		C p16
	ldx	[%o5+24], %g1		C p0
	b	.L2
	add	%o0, -12, %o0		C (delay slot) bias rp for tail stores

	.align	16
.L_three_or_more:
	subcc	%o2, 1, %o2
	ld	[%o1], %f11		C read up[i]
	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+16]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+24]
	fmuld	%f2, %f6, %f4
	add	%o1, 4, %o1		C up++
	bne,pt	%icc, .L_four_or_more
	fxtod	%f10, %f2

C n == 3: enter tail at .L3.
	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+0]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+8]
	fmuld	%f2, %f6, %f4
	fdtox	%f16, %f14
	ldx	[%o5+16], %g2		C p16
	fdtox	%f4, %f12
	ldx	[%o5+24], %g1		C p0
	std	%f14, [%o5+16]
	std	%f12, [%o5+24]
	b	.L3
	add	%o0, -8, %o0		C (delay slot) bias rp for tail stores

	.align	16
.L_four_or_more:
	subcc	%o2, 1, %o2
	ld	[%o1], %f11		C read up[i]
	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+0]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+8]
	fmuld	%f2, %f6, %f4
	add	%o1, 4, %o1		C up++
	bne,pt	%icc, .L_five_or_more
	fxtod	%f10, %f2

C n == 4: enter tail at .L4.
	fdtox	%f16, %f14
	ldx	[%o5+16], %g2		C p16
	fdtox	%f4, %f12
	ldx	[%o5+24], %g1		C p0
	std	%f14, [%o5+16]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+24]
	fmuld	%f2, %f6, %f4
	add	%o1, 4, %o1		C up++
	b	.L4
	add	%o0, -4, %o0		C (delay slot) bias rp for tail stores

	.align	16
.L_five_or_more:
	subcc	%o2, 1, %o2
	ld	[%o1], %f11		C read up[i]
	fdtox	%f16, %f14
	ldx	[%o5+16], %g2		C p16
	fdtox	%f4, %f12
	ldx	[%o5+24], %g1		C p0
	std	%f14, [%o5+16]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+24]
	fmuld	%f2, %f6, %f4
	add	%o1, 4, %o1		C up++
	bne,pt	%icc, .Loop
	fxtod	%f10, %f2
	b,a	.L5

C BEGIN MAIN LOOP
C Software-pipelined: each iteration stores this limb's products, combines
C the previous limb's products with the carry, and starts the next limb's
C multiplies.  Groups of four instructions (C -- n) are the intended
C per-cycle issue groups; `fanop' fills the FA pipe slot.
	.align 16
C -- 0
.Loop:	nop
	subcc	%o2, 1, %o2
	ld	[%o1], %f11		C read up[i]
	fdtox	%f16, %f14
C -- 1
	sllx	%g2, 16, %g4		C (p16 << 16)
	add	%o0, 4, %o0		C rp++
	ldx	[%o5+0], %g2		C p16
	fdtox	%f4, %f12
C -- 2
	nop
	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1		C p0
	fanop
C -- 3
	nop
	add	%g3, %g4, %g4		C p += cy
	std	%f14, [%o5+0]
	fmuld	%f2, %f8, %f16
C -- 4
	srlx	%g4, 32, %g3		C new cy
	add	%o1, 4, %o1		C up++
	std	%f12, [%o5+8]
	fmuld	%f2, %f6, %f4
C -- 5
	xor	%o5, 16, %o5		C alternate scratch variables
	stw	%g4, [%o0-4]
	bne,pt	%icc, .Loop
	fxtod	%f10, %f2
C END MAIN LOOP

C Wind-down: drain the pipeline, one stage per label; shorter counts enter
C at the matching .Ln label with rp biased accordingly.
.L5:	fdtox	%f16, %f14
	sllx	%g2, 16, %g4		C (p16 << 16)
	ldx	[%o5+0], %g2		C p16
	fdtox	%f4, %f12
	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1		C p0
	add	%g4, %g3, %g4		C p += cy
	std	%f14, [%o5+0]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+8]
	fmuld	%f2, %f6, %f4
	xor	%o5, 16, %o5
	stw	%g4, [%o0+0]
	srlx	%g4, 32, %g3		C new cy

.L4:	fdtox	%f16, %f14
	sllx	%g2, 16, %g4		C (p16 << 16)
	ldx	[%o5+0], %g2		C p16
	fdtox	%f4, %f12
	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1		C p0
	add	%g3, %g4, %g4		C p += cy
	std	%f14, [%o5+0]
	std	%f12, [%o5+8]
	xor	%o5, 16, %o5
	stw	%g4, [%o0+4]
	srlx	%g4, 32, %g3		C new cy

.L3:	sllx	%g2, 16, %g4		C (p16 << 16)
	ldx	[%o5+0], %g2		C p16
	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1		C p0
	add	%g3, %g4, %g4		C p += cy
	xor	%o5, 16, %o5
	stw	%g4, [%o0+8]
	srlx	%g4, 32, %g3		C new cy

.L2:	sllx	%g2, 16, %g4		C (p16 << 16)
	ldx	[%o5+0], %g2		C p16
	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1		C p0
	add	%g3, %g4, %g4		C p += cy
	stw	%g4, [%o0+12]
	srlx	%g4, 32, %g3		C new cy

.L1:	sllx	%g2, 16, %g4		C (p16 << 16)
	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	add	%g3, %g4, %g4		C p += cy
	stw	%g4, [%o0+16]
	srlx	%g4, 32, %g3		C new cy

	mov	%g3, %o0		C return the final carry limb
	retl
	sub	%sp, -FSIZE, %sp	C (delay slot) restore stack pointer
EPILOGUE(mpn_mul_1)