1dnl  SPARC v9 64-bit mpn_addmul_2 -- Multiply an n-limb number by a 2-limb
2dnl  number and add the result to an n-limb vector.
3
4dnl  Copyright 2002, 2003 Free Software Foundation, Inc.
5
6dnl  This file is part of the GNU MP Library.
7
8dnl  The GNU MP Library is free software; you can redistribute it and/or modify
9dnl  it under the terms of the GNU Lesser General Public License as published
10dnl  by the Free Software Foundation; either version 3 of the License, or (at
11dnl  your option) any later version.
12
13dnl  The GNU MP Library is distributed in the hope that it will be useful, but
14dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
16dnl  License for more details.
17
18dnl  You should have received a copy of the GNU Lesser General Public License
19dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
20
21include(`../config.m4')
22
23C                  cycles/limb
24C UltraSPARC 1&2:      9
25C UltraSPARC 3:       10
26
27C Algorithm: We use 16 floating-point multiplies per limb product, with the
28C 2-limb v operand split into eight 16-bit pieces, and the n-limb u operand
29C split into 32-bit pieces.  We sum four 48-bit partial products using
30C floating-point add, then convert the resulting four 50-bit quantities and
31C transfer them to the integer unit.
32
33C Possible optimizations:
34C   1. Align the stack area where we transfer the four 50-bit product-sums
35C      to a 32-byte boundary.  That would minimize the cache collision.
36C      (UltraSPARC-1/2 use a direct-mapped cache.)  (Perhaps even better would
37C      be to align the area to map to the area immediately before up?)
38C   2. Perform two of the fp->int conversions with integer instructions.  We
39C      can get almost ten free IEU slots, if we clean up bookkeeping and the
40C      silly carry-limb code.
41C   3. For an mpn_addmul_1 based on this, we need to fix the silly carry-limb
42C      code.
43
44C OSP (Overlapping software pipeline) version of mpn_mul_basecase:
45C Operand swap will require 8 LDDA and 8 FXTOD, which will mean 8 cycles.
46C FI	= 20
47C L	=  9 x un * vn
48C WDFI	= 10 x vn / 2
49C WD	= 4
50
51C Instruction classification (as per UltraSPARC functional units).
52C Assuming silly carry code is fixed.  Includes bookkeeping.
53C
54C               mpn_addmul_X     mpn_mul_X
55C                1       2       1       2
56C               ==========      ==========
57C      FM        8      16       8      16
58C      FA       10      18      10      18
59C     MEM       12      12      10      10
60C  ISHIFT        6       6       6       6
61C IADDLOG       11      11      10      10
62C  BRANCH        1       1       1       1
63C
64C TOTAL IEU     17      17      16      16
65C TOTAL         48      64      45      61
66C
67C IEU cycles     8.5     8.5     8       8
68C MEM cycles    12      12      10      10
69C ISSUE cycles  12      16      11.25   15.25
70C FPU cycles    10      18      10      18
71C cycles/loop   12      18      12      18
72C cycles/limb   12       9      12       9
73
74
75C INPUT PARAMETERS
76C rp[n + 1]	i0
77C up[n]		i1
78C n		i2
79C vp[2]		i3
80
81
82ASM_START()
C NOTE(review): REGISTER is defined outside this file (config.m4); it
C presumably declares %g2 and %g3 as scratch globals to the assembler.
83	REGISTER(%g2,#scratch)
84	REGISTER(%g3,#scratch)
85
C The list below appears to be a note of aliases that could share a register
C in order to free some up; it is not acted upon by the defines that follow.
86C Combine registers:
87C u00_hi= u32_hi
88C u00_lo= u32_lo
89C a000  = out000
90C a016  = out016
91C Free: f52 f54
92
93
C Floating-point register aliases.  The numeric suffix is the bit offset of
C the 16-bit piece of v that took part in the product (000, 016, ... 112).
C
C pNNN: raw products from fmuld.  p096a/p112a receive the products formed
C with u00, p096b/p112b the products formed with u32.
94define(`p000', `%f8')  define(`p016',`%f10')
95define(`p032',`%f12')  define(`p048',`%f14')
96define(`p064',`%f16')  define(`p080',`%f18')
97define(`p096a',`%f20') define(`p112a',`%f22')
98define(`p096b',`%f56') define(`p112b',`%f58')
99
C outNNN: 64-bit integer results of fdtox, stored to the stack with std so the
C integer unit can reload them with ldx.
100define(`out000',`%f0') define(`out016',`%f6')
101
C vNNN: the eight 16-bit pieces of the 2-limb v operand, held as doubles.
C v000..v048 come from vp[0], v064..v112 from vp[1].
102define(`v000',`%f24')  define(`v016',`%f26')
103define(`v032',`%f28')  define(`v048',`%f30')
104define(`v064',`%f44')  define(`v080',`%f46')
105define(`v096',`%f48')  define(`v112',`%f50')
106
C u00/u32: low and high 32 bits of the current up limb, converted to double.
107define(`u00',`%f32')   define(`u32', `%f34')
108
C aNNN: running sums of products produced by faddd (or seeded by fmuld/fmovd
C when the pipeline starts).
109define(`a000',`%f36')  define(`a016',`%f38')
110define(`a032',`%f40')  define(`a048',`%f42')
111define(`a064',`%f60')  define(`a080',`%f62')
112
C Staging registers for fxtod.  The _hi single registers (%f2, %f4) are zeroed
C once during initialization; the _lo single registers (%f3, %f5) are loaded
C with a 32-bit half of an up limb.  fxtod is then given the _hi name, which
C as a 64-bit operand covers the hi/lo pair.
113define(`u00_hi',`%f2') define(`u32_hi',`%f4')
114define(`u00_lo',`%f3') define(`u32_lo',`%f5')
115
C Integer register aliases.
C   cy        carry into the next 32-bit result word
C   rlimb     partial sum for the current 32-bit result word
C   i00, i16  product sums reloaded from the stack slots
C   r00, r32  low and high 32-bit words read from rp
C   xffffffff 32-bit mask; xffff 16-bit mask (xffff is only used by the
C             non-VIS initialization)
116define(`cy',`%g1')
117define(`rlimb',`%g3')
118define(`i00',`%l0')    define(`i16',`%l1')
119define(`r00',`%l2')    define(`r32',`%l3')
120define(`xffffffff',`%l7')
121define(`xffff',`%o0')
122
123
124PROLOGUE(mpn_addmul_2)
C Entry point.  Per the INPUT PARAMETERS note: rp is in %i0, up in %i1, n in
C %i2 and vp in %i3 once the save below has executed.
125
126C Initialization.  (1) Split v operand into eight 16-bit chunks and store them
127C as IEEE double in fp registers.  (2) Clear upper 32 bits of fp register pairs
128C f2 and f4.  (3) Store masks in registers aliased to `xffff' and `xffffffff'.
129C This code could be better scheduled.
130
C Open a new register window with a 256-byte frame.  The scratch slots used
C throughout are addressed as %sp+2223+0 ... %sp+2223+56.
C NOTE(review): 2223 is presumably the 2047 stack bias plus 176 bytes of
C reserved save area - confirm against the SPARC V9 ABI.
131	save	%sp, -256, %sp
132
133ifdef(`HAVE_VIS',
134`	mov	-1, %g4
C VIS variant: %asi is set to 0xD2 and each ldda fetches one piece of v
C straight into an fp register, which fxtod then converts to double.
C NOTE(review): 0xD2 is presumably the 16-bit short floating-point load ASI -
C confirm against the UltraSPARC manual.
C Byte offsets 6,4,2,0 address vp[0] and 14,12,10,8 address vp[1]; the piece
C at the highest offset of each limb goes to the lowest-numbered v register,
C matching the generic variant where v000 is vp[0] masked with 0xffff.
135	wr	%g0, 0xD2, %asi
136	srlx	%g4, 32, xffffffff	C store mask in register `xffffffff'
137	ldda	[%i3+6] %asi, v000
138	ldda	[%i3+4] %asi, v016
139	ldda	[%i3+2] %asi, v032
140	ldda	[%i3+0] %asi, v048
141	fxtod	v000, v000
142	ldda	[%i3+14] %asi, v064
143	fxtod	v016, v016
144	ldda	[%i3+12] %asi, v080
145	fxtod	v032, v032
146	ldda	[%i3+10] %asi, v096
147	fxtod	v048, v048
148	ldda	[%i3+8] %asi, v112
149	fxtod	v064, v064
150	fxtod	v080, v080
151	fxtod	v096, v096
152	fxtod	v112, v112
C Clear the upper halves of the fxtod staging pairs.
153	fzero	u00_hi
154	fzero	u32_hi
155',
156`	mov	-1, %g4
C Generic variant: vp[0] and vp[1] are cut into 16-bit pieces with integer
C shifts and the 0xffff mask.  Each piece is stored as a 64-bit integer in a
C stack slot, reloaded into an fp register with ldd and converted with fxtod.
157	ldx	[%i3+0], %l0		C vp[0]
158	srlx	%g4, 48, xffff		C store mask in register `xffff'
159	ldx	[%i3+8], %l1		C vp[1]
160
C Pieces of vp[0] go to slots +0, +8, +16, +24.
161	and	%l0, xffff, %g2
162	stx	%g2, [%sp+2223+0]
163	srlx	%l0, 16, %g3
164	and	%g3, xffff, %g3
165	stx	%g3, [%sp+2223+8]
166	srlx	%l0, 32, %g2
167	and	%g2, xffff, %g2
168	stx	%g2, [%sp+2223+16]
C The top piece needs no mask: the logical shift by 48 leaves only 16 bits.
169	srlx	%l0, 48, %g3
170	stx	%g3, [%sp+2223+24]
C Pieces of vp[1] go to slots +32, +40, +48, +56.
171	and	%l1, xffff, %g2
172	stx	%g2, [%sp+2223+32]
173	srlx	%l1, 16, %g3
174	and	%g3, xffff, %g3
175	stx	%g3, [%sp+2223+40]
176	srlx	%l1, 32, %g2
177	and	%g2, xffff, %g2
178	stx	%g2, [%sp+2223+48]
179	srlx	%l1, 48, %g3
180	stx	%g3, [%sp+2223+56]
181
182	srlx	%g4, 32, xffffffff	C store mask in register `xffffffff'
183
184	ldd	[%sp+2223+0], v000
185	ldd	[%sp+2223+8], v016
186	ldd	[%sp+2223+16], v032
187	ldd	[%sp+2223+24], v048
188	fxtod	v000, v000
189	ldd	[%sp+2223+32], v064
190	fxtod	v016, v016
191	ldd	[%sp+2223+40], v080
192	fxtod	v032, v032
193	ldd	[%sp+2223+48], v096
194	fxtod	v048, v048
195	ldd	[%sp+2223+56], v112
196	fxtod	v064, v064
C The slot at +0 holds a value below 2^16, so the 32-bit word at its lowest
C address is zero (big-endian layout assumed); loading that word is how the
C two _hi staging registers get cleared in this variant.
197	ld	[%sp+2223+0], u00_hi	C zero u00_hi
198	fxtod	v080, v080
199	ld	[%sp+2223+0], u32_hi	C zero u32_hi
200	fxtod	v096, v096
201	fxtod	v112, v112
202')
203C Initialization done.
C Clear the integer carry state: %g2 and %g4 later carry the shifted pieces
C of the previous i16, and rlimb the previous partial sum.
204	mov	0, %g2
205	mov	0, rlimb
206	mov	0, %g4
C rp is biased down by one limb because the integer code reads rp at
C [%i0+4+8] before it advances %i0 by 8, and stores at [%i0+4] and [%i0+0]
C afterwards.
207	add	%i0, -8, %i0		C BOOKKEEPING
208
209C Start software pipeline.
C First half-step.  The low 32 bits of up[0] are converted to double (u00)
C and multiplied by all eight pieces of v.  Nothing has been accumulated yet,
C so the first four products are written directly into a000..a048 instead of
C being added to earlier sums.
210
211	ld	[%i1+4], u00_lo		C read low 32 bits of up[i]
C The operand u00_hi names %f2; as the 64-bit source of fxtod it covers the
C pair made of the zeroed upper word and the word just loaded into u00_lo.
212	fxtod	u00_hi, u00
213C mid
214	ld	[%i1+0], u32_lo		C read high 32 bits of up[i]
215	fmuld	u00, v000, a000
216	fmuld	u00, v016, a016
217	fmuld	u00, v032, a032
218	fmuld	u00, v048, a048
219	add	%i2, -1, %i2		C BOOKKEEPING
220	fmuld	u00, v064, p064
221	add	%i1, 8, %i1		C BOOKKEEPING
222	fxtod	u32_hi, u32
223	fmuld	u00, v080, p080
224	fmuld	u00, v096, p096a
C Remaining limb count is non-zero: continue with the general path.  The
C fmuld below sits in the branch delay slot and is needed on both paths.
C Falling through means n was 1.
225	brnz,pt	%i2, .L_2_or_more
226	 fmuld	u00, v112, p112a
227
C Path for n == 1 (reached only by falling through; .L1 is not a branch
C target in the visible source).  The second half-step multiplies u32, the
C high 32 bits of the single up limb, by the eight pieces of v and folds the
C products into the sums from the first half-step.  No further up data is
C loaded.
228.L1:	fdtox	a000, out000
229	fmuld	u32, v000, p000
230	fdtox	a016, out016
231	fmuld	u32, v016, p016
C There are no earlier p096/p112 products to add on the very first half-step,
C so a064 and a080 are plain copies of p064 and p080.
232	fmovd	p064, a064
233	fmuld	u32, v032, p032
234	fmovd	p080, a080
235	fmuld	u32, v048, p048
C Slots +16/+24 receive the converted sums for the low 32-bit word.
236	std	out000, [%sp+2223+16]
237	faddd	p000, a032, a000
238	fmuld	u32, v064, p064
239	std	out016, [%sp+2223+24]
C NOTE(review): u00 is not read again on this path (.L_wd2 and the code
C after it never reference u00), so this conversion looks redundant here.
240	fxtod	u00_hi, u00
241	faddd	p016, a048, a016
242	fmuld	u32, v080, p080
243	faddd	p032, a064, a032
244	fmuld	u32, v096, p096b
245	faddd	p048, a080, a048
246	fmuld	u32, v112, p112b
247C mid
248	fdtox	a000, out000
249	fdtox	a016, out016
250	faddd	p064, p096a, a064
251	faddd	p080, p112a, a080
C Slots +0/+8 receive the converted sums for the high 32-bit word.
252	std	out000, [%sp+2223+0]
C Skip the main loop and wind-down phase 1; the std below is in the branch
C delay slot and executes before control reaches .L_wd2.
253	b	.L_wd2
254	 std	out016, [%sp+2223+8]
255
C Path for n >= 2.  This is one full pipeline step without the integer side:
C the FP work is the same as in .L1, but the next up limb is loaded and
C converted as well, and its low half (u00) is already multiplied by v in
C the second half of this block.  The integer unit has nothing to combine
C yet, so no rp access happens here.
256.L_2_or_more:
257	ld	[%i1+4], u00_lo		C read low 32 bits of up[i]
258	fdtox	a000, out000
259	fmuld	u32, v000, p000
260	fdtox	a016, out016
261	fmuld	u32, v016, p016
C First half-step only: no earlier p096/p112 products exist, so a064 and a080
C are plain copies.
262	fmovd	p064, a064
263	fmuld	u32, v032, p032
264	fmovd	p080, a080
265	fmuld	u32, v048, p048
266	std	out000, [%sp+2223+16]
267	faddd	p000, a032, a000
268	fmuld	u32, v064, p064
269	std	out016, [%sp+2223+24]
270	fxtod	u00_hi, u00
271	faddd	p016, a048, a016
272	fmuld	u32, v080, p080
273	faddd	p032, a064, a032
274	fmuld	u32, v096, p096b
275	faddd	p048, a080, a048
276	fmuld	u32, v112, p112b
277C mid
278	ld	[%i1+0], u32_lo		C read high 32 bits of up[i]
279	fdtox	a000, out000
280	fmuld	u00, v000, p000
281	fdtox	a016, out016
282	fmuld	u00, v016, p016
C From here on each sum has four terms: a064 picks up the p096a product left
C over from the previous u00 half-step.
283	faddd	p064, p096a, a064
284	fmuld	u00, v032, p032
285	faddd	p080, p112a, a080
286	fmuld	u00, v048, p048
287	add	%i2, -1, %i2		C BOOKKEEPING
288	std	out000, [%sp+2223+0]
289	faddd	p000, a032, a000
290	fmuld	u00, v064, p064
291	add	%i1, 8, %i1		C BOOKKEEPING
292	std	out016, [%sp+2223+8]
293	fxtod	u32_hi, u32
294	faddd	p016, a048, a016
295	fmuld	u00, v080, p080
296	faddd	p032, a064, a032
297	fmuld	u00, v096, p096a
298	faddd	p048, a080, a048
C More limbs remain (n >= 3): enter the main loop.  The delay-slot fmuld is
C needed on both paths.
299	brnz,pt	%i2, .L_3_or_more
300	 fmuld	u00, v112, p112a
301
C n == 2: bypass the main loop and go straight to the wind-down code.
302	b	.Lend
303	 nop
304
305C  64      32       0
306C   .       .       .
307C   .       |__rXXX_|	32
308C   .      |___cy___|	34
309C   .  |_______i00__|	50
310C  |_______i16__|   .	50
C The diagram above shows how the four integer addends of one 32-bit result
C word line up: the word read from rp (r00 or r32, 32 bits), the incoming
C carry cy (up to 34 bits), i00 (50 bits) and i16 (50 bits, placed 16 bits
C higher).  The trailing number on each row is the width in bits.
311
312
313C BEGIN MAIN LOOP
C Each iteration consumes one up limb and produces one rp limb, as two
C half-steps (low 32-bit word, then after "midloop" the high 32-bit word).
C
C FP side per half-step: convert the two finished sums (fdtox), multiply the
C current 32-bit half of u by the eight pieces of v, and chain the products
C into the sums a000..a080 so that each finished sum is p000+p032+p064+p096
C (or p016+p048+p080+p112) gathered over successive half-steps.
C
C Integer side per half-step:
C   cy    = ((low 32 bits of previous i16<<16) + previous rlimb) >> 32
C           + (previous i16 >> 16)
C   rlimb = i00 + rp word + cy
C   store the low 32 bits of (i16 << 16) + rlimb
C i00/i16 are reloaded from the stack slot before the std of the same
C half-step overwrites it, so they are the values stored one iteration ago.
C
C NOTE(review): the bare C lines appear to mark four-instruction issue groups;
C that is scheduling intent and is not verified here.
314	.align	16
315.L_3_or_more:
316.Loop:	ld	[%i1+4], u00_lo		C read low 32 bits of up[i]
317	and	%g2, xffffffff, %g2
318	fdtox	a000, out000
319	fmuld	u32, v000, p000
320C
C %i0 has not yet been advanced for this limb, hence the extra +8.
321	lduw	[%i0+4+8], r00		C read low 32 bits of rp[i]
322	add	%g2, rlimb, %l5
323	fdtox	a016, out016
324	fmuld	u32, v016, p016
325C
326	srlx	%l5, 32, cy
327	ldx	[%sp+2223+16], i00
328	faddd	p064, p096b, a064
329	fmuld	u32, v032, p032
330C
331	add	%g4, cy, cy		C new cy
332	ldx	[%sp+2223+24], i16
333	faddd	p080, p112b, a080
334	fmuld	u32, v048, p048
335C
336	nop
337	std	out000, [%sp+2223+16]
338	faddd	p000, a032, a000
339	fmuld	u32, v064, p064
340C
341	add	i00, r00, rlimb
342	add	%i0, 8, %i0		C BOOKKEEPING
343	std	out016, [%sp+2223+24]
344	fxtod	u00_hi, u00
345C
346	sllx	i16, 16, %g2
347	add	cy, rlimb, rlimb
348	faddd	p016, a048, a016
349	fmuld	u32, v080, p080
350C
351	srlx	i16, 16, %g4
352	add	%g2, rlimb, %l5
353	faddd	p032, a064, a032
354	fmuld	u32, v096, p096b
355C
C Low 32-bit word of the result limb (big-endian: offset 4 within the limb).
356	stw	%l5, [%i0+4]
357	nop
358	faddd	p048, a080, a048
359	fmuld	u32, v112, p112b
360C midloop
361	ld	[%i1+0], u32_lo		C read high 32 bits of up[i]
362	and	%g2, xffffffff, %g2
363	fdtox	a000, out000
364	fmuld	u00, v000, p000
365C
366	lduw	[%i0+0], r32		C read high 32 bits of rp[i]
367	add	%g2, rlimb, %l5
368	fdtox	a016, out016
369	fmuld	u00, v016, p016
370C
371	srlx	%l5, 32, cy
372	ldx	[%sp+2223+0], i00
373	faddd	p064, p096a, a064
374	fmuld	u00, v032, p032
375C
376	add	%g4, cy, cy		C new cy
377	ldx	[%sp+2223+8], i16
378	faddd	p080, p112a, a080
379	fmuld	u00, v048, p048
380C
381	add	%i2, -1, %i2		C BOOKKEEPING
382	std	out000, [%sp+2223+0]
383	faddd	p000, a032, a000
384	fmuld	u00, v064, p064
385C
386	add	i00, r32, rlimb
387	add	%i1, 8, %i1		C BOOKKEEPING
388	std	out016, [%sp+2223+8]
389	fxtod	u32_hi, u32
390C
391	sllx	i16, 16, %g2
392	add	cy, rlimb, rlimb
393	faddd	p016, a048, a016
394	fmuld	u00, v080, p080
395C
396	srlx	i16, 16, %g4
397	add	%g2, rlimb, %l5
398	faddd	p032, a064, a032
399	fmuld	u00, v096, p096a
400C
C High 32-bit word of the result limb.
401	stw	%l5, [%i0+0]
402	faddd	p048, a080, a048
C Loop while limbs remain; the last fmuld of the half-step fills the branch
C delay slot.
403	brnz,pt	%i2, .Loop
404	 fmuld	u00, v112, p112a
405C END MAIN LOOP
406
407C WIND-DOWN PHASE 1
C Same work as one main-loop iteration, except that no further up data is
C loaded or converted.  The first half still multiplies u32 (high half of the
C last up limb) by the eight pieces of v; the second half issues no
C multiplies at all and only finishes sums and conversions.
C Integer side per half-step, as in the loop:
C   cy    = ((low 32 bits of previous i16<<16) + previous rlimb) >> 32
C           + (previous i16 >> 16)
C   rlimb = i00 + rp word + cy
C   store the low 32 bits of (i16 << 16) + rlimb
408.Lend:	and	%g2, xffffffff, %g2
409	fdtox	a000, out000
410	fmuld	u32, v000, p000
411	lduw	[%i0+4+8], r00		C read low 32 bits of rp[i]
412	add	%g2, rlimb, %l5
413	fdtox	a016, out016
414	fmuld	u32, v016, p016
415	srlx	%l5, 32, cy
C Reload the sums stored one step earlier, before the std below reuses the
C slot.
416	ldx	[%sp+2223+16], i00
417	faddd	p064, p096b, a064
418	fmuld	u32, v032, p032
419	add	%g4, cy, cy		C new cy
420	ldx	[%sp+2223+24], i16
421	faddd	p080, p112b, a080
422	fmuld	u32, v048, p048
423	std	out000, [%sp+2223+16]
424	faddd	p000, a032, a000
425	fmuld	u32, v064, p064
426	add	i00, r00, rlimb
427	add	%i0, 8, %i0		C BOOKKEEPING
428	std	out016, [%sp+2223+24]
429	sllx	i16, 16, %g2
430	add	cy, rlimb, rlimb
431	faddd	p016, a048, a016
432	fmuld	u32, v080, p080
433	srlx	i16, 16, %g4
434	add	%g2, rlimb, %l5
435	faddd	p032, a064, a032
436	fmuld	u32, v096, p096b
C Low 32-bit word of the result limb.
437	stw	%l5, [%i0+4]
438	faddd	p048, a080, a048
439	fmuld	u32, v112, p112b
440C mid
441	and	%g2, xffffffff, %g2
442	fdtox	a000, out000
443	lduw	[%i0+0], r32		C read high 32 bits of rp[i]
444	add	%g2, rlimb, %l5
445	fdtox	a016, out016
446	srlx	%l5, 32, cy
447	ldx	[%sp+2223+0], i00
448	faddd	p064, p096a, a064
449	add	%g4, cy, cy		C new cy
450	ldx	[%sp+2223+8], i16
451	faddd	p080, p112a, a080
452	std	out000, [%sp+2223+0]
453	add	i00, r32, rlimb
454	std	out016, [%sp+2223+8]
455	sllx	i16, 16, %g2
456	add	cy, rlimb, rlimb
457	srlx	i16, 16, %g4
458	add	%g2, rlimb, %l5
C High 32-bit word of the result limb.
459	stw	%l5, [%i0+0]
460
461C WIND-DOWN PHASE 2
C Entered by fall-through from phase 1 and by the branch from the n == 1
C path.  No multiplies or adds remain on the FP side: the sums still in
C flight (a032/a048, then a064/a080) are only converted with fdtox and
C stored.  The integer side performs its usual two half-steps for one more
C rp limb, still adding the words read from rp.
462.L_wd2:	and	%g2, xffffffff, %g2
463	fdtox	a032, out000
464	lduw	[%i0+4+8], r00		C read low 32 bits of rp[i]
465	add	%g2, rlimb, %l5
466	fdtox	a048, out016
C Carry out of the previous 32-bit word, plus the upper part of the previous
C i16 held in %g4.
467	srlx	%l5, 32, cy
468	ldx	[%sp+2223+16], i00
469	add	%g4, cy, cy		C new cy
470	ldx	[%sp+2223+24], i16
C The slot is overwritten only after i00/i16 have been reloaded from it.
471	std	out000, [%sp+2223+16]
472	add	i00, r00, rlimb
473	add	%i0, 8, %i0		C BOOKKEEPING
474	std	out016, [%sp+2223+24]
475	sllx	i16, 16, %g2
476	add	cy, rlimb, rlimb
477	srlx	i16, 16, %g4
478	add	%g2, rlimb, %l5
C Low 32-bit word of the result limb.
479	stw	%l5, [%i0+4]
480C mid
481	and	%g2, xffffffff, %g2
482	fdtox	a064, out000
483	lduw	[%i0+0], r32		C read high 32 bits of rp[i]
484	add	%g2, rlimb, %l5
485	fdtox	a080, out016
486	srlx	%l5, 32, cy
487	ldx	[%sp+2223+0], i00
488	add	%g4, cy, cy		C new cy
489	ldx	[%sp+2223+8], i16
490	std	out000, [%sp+2223+0]
491	add	i00, r32, rlimb
492	std	out016, [%sp+2223+8]
493	sllx	i16, 16, %g2
494	add	cy, rlimb, rlimb
495	srlx	i16, 16, %g4
496	add	%g2, rlimb, %l5
C High 32-bit word of the result limb.
497	stw	%l5, [%i0+0]
498
499C WIND-DOWN PHASE 3
C Reached only by fall-through (.L_wd3 is not a branch target in the visible
C source).  The last two products, p096b and p112b, are converted and stored.
C The integer side emits one more limb as two 32-bit words, but unlike the
C earlier phases it does not read rp: the stack value is loaded straight into
C rlimb and only the carry is added.
C NOTE(review): this limb is presumably rp[n], the extra limb implied by the
C "rp[n + 1]" entry in the INPUT PARAMETERS note - confirm.
500.L_wd3:	and	%g2, xffffffff, %g2
501	fdtox	p096b, out000
502	add	%g2, rlimb, %l5
503	fdtox	p112b, out016
504	srlx	%l5, 32, cy
505	ldx	[%sp+2223+16], rlimb
506	add	%g4, cy, cy		C new cy
507	ldx	[%sp+2223+24], i16
508	std	out000, [%sp+2223+16]
509	add	%i0, 8, %i0		C BOOKKEEPING
510	std	out016, [%sp+2223+24]
511	sllx	i16, 16, %g2
512	add	cy, rlimb, rlimb
513	srlx	i16, 16, %g4
514	add	%g2, rlimb, %l5
C Low 32-bit word of the limb.
515	stw	%l5, [%i0+4]
516C mid
517	and	%g2, xffffffff, %g2
518	add	%g2, rlimb, %l5
519	srlx	%l5, 32, cy
520	ldx	[%sp+2223+0], rlimb
521	add	%g4, cy, cy		C new cy
522	ldx	[%sp+2223+8], i16
523	sllx	i16, 16, %g2
524	add	cy, rlimb, rlimb
525	srlx	i16, 16, %g4
526	add	%g2, rlimb, %l5
C High 32-bit word of the limb.
527	stw	%l5, [%i0+0]
528
C Final value: carry out of the word just stored, plus the converted p096b
C (i00) and p112b shifted left by 16 (i16).  It is not split into 32-bit
C halves and not stored to memory; it becomes the return value.
529	and	%g2, xffffffff, %g2
530	add	%g2, rlimb, %l5
531	srlx	%l5, 32, cy
532	ldx	[%sp+2223+16], i00
533	add	%g4, cy, cy		C new cy
534	ldx	[%sp+2223+24], i16
535
536	sllx	i16, 16, %g2
537	add	i00, cy, cy
C NOTE(review): per SPARC V9 RETURN semantics the register window is restored
C before the delay slot runs, so the add below may only read globals (%g2 and
C cy, which is %g1) and its %o0 is the result register seen by the caller.
538	return	%i7+8
539	add	%g2, cy, %o0
540EPILOGUE(mpn_addmul_2)
541