1dnl  SPARC v9 64-bit mpn_addmul_2 -- Multiply an n-limb number by a 2-limb
2dnl  number and add the result to an n-limb vector.
3
4dnl  Copyright 2002, 2003 Free Software Foundation, Inc.
5
6dnl  This file is part of the GNU MP Library.
7dnl
8dnl  The GNU MP Library is free software; you can redistribute it and/or modify
9dnl  it under the terms of either:
10dnl
11dnl    * the GNU Lesser General Public License as published by the Free
12dnl      Software Foundation; either version 3 of the License, or (at your
13dnl      option) any later version.
14dnl
15dnl  or
16dnl
17dnl    * the GNU General Public License as published by the Free Software
18dnl      Foundation; either version 2 of the License, or (at your option) any
19dnl      later version.
20dnl
21dnl  or both in parallel, as here.
22dnl
23dnl  The GNU MP Library is distributed in the hope that it will be useful, but
24dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
25dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
26dnl  for more details.
27dnl
28dnl  You should have received copies of the GNU General Public License and the
29dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
30dnl  see https://www.gnu.org/licenses/.
31
32include(`../config.m4')
33
34C                  cycles/limb
35C UltraSPARC 1&2:      9
36C UltraSPARC 3:       10
37
38C Algorithm: We use 16 floating-point multiplies per limb product, with the
39C 2-limb v operand split into eight 16-bit pieces, and the n-limb u operand
40C split into 32-bit pieces.  We sum four 48-bit partial products using
41C floating-point add, then convert the resulting four 50-bit quantities and
42C transfer them to the integer unit.
43
44C Possible optimizations:
45C   1. Align the stack area where we transfer the four 50-bit product-sums
46C      to a 32-byte boundary.  That would minimize the cache collision.
47C      (UltraSPARC-1/2 use a direct-mapped cache.)  (Perhaps even better would
48C      be to align the area to map to the area immediately before up?)
49C   2. Perform two of the fp->int conversions with integer instructions.  We
50C      can get almost ten free IEU slots, if we clean up bookkeeping and the
51C      silly carry-limb code.
52C   3. For an mpn_addmul_1 based on this, we need to fix the silly carry-limb
53C      code.
54
55C OSP (Overlapping software pipeline) version of mpn_mul_basecase:
56C Operand swap will require 8 LDDA and 8 FXTOD, which will mean 8 cycles.
57C FI	= 20
58C L	=  9 x un * vn
59C WDFI	= 10 x vn / 2
60C WD	= 4
61
62C Instruction classification (as per UltraSPARC functional units).
63C Assuming silly carry code is fixed.  Includes bookkeeping.
64C
65C               mpn_addmul_X     mpn_mul_X
66C                1       2       1       2
67C               ==========      ==========
68C      FM        8      16       8      16
69C      FA       10      18      10      18
70C     MEM       12      12      10      10
71C  ISHIFT        6       6       6       6
72C IADDLOG       11      11      10      10
73C  BRANCH        1       1       1       1
74C
75C TOTAL IEU     17      17      16      16
76C TOTAL         48      64      45      61
77C
78C IEU cycles     8.5     8.5     8       8
79C MEM cycles    12      12      10      10
80C ISSUE cycles  12      16      11.25   15.25
81C FPU cycles    10      18      10      18
82C cycles/loop   12      18      12      18
83C cycles/limb   12       9      12       9
84
85
86C INPUT PARAMETERS
87C rp[n + 1]	i0
88C up[n]		i1
89C n		i2
90C vp[2]		i3
91
92
93ASM_START()
94	REGISTER(%g2,#scratch)		C %g2 is used as integer scratch by the carry code below
95	REGISTER(%g3,#scratch)		C %g3 is aliased as rlimb further down; NOTE(review): REGISTER presumably emits the assembler .register directive -- confirm
96
97C Combine registers:
98C u00_hi= u32_hi
99C u00_lo= u32_lo
100C a000  = out000
101C a016  = out016
102C Free: f52 f54
103
104
C Symbolic names for the registers used by mpn_addmul_2.
C
C pNNN: destinations of the fmuld instructions.  NNN is the bit weight of the
C v piece being multiplied.  p096/p112 exist in two copies: the a copy is
C written by the u00 multiplies and the b copy by the u32 multiplies.
105define(`p000', `%f8')  define(`p016',`%f10')
106define(`p032',`%f12')  define(`p048',`%f14')
107define(`p064',`%f16')  define(`p080',`%f18')
108define(`p096a',`%f20') define(`p112a',`%f22')
109define(`p096b',`%f56') define(`p112b',`%f58')
110
C out000/out016: fdtox results, stored to the stack for the integer unit.
111define(`out000',`%f0') define(`out016',`%f6')
112
C vNNN: the eight 16-bit pieces of the v operand held as doubles.
C v000..v048 come from vp[0] and v064..v112 from vp[1].
113define(`v000',`%f24')  define(`v016',`%f26')
114define(`v032',`%f28')  define(`v048',`%f30')
115define(`v064',`%f44')  define(`v080',`%f46')
116define(`v096',`%f48')  define(`v112',`%f50')
117
C u00/u32: low and high 32-bit halves of the current up limb, as doubles.
118define(`u00',`%f32')   define(`u32', `%f34')
119
C aNNN: accumulators written by faddd (and by the very first fmuld group).
120define(`a000',`%f36')  define(`a016',`%f38')
121define(`a032',`%f40')  define(`a048',`%f42')
122define(`a064',`%f60')  define(`a080',`%f62')
123
C Staging registers for the up halves.  The _lo registers receive the 32-bit
C loads from up; the _hi registers are zeroed once during initialization, and
C the _hi name is also what fxtod takes as its 64-bit integer source.
124define(`u00_hi',`%f2') define(`u32_hi',`%f4')
125define(`u00_lo',`%f3') define(`u32_lo',`%f5')
126
C Integer side.
C   cy         carry into the next 32-bit result word
C   rlimb      partial sum for the 32-bit result word being formed
C   i00, i16   product-sums read back from the stack slots
C   r00, r32   low and high 32-bit words read from rp
C   xffffffff  32-bit mask
C   xffff      16-bit mask (only set up and used by the non-VIS initialization)
127define(`cy',`%g1')
128define(`rlimb',`%g3')
129define(`i00',`%l0')    define(`i16',`%l1')
130define(`r00',`%l2')    define(`r32',`%l3')
131define(`xffffffff',`%l7')
132define(`xffff',`%o0')
133
134
135PROLOGUE(mpn_addmul_2)
C Entry point.  Per the file header, this multiplies the n-limb operand at up
C by the 2-limb operand at vp and adds the product into the vector at rp.
C After the save below: i0 = rp, i1 = up, i2 = n, i3 = vp.
136
137C Initialization.  (1) Split v operand into eight 16-bit chunks and store them
138C as IEEE double in fp registers.  (2) Clear upper 32 bits of fp register pairs
139C f2 and f4.  (3) Store masks in registers aliased to `xffff' and `xffffffff'.
140C This code could be better scheduled.
C Two variants follow, selected by m4 at build time.  With HAVE_VIS defined the
C 16-bit pieces are fetched directly with ldda.  Otherwise they are extracted
C with integer shifts and masks and passed to the fp unit through the stack.
141
142	save	%sp, -256, %sp		C new register window and a 256-byte stack frame
143
144ifdef(`HAVE_VIS',
145`	mov	-1, %g4			C g4 = all ones
146	wr	%g0, 0xD2, %asi		C ASI used by the ldda loads below; NOTE(review): 0xD2 should be the 16-bit short fp load ASI -- confirm
147	srlx	%g4, 32, xffffffff	C store mask in register `xffffffff'
148	ldda	[%i3+6] %asi, v000	C byte offsets 0..7 are vp[0]
149	ldda	[%i3+4] %asi, v016
150	ldda	[%i3+2] %asi, v032
151	ldda	[%i3+0] %asi, v048
152	fxtod	v000, v000		C integer to double, in place
153	ldda	[%i3+14] %asi, v064	C byte offsets 8..15 are vp[1]
154	fxtod	v016, v016
155	ldda	[%i3+12] %asi, v080
156	fxtod	v032, v032
157	ldda	[%i3+10] %asi, v096
158	fxtod	v048, v048
159	ldda	[%i3+8] %asi, v112
160	fxtod	v064, v064
161	fxtod	v080, v080
162	fxtod	v096, v096
163	fxtod	v112, v112
164	fzero	u00_hi			C zero the upper half of the f2 pair
165	fzero	u32_hi			C zero the upper half of the f4 pair
166',
167`	mov	-1, %g4			C g4 = all ones
168	ldx	[%i3+0], %l0		C vp[0]
169	srlx	%g4, 48, xffff		C store mask in register `xffff'
170	ldx	[%i3+8], %l1		C vp[1]
171
172	and	%l0, xffff, %g2		C vp[0] bits 0..15
173	stx	%g2, [%sp+2223+0]
174	srlx	%l0, 16, %g3
175	and	%g3, xffff, %g3		C vp[0] bits 16..31
176	stx	%g3, [%sp+2223+8]
177	srlx	%l0, 32, %g2
178	and	%g2, xffff, %g2		C vp[0] bits 32..47
179	stx	%g2, [%sp+2223+16]
180	srlx	%l0, 48, %g3		C vp[0] bits 48..63 (no mask needed after the shift)
181	stx	%g3, [%sp+2223+24]
182	and	%l1, xffff, %g2		C vp[1] bits 0..15
183	stx	%g2, [%sp+2223+32]
184	srlx	%l1, 16, %g3
185	and	%g3, xffff, %g3		C vp[1] bits 16..31
186	stx	%g3, [%sp+2223+40]
187	srlx	%l1, 32, %g2
188	and	%g2, xffff, %g2		C vp[1] bits 32..47
189	stx	%g2, [%sp+2223+48]
190	srlx	%l1, 48, %g3		C vp[1] bits 48..63
191	stx	%g3, [%sp+2223+56]
192
193	srlx	%g4, 32, xffffffff	C store mask in register `xffffffff'
194
195	ldd	[%sp+2223+0], v000	C reload the eight pieces into fp registers as 64-bit integers
196	ldd	[%sp+2223+8], v016
197	ldd	[%sp+2223+16], v032
198	ldd	[%sp+2223+24], v048
199	fxtod	v000, v000		C integer to double, in place
200	ldd	[%sp+2223+32], v064
201	fxtod	v016, v016
202	ldd	[%sp+2223+40], v080
203	fxtod	v032, v032
204	ldd	[%sp+2223+48], v096
205	fxtod	v048, v048
206	ldd	[%sp+2223+56], v112
207	fxtod	v064, v064
208	ld	[%sp+2223+0], u00_hi	C zero u00_hi (loads the upper word of slot 0, whose value fits in 16 bits)
209	fxtod	v080, v080
210	ld	[%sp+2223+0], u32_hi	C zero u32_hi (same source word)
211	fxtod	v096, v096
212	fxtod	v112, v112
213')
214C Initialization done.
215	mov	0, %g2			C no pending i16 << 16 term yet
216	mov	0, rlimb		C no pending partial word yet
217	mov	0, %g4			C no pending i16 >> 16 carry yet
218	add	%i0, -8, %i0		C BOOKKEEPING: bias rp by -8; each later step adds 8 before storing
219
220C Start software pipeline.
C Load both 32-bit halves of up[0], convert them to double, and issue the eight
C multiplies of the low half (u00) by the eight v pieces.  Nothing has been
C accumulated yet, so the first four products go straight into a000..a048.
221
222	ld	[%i1+4], u00_lo		C read low 32 bits of up[i]
223	fxtod	u00_hi, u00		C u00 = low half as double (the f2/f3 pair read as a 64-bit integer)
224C mid
225	ld	[%i1+0], u32_lo		C read high 32 bits of up[i]
226	fmuld	u00, v000, a000
227	fmuld	u00, v016, a016
228	fmuld	u00, v032, a032
229	fmuld	u00, v048, a048
230	add	%i2, -1, %i2		C BOOKKEEPING: one limb of up consumed
231	fmuld	u00, v064, p064
232	add	%i1, 8, %i1		C BOOKKEEPING: advance up to the next limb
233	fxtod	u32_hi, u32		C u32 = high half as double
234	fmuld	u00, v080, p080
235	fmuld	u00, v096, p096a
236	brnz,pt	%i2, .L_2_or_more	C taken when n-1 is nonzero, i.e. more limbs remain
237	 fmuld	u00, v112, p112a	C delay slot, executed on both paths
238
C n == 1 path: finish the single limb with no further loads from up, then join
C the common tail at .L_wd2.  No integer carry work is done here.
239.L1:	fdtox	a000, out000		C a000 = u00*v000, converted to a 64-bit integer
240	fmuld	u32, v000, p000
241	fdtox	a016, out016		C a016 = u00*v016, converted to a 64-bit integer
242	fmuld	u32, v016, p016
243	fmovd	p064, a064		C nothing to add to p064 yet, so just copy it before p064 is reused
244	fmuld	u32, v032, p032
245	fmovd	p080, a080		C likewise for p080
246	fmuld	u32, v048, p048
247	std	out000, [%sp+2223+16]	C hand the integer to the integer unit through the stack
248	faddd	p000, a032, a000	C a000 = u32*v000 + u00*v032
249	fmuld	u32, v064, p064
250	std	out016, [%sp+2223+24]
251	fxtod	u00_hi, u00		C NOTE(review): u00 is not multiplied again on this path; looks vestigial -- confirm
252	faddd	p016, a048, a016	C a016 = u32*v016 + u00*v048
253	fmuld	u32, v080, p080
254	faddd	p032, a064, a032	C a032 = u32*v032 + u00*v064
255	fmuld	u32, v096, p096b
256	faddd	p048, a080, a048	C a048 = u32*v048 + u00*v080
257	fmuld	u32, v112, p112b
258C mid
259	fdtox	a000, out000
260	fdtox	a016, out016
261	faddd	p064, p096a, a064	C a064 = u32*v064 + u00*v096
262	faddd	p080, p112a, a080	C a080 = u32*v080 + u00*v112
263	std	out000, [%sp+2223+0]
264	b	.L_wd2			C remaining sums are converted and retired by the wind-down code
265	 std	out016, [%sp+2223+8]	C delay slot
266
267.L_2_or_more:
C Reached when at least two limbs are present.  Does the same fp work as .L1
C for the first limb, interleaved with loading the next limb of up and issuing
C its u00 multiplies.  No integer carry work is done yet.
268	ld	[%i1+4], u00_lo		C read low 32 bits of up[i]
269	fdtox	a000, out000
270	fmuld	u32, v000, p000
271	fdtox	a016, out016
272	fmuld	u32, v016, p016
273	fmovd	p064, a064		C nothing to add to p064 yet, so just copy it before p064 is reused
274	fmuld	u32, v032, p032
275	fmovd	p080, a080		C likewise for p080
276	fmuld	u32, v048, p048
277	std	out000, [%sp+2223+16]	C hand the integer to the integer unit through the stack
278	faddd	p000, a032, a000
279	fmuld	u32, v064, p064
280	std	out016, [%sp+2223+24]
281	fxtod	u00_hi, u00		C u00 = low half of the next limb as double
282	faddd	p016, a048, a016
283	fmuld	u32, v080, p080
284	faddd	p032, a064, a032
285	fmuld	u32, v096, p096b
286	faddd	p048, a080, a048
287	fmuld	u32, v112, p112b
288C mid
289	ld	[%i1+0], u32_lo		C read high 32 bits of up[i]
290	fdtox	a000, out000
291	fmuld	u00, v000, p000
292	fdtox	a016, out016
293	fmuld	u00, v016, p016
294	faddd	p064, p096a, a064	C consumes p096a before it is rewritten below
295	fmuld	u00, v032, p032
296	faddd	p080, p112a, a080	C consumes p112a before it is rewritten in the delay slot
297	fmuld	u00, v048, p048
298	add	%i2, -1, %i2		C BOOKKEEPING: one more limb of up consumed
299	std	out000, [%sp+2223+0]
300	faddd	p000, a032, a000
301	fmuld	u00, v064, p064
302	add	%i1, 8, %i1		C BOOKKEEPING: advance up to the next limb
303	std	out016, [%sp+2223+8]
304	fxtod	u32_hi, u32		C u32 = high half of the next limb as double
305	faddd	p016, a048, a016
306	fmuld	u00, v080, p080
307	faddd	p032, a064, a032
308	fmuld	u00, v096, p096a
309	faddd	p048, a080, a048
310	brnz,pt	%i2, .L_3_or_more	C still more limbs: enter the main loop
311	 fmuld	u00, v112, p112a	C delay slot, executed on both paths
312
313	b	.Lend			C exactly two limbs: skip the loop and start winding down
314	 nop
315
316C  64      32       0
317C   .       .       .
318C   .       |__rXXX_|	32
319C   .      |___cy___|	34
320C   .  |_______i00__|	50
321C  |_______i16__|   .	50
322
323
324C BEGIN MAIN LOOP
C Each iteration advances up and rp by one limb and runs in two halves.  The
C first half issues the u32 multiplies and forms the low 32-bit word of a result
C limb.  The second half (after midloop) issues the u00 multiplies of the limb
C just loaded and forms the high word.  Integer side of each half:
C   cy    = (((g2 & 0xffffffff) + rlimb) >> 32) + g4    from the previous half
C   rlimb = i00 + rp word + cy
C   g2    = i16 << 16,   g4 = i16 >> 16
C   store the low 32 bits of g2 + rlimb
C i00 and i16 are product-sums that were converted and stored to the stack by
C the previous pass through the same half.
325	.align	16
326.L_3_or_more:
327.Loop:	ld	[%i1+4], u00_lo		C read low 32 bits of up[i]
328	and	%g2, xffffffff, %g2	C keep the low 32 bits of the previous i16 << 16
329	fdtox	a000, out000
330	fmuld	u32, v000, p000
331C
332	lduw	[%i0+4+8], r00		C read low 32 bits of rp[i]
333	add	%g2, rlimb, %l5		C full sum of the previous word
334	fdtox	a016, out016
335	fmuld	u32, v016, p016
336C
337	srlx	%l5, 32, cy		C carry out of the previous word
338	ldx	[%sp+2223+16], i00	C read the slot before the std below overwrites it
339	faddd	p064, p096b, a064
340	fmuld	u32, v032, p032
341C
342	add	%g4, cy, cy		C new cy
343	ldx	[%sp+2223+24], i16	C read the slot before the std below overwrites it
344	faddd	p080, p112b, a080
345	fmuld	u32, v048, p048
346C
347	nop				C filler; presumably keeps the instruction grouping -- confirm
348	std	out000, [%sp+2223+16]
349	faddd	p000, a032, a000
350	fmuld	u32, v064, p064
351C
352	add	i00, r00, rlimb		C product-sum plus the existing rp word
353	add	%i0, 8, %i0		C BOOKKEEPING: advance rp to the limb being written
354	std	out016, [%sp+2223+24]
355	fxtod	u00_hi, u00		C u00 = low half of the limb just loaded, as double
356C
357	sllx	i16, 16, %g2		C align the weight-16 sum with the weight-0 sum
358	add	cy, rlimb, rlimb	C add the incoming carry
359	faddd	p016, a048, a016
360	fmuld	u32, v080, p080
361C
362	srlx	i16, 16, %g4		C part of i16 that belongs to the next word
363	add	%g2, rlimb, %l5
364	faddd	p032, a064, a032
365	fmuld	u32, v096, p096b
366C
367	stw	%l5, [%i0+4]		C write low 32 bits of the result limb
368	nop				C filler; presumably keeps the instruction grouping -- confirm
369	faddd	p048, a080, a048
370	fmuld	u32, v112, p112b
371C midloop
372	ld	[%i1+0], u32_lo		C read high 32 bits of up[i]
373	and	%g2, xffffffff, %g2	C keep the low 32 bits of the previous i16 << 16
374	fdtox	a000, out000
375	fmuld	u00, v000, p000
376C
377	lduw	[%i0+0], r32		C read high 32 bits of rp[i]
378	add	%g2, rlimb, %l5		C full sum of the previous word
379	fdtox	a016, out016
380	fmuld	u00, v016, p016
381C
382	srlx	%l5, 32, cy		C carry out of the previous word
383	ldx	[%sp+2223+0], i00	C read the slot before the std below overwrites it
384	faddd	p064, p096a, a064
385	fmuld	u00, v032, p032
386C
387	add	%g4, cy, cy		C new cy
388	ldx	[%sp+2223+8], i16	C read the slot before the std below overwrites it
389	faddd	p080, p112a, a080
390	fmuld	u00, v048, p048
391C
392	add	%i2, -1, %i2		C BOOKKEEPING: one more limb of up consumed
393	std	out000, [%sp+2223+0]
394	faddd	p000, a032, a000
395	fmuld	u00, v064, p064
396C
397	add	i00, r32, rlimb		C product-sum plus the existing rp word
398	add	%i1, 8, %i1		C BOOKKEEPING: advance up to the next limb
399	std	out016, [%sp+2223+8]
400	fxtod	u32_hi, u32		C u32 = high half of the limb just loaded, as double
401C
402	sllx	i16, 16, %g2		C align the weight-16 sum with the weight-0 sum
403	add	cy, rlimb, rlimb	C add the incoming carry
404	faddd	p016, a048, a016
405	fmuld	u00, v080, p080
406C
407	srlx	i16, 16, %g4		C part of i16 that belongs to the next word
408	add	%g2, rlimb, %l5
409	faddd	p032, a064, a032
410	fmuld	u00, v096, p096a
411C
412	stw	%l5, [%i0+0]		C write high 32 bits of the result limb
413	faddd	p048, a080, a048
414	brnz,pt	%i2, .Loop		C repeat while limbs of up remain
415	 fmuld	u00, v112, p112a	C delay slot, executed on both paths
416C END MAIN LOOP
417
418C WIND-DOWN PHASE 1
C Same shape as one loop iteration but with no loads from up.  The first half
C still issues the u32 multiplies of the final limb.  The second half issues no
C multiplies and only adds, converts and stores.  The integer carry chain is
C unchanged and still adds in words read from rp.
419.Lend:	and	%g2, xffffffff, %g2	C keep the low 32 bits of the previous i16 << 16
420	fdtox	a000, out000
421	fmuld	u32, v000, p000
422	lduw	[%i0+4+8], r00		C read low 32 bits of rp[i]
423	add	%g2, rlimb, %l5		C full sum of the previous word
424	fdtox	a016, out016
425	fmuld	u32, v016, p016
426	srlx	%l5, 32, cy		C carry out of the previous word
427	ldx	[%sp+2223+16], i00	C read the slot before the std below overwrites it
428	faddd	p064, p096b, a064
429	fmuld	u32, v032, p032
430	add	%g4, cy, cy		C new cy
431	ldx	[%sp+2223+24], i16	C read the slot before the std below overwrites it
432	faddd	p080, p112b, a080
433	fmuld	u32, v048, p048
434	std	out000, [%sp+2223+16]
435	faddd	p000, a032, a000
436	fmuld	u32, v064, p064
437	add	i00, r00, rlimb		C product-sum plus the existing rp word
438	add	%i0, 8, %i0		C BOOKKEEPING: advance rp to the limb being written
439	std	out016, [%sp+2223+24]
440	sllx	i16, 16, %g2		C align the weight-16 sum with the weight-0 sum
441	add	cy, rlimb, rlimb	C add the incoming carry
442	faddd	p016, a048, a016
443	fmuld	u32, v080, p080
444	srlx	i16, 16, %g4		C part of i16 that belongs to the next word
445	add	%g2, rlimb, %l5
446	faddd	p032, a064, a032
447	fmuld	u32, v096, p096b
448	stw	%l5, [%i0+4]		C write low 32 bits of the result limb
449	faddd	p048, a080, a048
450	fmuld	u32, v112, p112b
451C mid
452	and	%g2, xffffffff, %g2	C keep the low 32 bits of the previous i16 << 16
453	fdtox	a000, out000
454	lduw	[%i0+0], r32		C read high 32 bits of rp[i]
455	add	%g2, rlimb, %l5		C full sum of the previous word
456	fdtox	a016, out016
457	srlx	%l5, 32, cy		C carry out of the previous word
458	ldx	[%sp+2223+0], i00	C read the slot before the std below overwrites it
459	faddd	p064, p096a, a064
460	add	%g4, cy, cy		C new cy
461	ldx	[%sp+2223+8], i16	C read the slot before the std below overwrites it
462	faddd	p080, p112a, a080
463	std	out000, [%sp+2223+0]
464	add	i00, r32, rlimb		C product-sum plus the existing rp word
465	std	out016, [%sp+2223+8]
466	sllx	i16, 16, %g2		C align the weight-16 sum with the weight-0 sum
467	add	cy, rlimb, rlimb	C add the incoming carry
468	srlx	i16, 16, %g4		C part of i16 that belongs to the next word
469	add	%g2, rlimb, %l5
470	stw	%l5, [%i0+0]		C write high 32 bits of the result limb
471
472C WIND-DOWN PHASE 2
C Also the join point for the n == 1 path (.L1 branches here).  No multiplies
C or adds remain on the fp side.  The accumulators a032/a048 and then a064/a080
C are converted and stored while the integer side retires one more limb, still
C adding in words read from rp.
473.L_wd2:	and	%g2, xffffffff, %g2	C keep the low 32 bits of the previous i16 << 16
474	fdtox	a032, out000
475	lduw	[%i0+4+8], r00		C read low 32 bits of rp[i]
476	add	%g2, rlimb, %l5		C full sum of the previous word
477	fdtox	a048, out016
478	srlx	%l5, 32, cy		C carry out of the previous word
479	ldx	[%sp+2223+16], i00	C read the slot before the std below overwrites it
480	add	%g4, cy, cy		C new cy
481	ldx	[%sp+2223+24], i16	C read the slot before the std below overwrites it
482	std	out000, [%sp+2223+16]
483	add	i00, r00, rlimb		C product-sum plus the existing rp word
484	add	%i0, 8, %i0		C BOOKKEEPING: advance rp to the limb being written
485	std	out016, [%sp+2223+24]
486	sllx	i16, 16, %g2		C align the weight-16 sum with the weight-0 sum
487	add	cy, rlimb, rlimb	C add the incoming carry
488	srlx	i16, 16, %g4		C part of i16 that belongs to the next word
489	add	%g2, rlimb, %l5
490	stw	%l5, [%i0+4]		C write low 32 bits of the result limb
491C mid
492	and	%g2, xffffffff, %g2	C keep the low 32 bits of the previous i16 << 16
493	fdtox	a064, out000
494	lduw	[%i0+0], r32		C read high 32 bits of rp[i]
495	add	%g2, rlimb, %l5		C full sum of the previous word
496	fdtox	a080, out016
497	srlx	%l5, 32, cy		C carry out of the previous word
498	ldx	[%sp+2223+0], i00	C read the slot before the std below overwrites it
499	add	%g4, cy, cy		C new cy
500	ldx	[%sp+2223+8], i16	C read the slot before the std below overwrites it
501	std	out000, [%sp+2223+0]
502	add	i00, r32, rlimb		C product-sum plus the existing rp word
503	std	out016, [%sp+2223+8]
504	sllx	i16, 16, %g2		C align the weight-16 sum with the weight-0 sum
505	add	cy, rlimb, rlimb	C add the incoming carry
506	srlx	i16, 16, %g4		C part of i16 that belongs to the next word
507	add	%g2, rlimb, %l5
508	stw	%l5, [%i0+0]		C write high 32 bits of the result limb
509
510C WIND-DOWN PHASE 3
C Last stage.  The two leftover products p096b/p112b are converted and stored.
C One more limb is written through rp, this time without reading rp first: the
C stack values are loaded straight into rlimb.  The final carry and the last
C pair of stack values are then combined into the return value in %o0.
511.L_wd3:	and	%g2, xffffffff, %g2	C keep the low 32 bits of the previous i16 << 16
512	fdtox	p096b, out000
513	add	%g2, rlimb, %l5		C full sum of the previous word
514	fdtox	p112b, out016
515	srlx	%l5, 32, cy		C carry out of the previous word
516	ldx	[%sp+2223+16], rlimb	C no rp word is added here; read before the std below overwrites the slot
517	add	%g4, cy, cy		C new cy
518	ldx	[%sp+2223+24], i16	C read the slot before the std below overwrites it
519	std	out000, [%sp+2223+16]
520	add	%i0, 8, %i0		C BOOKKEEPING: advance rp to the limb being written
521	std	out016, [%sp+2223+24]
522	sllx	i16, 16, %g2		C align the weight-16 sum with the weight-0 sum
523	add	cy, rlimb, rlimb	C add the incoming carry
524	srlx	i16, 16, %g4		C part of i16 that belongs to the next word
525	add	%g2, rlimb, %l5
526	stw	%l5, [%i0+4]		C write low 32 bits of the last limb stored
527C mid
528	and	%g2, xffffffff, %g2	C keep the low 32 bits of the previous i16 << 16
529	add	%g2, rlimb, %l5		C full sum of the previous word
530	srlx	%l5, 32, cy		C carry out of the previous word
531	ldx	[%sp+2223+0], rlimb	C again no rp word is added
532	add	%g4, cy, cy		C new cy
533	ldx	[%sp+2223+8], i16
534	sllx	i16, 16, %g2		C align the weight-16 sum with the weight-0 sum
535	add	cy, rlimb, rlimb	C add the incoming carry
536	srlx	i16, 16, %g4		C part of i16 that belongs to the next word
537	add	%g2, rlimb, %l5
538	stw	%l5, [%i0+0]		C write high 32 bits of the last limb stored
539
C Return value: carry out of the last stored word plus the converted
C p096b/p112b values written to slots 16 and 24 above.
540	and	%g2, xffffffff, %g2	C keep the low 32 bits of the previous i16 << 16
541	add	%g2, rlimb, %l5		C full sum of the previous word
542	srlx	%l5, 32, cy		C carry out of the previous word
543	ldx	[%sp+2223+16], i00
544	add	%g4, cy, cy		C new cy
545	ldx	[%sp+2223+24], i16
546
547	sllx	i16, 16, %g2		C g2 is a global, so it is still readable in the delay slot below
548	add	i00, cy, cy		C fold the local i00 into the global cy before the window is restored
549	return	%i7+8			C restore the register window and return to the caller
550	add	%g2, cy, %o0		C delay slot: runs in the restored window, so it reads only globals and writes the %o0 seen by the caller
551EPILOGUE(mpn_addmul_2)
552