dnl  SPARC v9 mpn_add_n -- Add two limb vectors of the same length > 0 and
dnl  store sum in a third limb vector.

dnl  Copyright 2001, 2002, 2003 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C		   cycles/limb
C UltraSPARC 1&2:     4
C UltraSPARC 3:	      4.5
C Compute carry-out from the most significant bits of u,v, and r, where
C r=u+v+carry_in, using logic operations.
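C
C A minimal C sketch of that recurrence (assuming 64-bit limbs; variable
C names are illustrative only):
C
C	r  = u + v + cy;			/* cy is 0 or 1 */
C	cy = ((u & v) | ((u | v) & ~r)) >> 63;
C
C The top bit of (u & v) | ((u | v) & ~r) is set exactly when bit 63
C carries out, which is what the and/or/andn/srlx sequence below computes.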

C This code runs at 4 cycles/limb on UltraSPARC 1 and 2.  It has a 4 insn
C recurrence, and on UltraSPARC 1 and 2 the IE units are 100% saturated.
C Therefore, it seems futile to try to optimize this any further...

C INPUT PARAMETERS
define(`rp',`%i0')
define(`up',`%i1')
define(`vp',`%i2')
define(`n',`%i3')

define(`u0',`%l0')
define(`u1',`%l2')
define(`u2',`%l4')
define(`u3',`%l6')
define(`v0',`%l1')
define(`v1',`%l3')
define(`v2',`%l5')
define(`v3',`%l7')

define(`cy',`%i4')

define(`fanop',`fitod %f0,%f2')		dnl  A quasi nop running in the FA pipe
define(`fmnop',`fmuld %f0,%f0,%f4')	dnl  A quasi nop running in the FM pipe
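dnl  These execute in the floating-point pipes rather than the integer
dnl  units, so they appear to serve only to pad dispatch groups without
dnl  consuming IE issue slots.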

ASM_START()
	REGISTER(%g2,#scratch)
	REGISTER(%g3,#scratch)
PROLOGUE(mpn_add_n)
	save	%sp,-160,%sp

	fitod	%f0,%f0		C make sure f0 contains small, quiet number
	subcc	n,4,%g0
	bl,pn	%icc,.Loop0
	mov	0,cy

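C Prime the 4-way unrolled loop: load the first four limbs from each
C operand and start the first add.  If fewer than 8 limbs are left,
C branch to the wind-down code at .Lend4567.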
	ldx	[up+0],u0
	ldx	[vp+0],v0
	add	up,32,up
	ldx	[up-24],u1
	ldx	[vp+8],v1
	add	vp,32,vp
	ldx	[up-16],u2
	ldx	[vp-16],v2
	ldx	[up-8],u3
	ldx	[vp-8],v3
	subcc	n,8,n
	add	u0,v0,%g1	C main add
	add	%g1,cy,%g4	C carry add
	or	u0,v0,%g2
	bl,pn	%icc,.Lend4567
	fanop
	b,a	.Loop

	.align	16
C START MAIN LOOP
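C Each `C --' line below appears to mark one 4-insn dispatch group:
C 16 groups handle 4 limbs, giving the 4 cycles/limb cited above.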
.Loop:	andn	%g2,%g4,%g2
	and	u0,v0,%g3
	ldx	[up+0],u0
	fanop
C --
	or	%g3,%g2,%g2
	ldx	[vp+0],v0
	add	up,32,up
	fanop
C --
	srlx	%g2,63,cy
	add	u1,v1,%g1
	stx	%g4,[rp+0]
	fanop
C --
	add	%g1,cy,%g4
	or	u1,v1,%g2
	fmnop
	fanop
C --
	andn	%g2,%g4,%g2
	and	u1,v1,%g3
	ldx	[up-24],u1
	fanop
C --
	or	%g3,%g2,%g2
	ldx	[vp+8],v1
	add	vp,32,vp
	fanop
C --
	srlx	%g2,63,cy
	add	u2,v2,%g1
	stx	%g4,[rp+8]
	fanop
C --
	add	%g1,cy,%g4
	or	u2,v2,%g2
	fmnop
	fanop
C --
	andn	%g2,%g4,%g2
	and	u2,v2,%g3
	ldx	[up-16],u2
	fanop
C --
	or	%g3,%g2,%g2
	ldx	[vp-16],v2
	add	rp,32,rp
	fanop
C --
	srlx	%g2,63,cy
	add	u3,v3,%g1
	stx	%g4,[rp-16]
	fanop
C --
	add	%g1,cy,%g4
	or	u3,v3,%g2
	fmnop
	fanop
C --
	andn	%g2,%g4,%g2
	and	u3,v3,%g3
	ldx	[up-8],u3
	fanop
C --
	or	%g3,%g2,%g2
	subcc	n,4,n
	ldx	[vp-8],v3
	fanop
C --
	srlx	%g2,63,cy
	add	u0,v0,%g1
	stx	%g4,[rp-8]
	fanop
C --
	add	%g1,cy,%g4
	or	u0,v0,%g2
	bge,pt	%icc,.Loop
	fanop
C END MAIN LOOP
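C Wind-down: complete the four limbs already loaded (the add of the
C first is in flight) and store their results.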
.Lend4567:
	andn	%g2,%g4,%g2
	and	u0,v0,%g3
	or	%g3,%g2,%g2
	srlx	%g2,63,cy
	add	u1,v1,%g1
	stx	%g4,[rp+0]
	add	%g1,cy,%g4
	or	u1,v1,%g2
	andn	%g2,%g4,%g2
	and	u1,v1,%g3
	or	%g3,%g2,%g2
	srlx	%g2,63,cy
	add	u2,v2,%g1
	stx	%g4,[rp+8]
	add	%g1,cy,%g4
	or	u2,v2,%g2
	andn	%g2,%g4,%g2
	and	u2,v2,%g3
	or	%g3,%g2,%g2
	add	rp,32,rp
	srlx	%g2,63,cy
	add	u3,v3,%g1
	stx	%g4,[rp-16]
	add	%g1,cy,%g4
	or	u3,v3,%g2
	andn	%g2,%g4,%g2
	and	u3,v3,%g3
	or	%g3,%g2,%g2
	srlx	%g2,63,cy
	stx	%g4,[rp-8]

	addcc	n,4,n
	bz,pn	%icc,.Lret
	fanop

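C Simple loop for the remaining 0-3 limbs, also used when n < 4 on entry.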
.Loop0:	ldx	[up],u0
	add	up,8,up
	ldx	[vp],v0
	add	vp,8,vp
	add	rp,8,rp
	subcc	n,1,n
	add	u0,v0,%g1
	or	u0,v0,%g2
	add	%g1,cy,%g4
	and	u0,v0,%g3
	andn	%g2,%g4,%g2
	stx	%g4,[rp-8]
	or	%g3,%g2,%g2
	bnz,pt	%icc,.Loop0
	srlx	%g2,63,cy

.Lret:	mov	cy,%i0
	ret
	restore
EPILOGUE(mpn_add_n)