1dnl  SPARC v9 mpn_add_n -- Add two limb vectors of the same length > 0 and
2dnl  store sum in a third limb vector.
3
4dnl  Copyright 2001-2003, 2011 Free Software Foundation, Inc.
5
6dnl  This file is part of the GNU MP Library.
7dnl
8dnl  The GNU MP Library is free software; you can redistribute it and/or modify
9dnl  it under the terms of either:
10dnl
11dnl    * the GNU Lesser General Public License as published by the Free
12dnl      Software Foundation; either version 3 of the License, or (at your
13dnl      option) any later version.
14dnl
15dnl  or
16dnl
17dnl    * the GNU General Public License as published by the Free Software
18dnl      Foundation; either version 2 of the License, or (at your option) any
19dnl      later version.
20dnl
21dnl  or both in parallel, as here.
22dnl
23dnl  The GNU MP Library is distributed in the hope that it will be useful, but
24dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
25dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
26dnl  for more details.
27dnl
28dnl  You should have received copies of the GNU General Public License and the
29dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
30dnl  see https://www.gnu.org/licenses/.
31
32include(`../config.m4')
33
34C		   cycles/limb
35C UltraSPARC 1&2:     4
36C UltraSPARC 3:	      4.5
37
38C Compute carry-out from the most significant bits of u,v, and r, where
39C r=u+v+carry_in, using logic operations.
40
41C This code runs at 4 cycles/limb on UltraSPARC 1 and 2.  It has a 4 insn
42C recurrence, and on the UltraSPARC 1 and 2 the IE units are 100% saturated.
43C Therefore, it seems futile to try to optimize this any further...
44
45C INPUT PARAMETERS
C The code below stores sum limbs through rp, loads operand limbs through up
C and vp, and counts n down as limbs are consumed.
46define(`rp', `%i0')
47define(`up', `%i1')
48define(`vp', `%i2')
49define(`n',  `%i3')
50
C Limb registers: u0-u3 receive the loads from up and v0-v3 the loads from vp,
C giving the four limb pairs that are live at once in the unrolled code.
51define(`u0', `%l0')
52define(`u1', `%l2')
53define(`u2', `%l4')
54define(`u3', `%l6')
55define(`v0', `%l1')
56define(`v1', `%l3')
57define(`v2', `%l5')
58define(`v3', `%l7')
59
C Carry passed from limb to limb.  mpn_add_n clears it on entry; mpn_add_nc
C never writes it before its first use, so it starts from whatever is in %i4.
C After each limb it is reloaded by a logical right shift of 63, i.e. 0 or 1.
60define(`cy',`%i4')
61
62define(`fanop',`fitod %f0,%f2')		dnl  A quasi nop running in the FA pipe
63define(`fmnop',`fmuld %f0,%f0,%f4')	dnl  A quasi nop running in the FM pipe
64
C ASM_START and REGISTER are macros supplied from outside this file
C (presumably through the config.m4 include above -- confirm).  The two
C REGISTER lines name %g2 and %g3, which the code below uses as temporaries
C for the carry computation.
65ASM_START()
66	REGISTER(%g2,#scratch)
67	REGISTER(%g3,#scratch)
C mpn_add_nc -- entry point that adds with a carry-in.
C
C Performs the same setup as mpn_add_n except that cy (%i4) is never written
C here, so the value it holds on entry is used as the carry into the lowest
C limb.  All the real work is done by mpn_add_n code: this stub only picks
C the entry label (.Loop0 when n < 4, L(com) otherwise) and jumps there.
68PROLOGUE(mpn_add_nc)
69	save	%sp,-160,%sp
70
71	fitod	%f0,%f0		C make sure f0 contains small, quiet number
72	subcc	n,4,%g0		C set condition codes from n - 4, result discarded
73	bl,pn	%xcc,.Loop0	C n < 4: only the one-limb-per-pass loop is needed
74	nop			C delay slot; mpn_add_n has mov 0,cy here instead
75	b,a	L(com)		C n >= 4: join mpn_add_n; ,a annuls the delay slot
76EPILOGUE()
77
C mpn_add_n -- add the n-limb vectors at up and vp, store the n-limb sum at
C rp, and return the carry out of the most significant limb in %i0 (moved
C there from cy at .Lret, just before ret/restore).  n must be > 0.
C
C Layout:
C   L(com)     load the first four limb pairs and start limb 0; reached only
C              when n >= 4, either by fall-through or from mpn_add_nc
C   .Loop      software-pipelined main loop, four limbs per pass
C   .Lend4567  finish the four limb pairs still held in registers
C   .Loop0     one limb per pass: the 1-3 leftover limbs, or every limb when
C              n < 4
C
C Carry handling: no add-with-carry instruction is used.  For each limb, with
C r = u + v + cy, the carry out is bit 63 of  (u & v) | ((u | v) & ~r),
C computed by the recurring sequence  or / andn / and / or / srlx 63.
78PROLOGUE(mpn_add_n)
79	save	%sp,-160,%sp
80
81	fitod	%f0,%f0		C make sure f0 contains small, quiet number
82	subcc	n,4,%g0
83	bl,pn	%xcc,.Loop0
84	mov	0,cy		C delay slot (not annulled): cy = 0 on both paths
85L(com):
C Preload limbs 0-3 of both operands.  up and vp are advanced by 32 part way
C through, so the later loads use negative offsets.
86	ldx	[up+0],u0
87	ldx	[vp+0],v0
88	add	up,32,up
89	ldx	[up-24],u1
90	ldx	[vp+8],v1
91	add	vp,32,vp
92	ldx	[up-16],u2
93	ldx	[vp-16],v2
94	ldx	[up-8],u3
95	ldx	[vp-8],v3
C n -= 8.  A negative result means fewer than 8 limbs in total, so there is
C no second group of four to preload and the main loop is skipped.
96	subcc	n,8,n
97	add	u0,v0,%g1	C main add
98	add	%g1,cy,%g5	C carry add
99	or	u0,v0,%g2
100	bl,pn	%xcc,.Lend4567
101	fanop
102	b,a	.Loop		C jump to the aligned loop head (presumably to skip any .align padding)
103
104	.align	16
105C START MAIN LOOP
C On entry to each pass: %g5 = u0 + v0 + cy (sum limb not yet stored) and
C %g2 = u0 | v0.  For each of the four limbs the pass finishes the carry-out
C computation, stores the pending sum, starts the next limb's add, and
C reloads the just-consumed u/v registers with the limbs four positions
C ahead.  The code is laid out in groups of four instructions separated by
C "C --"; fanop and fmnop (quasi nops in the FA and FM pipes, see their
C defines) fill the slots that have no integer or memory work.
106.Loop:	andn	%g2,%g5,%g2
107	and	u0,v0,%g3
108	ldx	[up+0],u0
109	fanop
110C --
111	or	%g3,%g2,%g2
112	ldx	[vp+0],v0
113	add	up,32,up
114	fanop
115C --
116	srlx	%g2,63,cy
117	add	u1,v1,%g1
118	stx	%g5,[rp+0]
119	fanop
120C --
121	add	%g1,cy,%g5
122	or	u1,v1,%g2
123	fmnop
124	fanop
125C --
126	andn	%g2,%g5,%g2
127	and	u1,v1,%g3
128	ldx	[up-24],u1
129	fanop
130C --
131	or	%g3,%g2,%g2
132	ldx	[vp+8],v1
133	add	vp,32,vp
134	fanop
135C --
136	srlx	%g2,63,cy
137	add	u2,v2,%g1
138	stx	%g5,[rp+8]
139	fanop
140C --
141	add	%g1,cy,%g5
142	or	u2,v2,%g2
143	fmnop
144	fanop
145C --
146	andn	%g2,%g5,%g2
147	and	u2,v2,%g3
148	ldx	[up-16],u2
149	fanop
150C --
151	or	%g3,%g2,%g2
152	ldx	[vp-16],v2
153	add	rp,32,rp	C rp advanced here; the last two stores use -16 and -8
154	fanop
155C --
156	srlx	%g2,63,cy
157	add	u3,v3,%g1
158	stx	%g5,[rp-16]
159	fanop
160C --
161	add	%g1,cy,%g5
162	or	u3,v3,%g2
163	fmnop
164	fanop
165C --
166	andn	%g2,%g5,%g2
167	and	u3,v3,%g3
168	ldx	[up-8],u3
169	fanop
170C --
171	or	%g3,%g2,%g2
172	subcc	n,4,n		C sets the codes tested by bge below; nothing in between changes them
173	ldx	[vp-8],v3
174	fanop
175C --
176	srlx	%g2,63,cy
177	add	u0,v0,%g1
178	stx	%g5,[rp-8]
179	fanop
180C --
181	add	%g1,cy,%g5
182	or	u0,v0,%g2
183	bge,pt	%xcc,.Loop
184	fanop
185C END MAIN LOOP
C Wind-down: the same per-limb sequence as the loop body, but with no loads
C and no filler nops.  It consumes the four limb pairs currently in u0-u3 and
C v0-v3, starting from the pending %g5 / %g2 for limb 0.
186.Lend4567:
187	andn	%g2,%g5,%g2
188	and	u0,v0,%g3
189	or	%g3,%g2,%g2
190	srlx	%g2,63,cy
191	add	u1,v1,%g1
192	stx	%g5,[rp+0]
193	add	%g1,cy,%g5
194	or	u1,v1,%g2
195	andn	%g2,%g5,%g2
196	and	u1,v1,%g3
197	or	%g3,%g2,%g2
198	srlx	%g2,63,cy
199	add	u2,v2,%g1
200	stx	%g5,[rp+8]
201	add	%g1,cy,%g5
202	or	u2,v2,%g2
203	andn	%g2,%g5,%g2
204	and	u2,v2,%g3
205	or	%g3,%g2,%g2
206	add	rp,32,rp
207	srlx	%g2,63,cy
208	add	u3,v3,%g1
209	stx	%g5,[rp-16]
210	add	%g1,cy,%g5
211	or	u3,v3,%g2
212	andn	%g2,%g5,%g2
213	and	u3,v3,%g3
214	or	%g3,%g2,%g2
215	srlx	%g2,63,cy
216	stx	%g5,[rp-8]
217
C n is 4 less than the number of limbs still to do at this point; undo that
C bias and return immediately when nothing is left.
218	addcc	n,4,n
219	bz,pn	%xcc,.Lret
220	fanop
221
C One limb per pass.  The pointers are bumped before the store, hence the
C [rp-8].  The srlx that produces the new cy sits in the branch delay slot
C and so runs on every pass, including the last one.
222.Loop0:	ldx	[up],u0
223	add	up,8,up
224	ldx	[vp],v0
225	add	vp,8,vp
226	add	rp,8,rp
227	subcc	n,1,n
228	add	u0,v0,%g1
229	or	u0,v0,%g2
230	add	%g1,cy,%g5
231	and	u0,v0,%g3
232	andn	%g2,%g5,%g2
233	stx	%g5,[rp-8]
234	or	%g3,%g2,%g2
235	bnz,pt	%xcc,.Loop0
236	srlx	%g2,63,cy
237
238.Lret:	mov	cy,%i0		C return value: final carry
239	ret
240	restore
241EPILOGUE()
242