1dnl  SPARC v9 mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
2dnl  store difference in a third limb vector.
3
4dnl  Copyright 2001, 2002, 2003 Free Software Foundation, Inc.
5
6dnl  This file is part of the GNU MP Library.
7
8dnl  The GNU MP Library is free software; you can redistribute it and/or modify
9dnl  it under the terms of the GNU Lesser General Public License as published
10dnl  by the Free Software Foundation; either version 3 of the License, or (at
11dnl  your option) any later version.
12
13dnl  The GNU MP Library is distributed in the hope that it will be useful, but
14dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
16dnl  License for more details.
17
18dnl  You should have received a copy of the GNU Lesser General Public License
19dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
20
21include(`../config.m4')
22
23C		   cycles/limb
24C UltraSPARC 1&2:     4
25C UltraSPARC 3:	      4.5
26
27C Compute carry-out from the most significant bits of u,v, and r, where
28C r=u-v-carry_in, using logic operations.
29
C This code runs at 4 cycles/limb on UltraSPARC 1 and 2.  It has a 4 insn
C recurrence, and on UltraSPARC 1 and 2 the IE units are 100% saturated.
C Therefore, it seems futile to try to optimize this any further...
33
C INPUT PARAMETERS
C In-registers as seen after the save in the prologue.  The difference is
C stored through rp, the operands are read through up and vp (up[i] - vp[i]),
C and n is the number of limbs.
define(`rp',`%i0')
define(`up',`%i1')
define(`vp',`%i2')
define(`n',`%i3')

C Source limbs held in local registers, four pairs at a time.  Each uK is
C loaded through up and the matching vK through vp.
define(`u0',`%l0')
define(`u1',`%l2')
define(`u2',`%l4')
define(`u3',`%l6')
define(`v0',`%l1')
define(`v1',`%l3')
define(`v2',`%l5')
define(`v3',`%l7')

C Borrow handed from one limb to the next.  It is cleared on entry and
C afterwards always produced by a shift right by 63, so it is 0 or 1.
define(`cy',`%i4')

define(`fanop',`fitod %f0,%f2')		dnl  A quasi nop running in the FA pipe
define(`fmnop',`fmuld %f0,%f0,%f4')	dnl  A quasi nop running in the FM pipe
53
ASM_START()
	REGISTER(%g2,#scratch)
	REGISTER(%g3,#scratch)

C mpn_sub_n (rp, up, vp, n)
C
C For each of the n limbs, lowest address first, store up[i] - vp[i] - cy at
C rp[i], where cy is the borrow out of the previous limb (0 for the first
C limb).  The final borrow, 0 or 1, is returned.  Limbs are 8 bytes wide
C (ldx/stx).  n must be at least 1, since .Loop0 processes a limb before it
C tests the count.
C
C With r = u - v - cy, the borrow out of a limb is the top bit of
C	(r | (~u & v)) & ~(u & ~v)
C and every limb evaluates it with the same five instructions:
C	orn	u,v,%g2		%g2 = u | ~v
C	orn	%g4,%g2,%g2	%g2 = r | (~u & v)
C	andn	u,v,%g3		%g3 = u & ~v
C	andn	%g2,%g3,%g2	%g2 = %g2 & ~%g3
C	srlx	%g2,63,cy	cy  = top bit of %g2
C %g1 holds u - v and %g4 holds r, which is the limb that gets stored.
C
C Every branch on n tests %xcc, the condition codes of the full 64-bit
C result.  %icc reflects only the low 32 bits, so a count of 2^32 or more
C could leave the loops early.
C NOTE(review): this assumes n arrives as a full 64-bit value, as in the
C 64-bit ABI -- confirm for any other ABI this file is built for.
PROLOGUE(mpn_sub_n)
	save	%sp,-160,%sp

	fitod	%f0,%f0		C make sure f0 contains small, quiet number
	subcc	n,4,%g0		C compare n with 4
	bl,pn	%xcc,.Loop0	C fewer than 4 limbs: do them one at a time
	mov	0,cy		C (delay slot) no borrow into the first limb

C Preload four limb pairs, stepping up and vp past them.
	ldx	[up+0],u0
	ldx	[vp+0],v0
	add	up,32,up
	ldx	[up-24],u1
	ldx	[vp+8],v1
	add	vp,32,vp
	ldx	[up-16],u2
	ldx	[vp-16],v2
	ldx	[up-8],u3
	ldx	[vp-8],v3
	subcc	n,8,n		C 4 limbs are loaded; .Loop needs 4 more to load
	sub	u0,v0,%g1	C main sub
	sub	%g1,cy,%g4	C carry sub
	orn	u0,v0,%g2
	bl,pn	%xcc,.Lend4567	C n was 4..7: only finish the loaded limbs
	fanop
	b,a	.Loop

	.align	16
C START MAIN LOOP
C On entry u0..u3 and v0..v3 hold four limb pairs that are not yet stored,
C %g4 = u0 - v0 - cy and %g2 = u0 | ~v0.  One pass stores those four limbs
C and reloads all eight registers with the next four pairs.  Each group
C between the C -- marks has four instructions, padded with the fanop and
C fmnop quasi nops.
C NOTE(review): the groups presumably match the issue groups of the target
C CPUs -- confirm before reordering anything.
.Loop:	orn	%g4,%g2,%g2
	andn	u0,v0,%g3
	ldx	[up+0],u0
	fanop
C --
	andn	%g2,%g3,%g2
	ldx	[vp+0],v0
	add	up,32,up
	fanop
C --
	srlx	%g2,63,cy
	sub	u1,v1,%g1
	stx	%g4,[rp+0]
	fanop
C --
	sub	%g1,cy,%g4
	orn	u1,v1,%g2
	fmnop
	fanop
C --
	orn	%g4,%g2,%g2
	andn	u1,v1,%g3
	ldx	[up-24],u1
	fanop
C --
	andn	%g2,%g3,%g2
	ldx	[vp+8],v1
	add	vp,32,vp
	fanop
C --
	srlx	%g2,63,cy
	sub	u2,v2,%g1
	stx	%g4,[rp+8]
	fanop
C --
	sub	%g1,cy,%g4
	orn	u2,v2,%g2
	fmnop
	fanop
C --
	orn	%g4,%g2,%g2
	andn	u2,v2,%g3
	ldx	[up-16],u2
	fanop
C --
	andn	%g2,%g3,%g2
	ldx	[vp-16],v2
	add	rp,32,rp	C the two remaining stores use negative offsets
	fanop
C --
	srlx	%g2,63,cy
	sub	u3,v3,%g1
	stx	%g4,[rp-16]
	fanop
C --
	sub	%g1,cy,%g4
	orn	u3,v3,%g2
	fmnop
	fanop
C --
	orn	%g4,%g2,%g2
	andn	u3,v3,%g3
	ldx	[up-8],u3
	fanop
C --
	andn	%g2,%g3,%g2
	subcc	n,4,n		C four more limbs are now loaded
	ldx	[vp-8],v3
	fanop
C --
	srlx	%g2,63,cy
	sub	u0,v0,%g1
	stx	%g4,[rp-8]
	fanop
C --
	sub	%g1,cy,%g4
	orn	u0,v0,%g2
	bge,pt	%xcc,.Loop	C tests the subcc above; nothing since sets cc
	fanop
C END MAIN LOOP

C Finish the four limb pairs still held in registers, without further loads.
C Entered with the same register state as the top of .Loop.
.Lend4567:
	orn	%g4,%g2,%g2
	andn	u0,v0,%g3
	andn	%g2,%g3,%g2
	srlx	%g2,63,cy
	sub	u1,v1,%g1
	stx	%g4,[rp+0]
	sub	%g1,cy,%g4
	orn	u1,v1,%g2
	orn	%g4,%g2,%g2
	andn	u1,v1,%g3
	andn	%g2,%g3,%g2
	srlx	%g2,63,cy
	sub	u2,v2,%g1
	stx	%g4,[rp+8]
	sub	%g1,cy,%g4
	orn	u2,v2,%g2
	orn	%g4,%g2,%g2
	andn	u2,v2,%g3
	andn	%g2,%g3,%g2
	add	rp,32,rp
	srlx	%g2,63,cy
	sub	u3,v3,%g1
	stx	%g4,[rp-16]
	sub	%g1,cy,%g4
	orn	u3,v3,%g2
	orn	%g4,%g2,%g2
	andn	u3,v3,%g3
	andn	%g2,%g3,%g2
	srlx	%g2,63,cy
	stx	%g4,[rp-8]

	addcc	n,4,n		C n is -4..-1 here; now the 0..3 limbs left
	bz,pn	%xcc,.Lret
	fanop

C One limb per pass: used for n < 4 and for the 1..3 limbs left over above.
.Loop0:	ldx	[up],u0
	add	up,8,up
	ldx	[vp],v0
	add	vp,8,vp
	add	rp,8,rp
	subcc	n,1,n
	sub	u0,v0,%g1
	orn	u0,v0,%g2
	sub	%g1,cy,%g4
	andn	u0,v0,%g3
	orn	%g4,%g2,%g2
	stx	%g4,[rp-8]	C rp was already advanced
	andn	%g2,%g3,%g2
	bnz,pt	%xcc,.Loop0	C tests the subcc above
	srlx	%g2,63,cy	C (delay slot) borrow for the next limb or the result

.Lret:	mov	cy,%i0		C return the final borrow
	ret
	restore
EPILOGUE(mpn_sub_n)
221