dnl  SPARC mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
dnl  store difference in a third limb vector.

dnl  Copyright 1995, 1996, 2000 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C INPUT PARAMETERS (SPARC 32-bit ABI: first args arrive in %o0..%o3)
define(res_ptr,%o0)	C mp_ptr    rp  -- destination limb vector
define(s1_ptr,%o1)	C mp_srcptr up  -- minuend limb vector
define(s2_ptr,%o2)	C mp_srcptr vp  -- subtrahend limb vector
define(n,%o3)		C mp_size_t n   -- number of limbs, n > 0
ASM_START()
PROLOGUE(mpn_sub_n)
C mp_limb_t mpn_sub_n (mp_ptr res_ptr, mp_srcptr s1_ptr, mp_srcptr s2_ptr,
C                      mp_size_t n)
C Compute {res_ptr,n} = {s1_ptr,n} - {s2_ptr,n}; return borrow-out in %o0.
C Three code paths, chosen by the mutual 4-byte alignment of the pointers so
C that ldd/std (which need 8-byte alignment) can be used where possible.
C The carry (borrow) flag cannot survive the ld/add bookkeeping, so it is
C repeatedly saved with addx and re-created with "subcc %g0,%o4,%g0".
	xor	s2_ptr,res_ptr,%g1
	andcc	%g1,4,%g0
	bne	L(1)			C branch if alignment differs
	nop
C **  V1a  **  s2_ptr and res_ptr have the same word alignment
	andcc	res_ptr,4,%g0		C res_ptr unaligned? Side effect: cy=0
	be	L(v1)			C if no, branch
	nop
C Subtract least significant limb separately to align res_ptr and s2_ptr
	ld	[s1_ptr],%g4
	add	s1_ptr,4,s1_ptr
	ld	[s2_ptr],%g2
	add	s2_ptr,4,s2_ptr
	add	n,-1,n
	subcc	%g4,%g2,%o4
	st	%o4,[res_ptr]
	add	res_ptr,4,res_ptr
L(v1):	addx	%g0,%g0,%o4		C save cy in register
	cmp	n,2			C if n < 2 ...
	bl	L(end2)			C ... branch to tail code
	subcc	%g0,%o4,%g0		C restore cy

	ld	[s1_ptr+0],%g4
	addcc	n,-10,n
	ld	[s1_ptr+4],%g1
	ldd	[s2_ptr+0],%g2
	blt	L(fin1)
	subcc	%g0,%o4,%g0		C restore cy
C Subtract blocks of 8 limbs until less than 8 limbs remain
L(loop1):
	subxcc	%g4,%g2,%o4
	ld	[s1_ptr+8],%g4
	subxcc	%g1,%g3,%o5
	ld	[s1_ptr+12],%g1
	ldd	[s2_ptr+8],%g2
	std	%o4,[res_ptr+0]
	subxcc	%g4,%g2,%o4
	ld	[s1_ptr+16],%g4
	subxcc	%g1,%g3,%o5
	ld	[s1_ptr+20],%g1
	ldd	[s2_ptr+16],%g2
	std	%o4,[res_ptr+8]
	subxcc	%g4,%g2,%o4
	ld	[s1_ptr+24],%g4
	subxcc	%g1,%g3,%o5
	ld	[s1_ptr+28],%g1
	ldd	[s2_ptr+24],%g2
	std	%o4,[res_ptr+16]
	subxcc	%g4,%g2,%o4
	ld	[s1_ptr+32],%g4
	subxcc	%g1,%g3,%o5
	ld	[s1_ptr+36],%g1
	ldd	[s2_ptr+32],%g2
	std	%o4,[res_ptr+24]
	addx	%g0,%g0,%o4		C save cy in register
	addcc	n,-8,n
	add	s1_ptr,32,s1_ptr
	add	s2_ptr,32,s2_ptr
	add	res_ptr,32,res_ptr
	bge	L(loop1)
	subcc	%g0,%o4,%g0		C restore cy

L(fin1):
	addcc	n,8-2,n
	blt	L(end1)
	subcc	%g0,%o4,%g0		C restore cy
C Subtract blocks of 2 limbs until less than 2 limbs remain
L(loope1):
	subxcc	%g4,%g2,%o4
	ld	[s1_ptr+8],%g4
	subxcc	%g1,%g3,%o5
	ld	[s1_ptr+12],%g1
	ldd	[s2_ptr+8],%g2
	std	%o4,[res_ptr+0]
	addx	%g0,%g0,%o4		C save cy in register
	addcc	n,-2,n
	add	s1_ptr,8,s1_ptr
	add	s2_ptr,8,s2_ptr
	add	res_ptr,8,res_ptr
	bge	L(loope1)
	subcc	%g0,%o4,%g0		C restore cy
L(end1):
	subxcc	%g4,%g2,%o4
	subxcc	%g1,%g3,%o5
	std	%o4,[res_ptr+0]
	addx	%g0,%g0,%o4		C save cy in register

	andcc	n,1,%g0
	be	L(ret1)
	subcc	%g0,%o4,%g0		C restore cy
C Subtract last limb
	ld	[s1_ptr+8],%g4
	ld	[s2_ptr+8],%g2
	subxcc	%g4,%g2,%o4
	st	%o4,[res_ptr+8]

L(ret1):
	retl
	addx	%g0,%g0,%o0		C return borrow-out from most sign. limb

L(1):	xor	s1_ptr,res_ptr,%g1
	andcc	%g1,4,%g0
	bne	L(2)
	nop
C **  V1b  **  s1_ptr and res_ptr have the same word alignment
	andcc	res_ptr,4,%g0		C res_ptr unaligned? Side effect: cy=0
	be	L(v1b)			C if no, branch
	nop
C Subtract least significant limb separately to align res_ptr and s1_ptr
	ld	[s2_ptr],%g4
	add	s2_ptr,4,s2_ptr
	ld	[s1_ptr],%g2
	add	s1_ptr,4,s1_ptr
	add	n,-1,n
	subcc	%g2,%g4,%o4
	st	%o4,[res_ptr]
	add	res_ptr,4,res_ptr
L(v1b):	addx	%g0,%g0,%o4		C save cy in register
	cmp	n,2			C if n < 2 ...
	bl	L(end2)			C ... branch to tail code
	subcc	%g0,%o4,%g0		C restore cy

	ld	[s2_ptr+0],%g4
	addcc	n,-10,n
	ld	[s2_ptr+4],%g1
	ldd	[s1_ptr+0],%g2
	blt	L(fin1b)
	subcc	%g0,%o4,%g0		C restore cy
C Subtract blocks of 8 limbs until less than 8 limbs remain
L(loop1b):
	subxcc	%g2,%g4,%o4
	ld	[s2_ptr+8],%g4
	subxcc	%g3,%g1,%o5
	ld	[s2_ptr+12],%g1
	ldd	[s1_ptr+8],%g2
	std	%o4,[res_ptr+0]
	subxcc	%g2,%g4,%o4
	ld	[s2_ptr+16],%g4
	subxcc	%g3,%g1,%o5
	ld	[s2_ptr+20],%g1
	ldd	[s1_ptr+16],%g2
	std	%o4,[res_ptr+8]
	subxcc	%g2,%g4,%o4
	ld	[s2_ptr+24],%g4
	subxcc	%g3,%g1,%o5
	ld	[s2_ptr+28],%g1
	ldd	[s1_ptr+24],%g2
	std	%o4,[res_ptr+16]
	subxcc	%g2,%g4,%o4
	ld	[s2_ptr+32],%g4
	subxcc	%g3,%g1,%o5
	ld	[s2_ptr+36],%g1
	ldd	[s1_ptr+32],%g2
	std	%o4,[res_ptr+24]
	addx	%g0,%g0,%o4		C save cy in register
	addcc	n,-8,n
	add	s1_ptr,32,s1_ptr
	add	s2_ptr,32,s2_ptr
	add	res_ptr,32,res_ptr
	bge	L(loop1b)
	subcc	%g0,%o4,%g0		C restore cy

L(fin1b):
	addcc	n,8-2,n
	blt	L(end1b)
	subcc	%g0,%o4,%g0		C restore cy
C Subtract blocks of 2 limbs until less than 2 limbs remain
L(loope1b):
	subxcc	%g2,%g4,%o4
	ld	[s2_ptr+8],%g4
	subxcc	%g3,%g1,%o5
	ld	[s2_ptr+12],%g1
	ldd	[s1_ptr+8],%g2
	std	%o4,[res_ptr+0]
	addx	%g0,%g0,%o4		C save cy in register
	addcc	n,-2,n
	add	s1_ptr,8,s1_ptr
	add	s2_ptr,8,s2_ptr
	add	res_ptr,8,res_ptr
	bge	L(loope1b)
	subcc	%g0,%o4,%g0		C restore cy
L(end1b):
	subxcc	%g2,%g4,%o4
	subxcc	%g3,%g1,%o5
	std	%o4,[res_ptr+0]
	addx	%g0,%g0,%o4		C save cy in register

	andcc	n,1,%g0
	be	L(ret1b)
	subcc	%g0,%o4,%g0		C restore cy
C Subtract last limb
	ld	[s2_ptr+8],%g4
	ld	[s1_ptr+8],%g2
	subxcc	%g2,%g4,%o4
	st	%o4,[res_ptr+8]

L(ret1b):
	retl
	addx	%g0,%g0,%o0		C return borrow-out from most sign. limb

C **  V2  **
C If we come here, the alignment of s1_ptr and res_ptr as well as the
C alignment of s2_ptr and res_ptr differ.  Since there are only two ways
C things can be aligned (that we care about) we now know that the alignment
C of s1_ptr and s2_ptr are the same.

L(2):	cmp	n,1
	be	L(jone)
	nop
	andcc	s1_ptr,4,%g0		C s1_ptr unaligned? Side effect: cy=0
	be	L(v2)			C if no, branch
	nop
C Subtract least significant limb separately to align s1_ptr and s2_ptr
	ld	[s1_ptr],%g4
	add	s1_ptr,4,s1_ptr
	ld	[s2_ptr],%g2
	add	s2_ptr,4,s2_ptr
	add	n,-1,n
	subcc	%g4,%g2,%o4
	st	%o4,[res_ptr]
	add	res_ptr,4,res_ptr

L(v2):	addx	%g0,%g0,%o4		C save cy in register
	addcc	n,-8,n
	blt	L(fin2)
	subcc	%g0,%o4,%g0		C restore cy
C Subtract blocks of 8 limbs until less than 8 limbs remain
L(loop2):
	ldd	[s1_ptr+0],%g2
	ldd	[s2_ptr+0],%o4
	subxcc	%g2,%o4,%g2
	st	%g2,[res_ptr+0]
	subxcc	%g3,%o5,%g3
	st	%g3,[res_ptr+4]
	ldd	[s1_ptr+8],%g2
	ldd	[s2_ptr+8],%o4
	subxcc	%g2,%o4,%g2
	st	%g2,[res_ptr+8]
	subxcc	%g3,%o5,%g3
	st	%g3,[res_ptr+12]
	ldd	[s1_ptr+16],%g2
	ldd	[s2_ptr+16],%o4
	subxcc	%g2,%o4,%g2
	st	%g2,[res_ptr+16]
	subxcc	%g3,%o5,%g3
	st	%g3,[res_ptr+20]
	ldd	[s1_ptr+24],%g2
	ldd	[s2_ptr+24],%o4
	subxcc	%g2,%o4,%g2
	st	%g2,[res_ptr+24]
	subxcc	%g3,%o5,%g3
	st	%g3,[res_ptr+28]
	addx	%g0,%g0,%o4		C save cy in register
	addcc	n,-8,n
	add	s1_ptr,32,s1_ptr
	add	s2_ptr,32,s2_ptr
	add	res_ptr,32,res_ptr
	bge	L(loop2)
	subcc	%g0,%o4,%g0		C restore cy

L(fin2):
	addcc	n,8-2,n
	blt	L(end2)
	subcc	%g0,%o4,%g0		C restore cy
C Subtract blocks of 2 limbs until less than 2 limbs remain
L(loope2):
	ldd	[s1_ptr+0],%g2
	ldd	[s2_ptr+0],%o4
	subxcc	%g2,%o4,%g2
	st	%g2,[res_ptr+0]
	subxcc	%g3,%o5,%g3
	st	%g3,[res_ptr+4]
	addx	%g0,%g0,%o4		C save cy in register
	addcc	n,-2,n
	add	s1_ptr,8,s1_ptr
	add	s2_ptr,8,s2_ptr
	add	res_ptr,8,res_ptr
	bge	L(loope2)
	subcc	%g0,%o4,%g0		C restore cy
L(end2):
	andcc	n,1,%g0
	be	L(ret2)
	subcc	%g0,%o4,%g0		C restore cy
C Subtract last limb
L(jone):
	ld	[s1_ptr],%g4
	ld	[s2_ptr],%g2
	subxcc	%g4,%g2,%o4
	st	%o4,[res_ptr]

L(ret2):
	retl
	addx	%g0,%g0,%o0		C return borrow-out from most sign. limb
EPILOGUE(mpn_sub_n)