dnl  sub_n.asm revision 1.1.1.1
dnl  SPARC mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
dnl  store difference in a third limb vector.

dnl  Copyright 1995, 1996, 2000 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C INPUT PARAMETERS
define(res_ptr,%o0)
define(s1_ptr,%o1)
define(s2_ptr,%o2)
define(n,%o3)

ASM_START()
C  mpn_sub_n(res_ptr, s1_ptr, s2_ptr, n) -- res[] = s1[] - s2[], n limbs,
C  n > 0.  Returns the borrow-out (0 or 1) from the most significant limb.
C  The borrow is carried between blocks in %o4:  "save cy" = addx %g0,%g0,%o4
C  latches the carry flag;  "restore cy" = subcc %g0,%o4,%g0 re-creates it
C  (executed in branch delay slots).  Three paths are used depending on the
C  mutual word-alignment of the three pointers, so that ldd/std (64-bit,
C  8-byte-aligned) accesses can be used wherever possible.
PROLOGUE(mpn_sub_n)
	xor	s2_ptr,res_ptr,%g1
	andcc	%g1,4,%g0
	bne	L(1)			C branch if alignment differs
	nop
C **  V1a  **  s2_ptr and res_ptr have the same alignment
	andcc	res_ptr,4,%g0		C res_ptr unaligned? Side effect: cy=0
	be	L(v1)			C if no, branch
	nop
C Subtract least significant limb separately to align res_ptr and s2_ptr
	ld	[s1_ptr],%g4
	add	s1_ptr,4,s1_ptr
	ld	[s2_ptr],%g2
	add	s2_ptr,4,s2_ptr
	add	n,-1,n
	subcc	%g4,%g2,%o4
	st	%o4,[res_ptr]
	add	res_ptr,4,res_ptr
L(v1):	addx	%g0,%g0,%o4		C save cy in register
	cmp	n,2			C if n < 2 ...
	bl	L(end2)			C ... branch to tail code
	subcc	%g0,%o4,%g0		C restore cy

	ld	[s1_ptr+0],%g4
	addcc	n,-10,n
	ld	[s1_ptr+4],%g1
	ldd	[s2_ptr+0],%g2
	blt	L(fin1)
	subcc	%g0,%o4,%g0		C restore cy
C Subtract blocks of 8 limbs until less than 8 limbs remain
L(loop1):
	subxcc	%g4,%g2,%o4
	ld	[s1_ptr+8],%g4
	subxcc	%g1,%g3,%o5
	ld	[s1_ptr+12],%g1
	ldd	[s2_ptr+8],%g2
	std	%o4,[res_ptr+0]
	subxcc	%g4,%g2,%o4
	ld	[s1_ptr+16],%g4
	subxcc	%g1,%g3,%o5
	ld	[s1_ptr+20],%g1
	ldd	[s2_ptr+16],%g2
	std	%o4,[res_ptr+8]
	subxcc	%g4,%g2,%o4
	ld	[s1_ptr+24],%g4
	subxcc	%g1,%g3,%o5
	ld	[s1_ptr+28],%g1
	ldd	[s2_ptr+24],%g2
	std	%o4,[res_ptr+16]
	subxcc	%g4,%g2,%o4
	ld	[s1_ptr+32],%g4
	subxcc	%g1,%g3,%o5
	ld	[s1_ptr+36],%g1
	ldd	[s2_ptr+32],%g2
	std	%o4,[res_ptr+24]
	addx	%g0,%g0,%o4		C save cy in register
	addcc	n,-8,n
	add	s1_ptr,32,s1_ptr
	add	s2_ptr,32,s2_ptr
	add	res_ptr,32,res_ptr
	bge	L(loop1)
	subcc	%g0,%o4,%g0		C restore cy

L(fin1):
	addcc	n,8-2,n
	blt	L(end1)
	subcc	%g0,%o4,%g0		C restore cy
C Subtract blocks of 2 limbs until less than 2 limbs remain
L(loope1):
	subxcc	%g4,%g2,%o4
	ld	[s1_ptr+8],%g4
	subxcc	%g1,%g3,%o5
	ld	[s1_ptr+12],%g1
	ldd	[s2_ptr+8],%g2
	std	%o4,[res_ptr+0]
	addx	%g0,%g0,%o4		C save cy in register
	addcc	n,-2,n
	add	s1_ptr,8,s1_ptr
	add	s2_ptr,8,s2_ptr
	add	res_ptr,8,res_ptr
	bge	L(loope1)
	subcc	%g0,%o4,%g0		C restore cy
L(end1):
	subxcc	%g4,%g2,%o4
	subxcc	%g1,%g3,%o5
	std	%o4,[res_ptr+0]
	addx	%g0,%g0,%o4		C save cy in register

	andcc	n,1,%g0
	be	L(ret1)
	subcc	%g0,%o4,%g0		C restore cy
C Subtract last limb
	ld	[s1_ptr+8],%g4
	ld	[s2_ptr+8],%g2
	subxcc	%g4,%g2,%o4
	st	%o4,[res_ptr+8]

L(ret1):
	retl
	addx	%g0,%g0,%o0	C return borrow-out from most sign. limb

L(1):	xor	s1_ptr,res_ptr,%g1
	andcc	%g1,4,%g0
	bne	L(2)
	nop
C **  V1b  **  s1_ptr and res_ptr have the same alignment
	andcc	res_ptr,4,%g0		C res_ptr unaligned? Side effect: cy=0
	be	L(v1b)			C if no, branch
	nop
C Subtract least significant limb separately to align res_ptr and s1_ptr
	ld	[s2_ptr],%g4
	add	s2_ptr,4,s2_ptr
	ld	[s1_ptr],%g2
	add	s1_ptr,4,s1_ptr
	add	n,-1,n
	subcc	%g2,%g4,%o4
	st	%o4,[res_ptr]
	add	res_ptr,4,res_ptr
L(v1b):	addx	%g0,%g0,%o4		C save cy in register
	cmp	n,2			C if n < 2 ...
	bl	L(end2)			C ... branch to tail code
	subcc	%g0,%o4,%g0		C restore cy

	ld	[s2_ptr+0],%g4
	addcc	n,-10,n
	ld	[s2_ptr+4],%g1
	ldd	[s1_ptr+0],%g2
	blt	L(fin1b)
	subcc	%g0,%o4,%g0		C restore cy
C Subtract blocks of 8 limbs until less than 8 limbs remain
L(loop1b):
	subxcc	%g2,%g4,%o4
	ld	[s2_ptr+8],%g4
	subxcc	%g3,%g1,%o5
	ld	[s2_ptr+12],%g1
	ldd	[s1_ptr+8],%g2
	std	%o4,[res_ptr+0]
	subxcc	%g2,%g4,%o4
	ld	[s2_ptr+16],%g4
	subxcc	%g3,%g1,%o5
	ld	[s2_ptr+20],%g1
	ldd	[s1_ptr+16],%g2
	std	%o4,[res_ptr+8]
	subxcc	%g2,%g4,%o4
	ld	[s2_ptr+24],%g4
	subxcc	%g3,%g1,%o5
	ld	[s2_ptr+28],%g1
	ldd	[s1_ptr+24],%g2
	std	%o4,[res_ptr+16]
	subxcc	%g2,%g4,%o4
	ld	[s2_ptr+32],%g4
	subxcc	%g3,%g1,%o5
	ld	[s2_ptr+36],%g1
	ldd	[s1_ptr+32],%g2
	std	%o4,[res_ptr+24]
	addx	%g0,%g0,%o4		C save cy in register
	addcc	n,-8,n
	add	s1_ptr,32,s1_ptr
	add	s2_ptr,32,s2_ptr
	add	res_ptr,32,res_ptr
	bge	L(loop1b)
	subcc	%g0,%o4,%g0		C restore cy

L(fin1b):
	addcc	n,8-2,n
	blt	L(end1b)
	subcc	%g0,%o4,%g0		C restore cy
C Subtract blocks of 2 limbs until less than 2 limbs remain
L(loope1b):
	subxcc	%g2,%g4,%o4
	ld	[s2_ptr+8],%g4
	subxcc	%g3,%g1,%o5
	ld	[s2_ptr+12],%g1
	ldd	[s1_ptr+8],%g2
	std	%o4,[res_ptr+0]
	addx	%g0,%g0,%o4		C save cy in register
	addcc	n,-2,n
	add	s1_ptr,8,s1_ptr
	add	s2_ptr,8,s2_ptr
	add	res_ptr,8,res_ptr
	bge	L(loope1b)
	subcc	%g0,%o4,%g0		C restore cy
L(end1b):
	subxcc	%g2,%g4,%o4
	subxcc	%g3,%g1,%o5
	std	%o4,[res_ptr+0]
	addx	%g0,%g0,%o4		C save cy in register

	andcc	n,1,%g0
	be	L(ret1b)
	subcc	%g0,%o4,%g0		C restore cy
C Subtract last limb
	ld	[s2_ptr+8],%g4
	ld	[s1_ptr+8],%g2
	subxcc	%g2,%g4,%o4
	st	%o4,[res_ptr+8]

L(ret1b):
	retl
	addx	%g0,%g0,%o0		C return borrow-out from most sign. limb

C **  V2  **
C If we come here, the alignment of s1_ptr and res_ptr as well as the
C alignment of s2_ptr and res_ptr differ.  Since there are only two ways
C things can be aligned (that we care about) we now know that the alignment
C of s1_ptr and s2_ptr are the same.

L(2):	cmp	n,1
	be	L(jone)
	nop
	andcc	s1_ptr,4,%g0		C s1_ptr unaligned? Side effect: cy=0
	be	L(v2)			C if no, branch
	nop
C Subtract least significant limb separately to align s1_ptr and s2_ptr
	ld	[s1_ptr],%g4
	add	s1_ptr,4,s1_ptr
	ld	[s2_ptr],%g2
	add	s2_ptr,4,s2_ptr
	add	n,-1,n
	subcc	%g4,%g2,%o4
	st	%o4,[res_ptr]
	add	res_ptr,4,res_ptr

L(v2):	addx	%g0,%g0,%o4		C save cy in register
	addcc	n,-8,n
	blt	L(fin2)
	subcc	%g0,%o4,%g0		C restore cy
C Subtract blocks of 8 limbs until less than 8 limbs remain
L(loop2):
	ldd	[s1_ptr+0],%g2
	ldd	[s2_ptr+0],%o4
	subxcc	%g2,%o4,%g2
	st	%g2,[res_ptr+0]
	subxcc	%g3,%o5,%g3
	st	%g3,[res_ptr+4]
	ldd	[s1_ptr+8],%g2
	ldd	[s2_ptr+8],%o4
	subxcc	%g2,%o4,%g2
	st	%g2,[res_ptr+8]
	subxcc	%g3,%o5,%g3
	st	%g3,[res_ptr+12]
	ldd	[s1_ptr+16],%g2
	ldd	[s2_ptr+16],%o4
	subxcc	%g2,%o4,%g2
	st	%g2,[res_ptr+16]
	subxcc	%g3,%o5,%g3
	st	%g3,[res_ptr+20]
	ldd	[s1_ptr+24],%g2
	ldd	[s2_ptr+24],%o4
	subxcc	%g2,%o4,%g2
	st	%g2,[res_ptr+24]
	subxcc	%g3,%o5,%g3
	st	%g3,[res_ptr+28]
	addx	%g0,%g0,%o4		C save cy in register
	addcc	n,-8,n
	add	s1_ptr,32,s1_ptr
	add	s2_ptr,32,s2_ptr
	add	res_ptr,32,res_ptr
	bge	L(loop2)
	subcc	%g0,%o4,%g0		C restore cy

L(fin2):
	addcc	n,8-2,n
	blt	L(end2)
	subcc	%g0,%o4,%g0		C restore cy
C Subtract blocks of 2 limbs until less than 2 limbs remain
L(loope2):
	ldd	[s1_ptr+0],%g2
	ldd	[s2_ptr+0],%o4
	subxcc	%g2,%o4,%g2
	st	%g2,[res_ptr+0]
	subxcc	%g3,%o5,%g3
	st	%g3,[res_ptr+4]
	addx	%g0,%g0,%o4		C save cy in register
	addcc	n,-2,n
	add	s1_ptr,8,s1_ptr
	add	s2_ptr,8,s2_ptr
	add	res_ptr,8,res_ptr
	bge	L(loope2)
	subcc	%g0,%o4,%g0		C restore cy
L(end2):
	andcc	n,1,%g0
	be	L(ret2)
	subcc	%g0,%o4,%g0		C restore cy
C Subtract last limb
L(jone):
	ld	[s1_ptr],%g4
	ld	[s2_ptr],%g2
	subxcc	%g4,%g2,%o4
	st	%o4,[res_ptr]

L(ret2):
	retl
	addx	%g0,%g0,%o0		C return borrow-out from most sign. limb
EPILOGUE(mpn_sub_n)
325