dnl  SPARC mpn_add_n -- Add two limb vectors of the same length > 0 and store
dnl  sum in a third limb vector.

dnl  Copyright 1995, 1996, 2000 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.


include(`../config.m4')

C INPUT PARAMETERS
define(res_ptr,%o0)
define(s1_ptr,%o1)
define(s2_ptr,%o2)
define(n,%o3)
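
C As a guide to the code below, a C-level sketch of what this routine
C computes (illustrative only, not GMP's actual generic code):
C
C	mp_limb_t
C	mpn_add_n (mp_ptr rp, mp_srcptr s1p, mp_srcptr s2p, mp_size_t n)
C	{
C	  mp_limb_t cy = 0;
C	  mp_size_t i;
C	  for (i = 0; i < n; i++)
C	    {
C	      mp_limb_t t = s1p[i] + s2p[i];	/* limb sum, may wrap */
C	      mp_limb_t c1 = t < s1p[i];	/* carry from the limb add */
C	      mp_limb_t s = t + cy;		/* add incoming carry */
C	      cy = c1 | (s < t);		/* carry out of this limb */
C	      rp[i] = s;
C	    }
C	  return cy;				/* carry-out, 0 or 1 */
C	}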

ASM_START()
PROLOGUE(mpn_add_n)
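C None of the branches below are annulled, so the instruction in each
C branch delay slot executes whether or not the branch is taken.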
	xor	s2_ptr,res_ptr,%g1
	andcc	%g1,4,%g0
	bne	L(1)			C branch if alignment differs
	nop
C **  V1a  **
L(0):	andcc	res_ptr,4,%g0		C res_ptr unaligned? Side effect: cy=0
	be	L(v1)			C if no, branch
	nop
C Add least significant limb separately to align res_ptr and s2_ptr
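C (The ldd/std used below require doubleword alignment, so res_ptr and
C s2_ptr must first be brought to an 8-byte boundary.)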
	ld	[s1_ptr],%g4
	add	s1_ptr,4,s1_ptr
	ld	[s2_ptr],%g2
	add	s2_ptr,4,s2_ptr
	add	n,-1,n
	addcc	%g4,%g2,%o4
	st	%o4,[res_ptr]
	add	res_ptr,4,res_ptr
L(v1):	addx	%g0,%g0,%o4		C save cy in register
	cmp	n,2			C if n < 2 ...
	bl	L(end2)			C ... branch to tail code
	subcc	%g0,%o4,%g0		C restore cy
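C (This save/restore pair recurs throughout: addx %g0,%g0,%o4 materializes
C the carry flag as 0 or 1 in %o4 before an intervening cmp/addcc clobbers
C the condition codes, and subcc %g0,%o4,%g0 then regenerates it, since
C 0 - 1 borrows and sets carry while 0 - 0 leaves it clear.)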

	ld	[s1_ptr+0],%g4
	addcc	n,-10,n
	ld	[s1_ptr+4],%g1
	ldd	[s2_ptr+0],%g2
	blt	L(fin1)
	subcc	%g0,%o4,%g0		C restore cy
C Add blocks of 8 limbs until less than 8 limbs remain
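C The loop is software-pipelined: the ld/ldd for the next limb pair are
C issued between the addxcc instructions of the current pair, and ldd/std
C move two limbs per memory operation.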
L(loop1):
	addxcc	%g4,%g2,%o4
	ld	[s1_ptr+8],%g4
	addxcc	%g1,%g3,%o5
	ld	[s1_ptr+12],%g1
	ldd	[s2_ptr+8],%g2
	std	%o4,[res_ptr+0]
	addxcc	%g4,%g2,%o4
	ld	[s1_ptr+16],%g4
	addxcc	%g1,%g3,%o5
	ld	[s1_ptr+20],%g1
	ldd	[s2_ptr+16],%g2
	std	%o4,[res_ptr+8]
	addxcc	%g4,%g2,%o4
	ld	[s1_ptr+24],%g4
	addxcc	%g1,%g3,%o5
	ld	[s1_ptr+28],%g1
	ldd	[s2_ptr+24],%g2
	std	%o4,[res_ptr+16]
	addxcc	%g4,%g2,%o4
	ld	[s1_ptr+32],%g4
	addxcc	%g1,%g3,%o5
	ld	[s1_ptr+36],%g1
	ldd	[s2_ptr+32],%g2
	std	%o4,[res_ptr+24]
	addx	%g0,%g0,%o4		C save cy in register
	addcc	n,-8,n
	add	s1_ptr,32,s1_ptr
	add	s2_ptr,32,s2_ptr
	add	res_ptr,32,res_ptr
	bge	L(loop1)
	subcc	%g0,%o4,%g0		C restore cy

L(fin1):
	addcc	n,8-2,n
	blt	L(end1)
	subcc	%g0,%o4,%g0		C restore cy
C Add blocks of 2 limbs until less than 2 limbs remain
L(loope1):
	addxcc	%g4,%g2,%o4
	ld	[s1_ptr+8],%g4
	addxcc	%g1,%g3,%o5
	ld	[s1_ptr+12],%g1
	ldd	[s2_ptr+8],%g2
	std	%o4,[res_ptr+0]
	addx	%g0,%g0,%o4		C save cy in register
	addcc	n,-2,n
	add	s1_ptr,8,s1_ptr
	add	s2_ptr,8,s2_ptr
	add	res_ptr,8,res_ptr
	bge	L(loope1)
	subcc	%g0,%o4,%g0		C restore cy
L(end1):
	addxcc	%g4,%g2,%o4
	addxcc	%g1,%g3,%o5
	std	%o4,[res_ptr+0]
	addx	%g0,%g0,%o4		C save cy in register

	andcc	n,1,%g0
	be	L(ret1)
	subcc	%g0,%o4,%g0		C restore cy
C Add last limb
	ld	[s1_ptr+8],%g4
	ld	[s2_ptr+8],%g2
	addxcc	%g4,%g2,%o4
	st	%o4,[res_ptr+8]

L(ret1):
	retl
	addx	%g0,%g0,%o0	C return carry-out from most significant limb

L(1):	xor	s1_ptr,res_ptr,%g1
	andcc	%g1,4,%g0
	bne	L(2)
	nop
C **  V1b  **
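C Here s1_ptr and res_ptr share the same alignment.  Addition is
C commutative, so swap the source pointers and reuse the V1a code.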
	mov	s2_ptr,%g1
	mov	s1_ptr,s2_ptr
	b	L(0)
	mov	%g1,s1_ptr

C **  V2  **
C If we come here, s1_ptr and res_ptr differ in alignment, and so do
C s2_ptr and res_ptr.  Only bit 2 of each address matters (word position
C within a doubleword), so there are just two alignment classes; s1_ptr
C and s2_ptr must therefore be in the same class as each other.

L(2):	cmp	n,1
	be	L(jone)
	nop
	andcc	s1_ptr,4,%g0		C s1_ptr unaligned? Side effect: cy=0
	be	L(v2)			C if no, branch
	nop
C Add least significant limb separately to align s1_ptr and s2_ptr
	ld	[s1_ptr],%g4
	add	s1_ptr,4,s1_ptr
	ld	[s2_ptr],%g2
	add	s2_ptr,4,s2_ptr
	add	n,-1,n
	addcc	%g4,%g2,%o4
	st	%o4,[res_ptr]
	add	res_ptr,4,res_ptr

L(v2):	addx	%g0,%g0,%o4		C save cy in register
	addcc	n,-8,n
	blt	L(fin2)
	subcc	%g0,%o4,%g0		C restore cy
C Add blocks of 8 limbs until less than 8 limbs remain
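C Both source pointers are now doubleword aligned, so ldd fetches two
C limbs from each; res_ptr is not, so results go out as single-word st's.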
L(loop2):
	ldd	[s1_ptr+0],%g2
	ldd	[s2_ptr+0],%o4
	addxcc	%g2,%o4,%g2
	st	%g2,[res_ptr+0]
	addxcc	%g3,%o5,%g3
	st	%g3,[res_ptr+4]
	ldd	[s1_ptr+8],%g2
	ldd	[s2_ptr+8],%o4
	addxcc	%g2,%o4,%g2
	st	%g2,[res_ptr+8]
	addxcc	%g3,%o5,%g3
	st	%g3,[res_ptr+12]
	ldd	[s1_ptr+16],%g2
	ldd	[s2_ptr+16],%o4
	addxcc	%g2,%o4,%g2
	st	%g2,[res_ptr+16]
	addxcc	%g3,%o5,%g3
	st	%g3,[res_ptr+20]
	ldd	[s1_ptr+24],%g2
	ldd	[s2_ptr+24],%o4
	addxcc	%g2,%o4,%g2
	st	%g2,[res_ptr+24]
	addxcc	%g3,%o5,%g3
	st	%g3,[res_ptr+28]
	addx	%g0,%g0,%o4		C save cy in register
	addcc	n,-8,n
	add	s1_ptr,32,s1_ptr
	add	s2_ptr,32,s2_ptr
	add	res_ptr,32,res_ptr
	bge	L(loop2)
	subcc	%g0,%o4,%g0		C restore cy

L(fin2):
	addcc	n,8-2,n
	blt	L(end2)
	subcc	%g0,%o4,%g0		C restore cy
L(loope2):
	ldd	[s1_ptr+0],%g2
	ldd	[s2_ptr+0],%o4
	addxcc	%g2,%o4,%g2
	st	%g2,[res_ptr+0]
	addxcc	%g3,%o5,%g3
	st	%g3,[res_ptr+4]
	addx	%g0,%g0,%o4		C save cy in register
	addcc	n,-2,n
	add	s1_ptr,8,s1_ptr
	add	s2_ptr,8,s2_ptr
	add	res_ptr,8,res_ptr
	bge	L(loope2)
	subcc	%g0,%o4,%g0		C restore cy
L(end2):
	andcc	n,1,%g0
	be	L(ret2)
	subcc	%g0,%o4,%g0		C restore cy
C Add last limb
L(jone):
	ld	[s1_ptr],%g4
	ld	[s2_ptr],%g2
	addxcc	%g4,%g2,%o4
	st	%o4,[res_ptr]

L(ret2):
	retl
	addx	%g0,%g0,%o0	C return carry-out from most significant limb
EPILOGUE(mpn_add_n)