1dnl  SPARC mpn_add_n -- Add two limb vectors of the same length > 0 and store
2dnl  sum in a third limb vector.
3
4dnl  Copyright 1995, 1996, 2000 Free Software Foundation, Inc.
5
6dnl  This file is part of the GNU MP Library.
7dnl
8dnl  The GNU MP Library is free software; you can redistribute it and/or modify
9dnl  it under the terms of either:
10dnl
11dnl    * the GNU Lesser General Public License as published by the Free
12dnl      Software Foundation; either version 3 of the License, or (at your
13dnl      option) any later version.
14dnl
15dnl  or
16dnl
17dnl    * the GNU General Public License as published by the Free Software
18dnl      Foundation; either version 2 of the License, or (at your option) any
19dnl      later version.
20dnl
21dnl  or both in parallel, as here.
22dnl
23dnl  The GNU MP Library is distributed in the hope that it will be useful, but
24dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
25dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
26dnl  for more details.
27dnl
28dnl  You should have received copies of the GNU General Public License and the
29dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
30dnl  see https://www.gnu.org/licenses/.
31
32
33include(`../config.m4')
34
35C INPUT PARAMETERS
C Symbolic names for the four argument registers used by mpn_add_n below.
C   res_ptr  destination limb vector; the sum is stored through it
C   s1_ptr   first source limb vector; only loaded from
C   s2_ptr   second source limb vector; only loaded from
C   n        count of limbs still to be added (the file header requires
C            a length > 0 on entry)
C All three pointers are advanced and n is decremented as the routine
C runs, so the argument registers do not keep their entry values.
36define(res_ptr,%o0)
37define(s1_ptr,%o1)
38define(s2_ptr,%o2)
39define(n,%o3)
40
41ASM_START()
C mpn_add_n (res_ptr, s1_ptr, s2_ptr, n)
C
C Adds the n-limb vectors at s1_ptr and s2_ptr, stores the n-limb sum at
C res_ptr, and returns the carry out of the most significant limb (0 or 1)
C in %o0.  Limbs are 4 bytes wide: every pointer advances by 4 per limb.
C
C Leaf routine: it returns with retl and never executes save/restore.
C Registers written: %g1-%g4, %o4, %o5, the integer condition codes, and
C the argument registers %o0-%o3 themselves.
C
C Strategy.  ldd/std move two limbs per instruction but need a doubleword
C aligned address (SPARC V8 architecture rule), so bit 2 of the pointers
C selects one of three variants:
C   V1a  res_ptr and s2_ptr agree in bit 2: ldd from s2_ptr, std to
C        res_ptr, single-word ld from s1_ptr.
C   V1b  res_ptr and s1_ptr agree in bit 2: swap s1_ptr and s2_ptr, then
C        run V1a.
C   V2   otherwise s1_ptr and s2_ptr agree in bit 2: ldd from both
C        sources, single-word st to res_ptr.
C In each variant one leading limb is added separately when needed so that
C the doubleword accesses start on an address with bit 2 clear.
C
C Carry handling.  The addcc/andcc/cmp instructions that maintain n destroy
C the carry flag.  The running carry is therefore parked in %o4 with
C "addx %g0,%g0,%o4" and regenerated with "subcc %g0,%o4,%g0" (0 - %o4
C borrows exactly when %o4 is 1), normally in a branch delay slot.
42PROLOGUE(mpn_add_n)
C Bit 2 of (s2_ptr xor res_ptr) is clear when the two pointers have the
C same position relative to an 8-byte boundary.
43	xor	s2_ptr,res_ptr,%g1
44	andcc	%g1,4,%g0
45	bne	L(1)			C branch if alignment differs
46	nop
47C **  V1a  **
C Reached directly, or from V1b after s1_ptr and s2_ptr were exchanged.
48L(0):	andcc	res_ptr,4,%g0		C res_ptr unaligned? Side effect: cy=0
49	be	L(v1)			C if no, branch
50	nop
51C Add least significant limb separately to align res_ptr and s2_ptr
C Plain addcc: there is no carry in for the first limb.  Its carry out is
C picked up by the addx at L(v1).
52	ld	[s1_ptr],%g4
53	add	s1_ptr,4,s1_ptr
54	ld	[s2_ptr],%g2
55	add	s2_ptr,4,s2_ptr
56	add	n,-1,n
57	addcc	%g4,%g2,%o4
58	st	%o4,[res_ptr]
59	add	res_ptr,4,res_ptr
C %o4 = carry so far: 0 when the be above was taken (andcc cleared the
C carry), otherwise the carry out of the alignment limb.
60L(v1):	addx	%g0,%g0,%o4		C save cy in register
61	cmp	n,2			C if n < 2 ...
62	bl	L(end2)			C ... branch to tail code
63	subcc	%g0,%o4,%g0		C restore cy
64
C Software pipelining: preload the first limb pair from each source.  From
C here until L(end1), %g4,%g1 hold the next two s1 limbs and %g2,%g3 the
C next two s2 limbs.  n is biased by -10 because one pass of loop1 adds
C 8 limbs and also preloads the pair that follows them.
65	ld	[s1_ptr+0],%g4
66	addcc	n,-10,n
67	ld	[s1_ptr+4],%g1
68	ldd	[s2_ptr+0],%g2
69	blt	L(fin1)
70	subcc	%g0,%o4,%g0		C restore cy
71C Add blocks of 8 limbs until less than 8 limbs remain
C More precisely, the loop runs while at least 10 limbs are still unstored
C at the top of a pass (n >= 0 with the -10 bias).  Each group of six
C instructions adds the preloaded pair, loads the next pair and stores the
C two sum limbs with one std.
72L(loop1):
73	addxcc	%g4,%g2,%o4
74	ld	[s1_ptr+8],%g4
75	addxcc	%g1,%g3,%o5
76	ld	[s1_ptr+12],%g1
77	ldd	[s2_ptr+8],%g2
78	std	%o4,[res_ptr+0]
79	addxcc	%g4,%g2,%o4
80	ld	[s1_ptr+16],%g4
81	addxcc	%g1,%g3,%o5
82	ld	[s1_ptr+20],%g1
83	ldd	[s2_ptr+16],%g2
84	std	%o4,[res_ptr+8]
85	addxcc	%g4,%g2,%o4
86	ld	[s1_ptr+24],%g4
87	addxcc	%g1,%g3,%o5
88	ld	[s1_ptr+28],%g1
89	ldd	[s2_ptr+24],%g2
90	std	%o4,[res_ptr+16]
91	addxcc	%g4,%g2,%o4
92	ld	[s1_ptr+32],%g4
93	addxcc	%g1,%g3,%o5
94	ld	[s1_ptr+36],%g1
95	ldd	[s2_ptr+32],%g2
96	std	%o4,[res_ptr+24]
97	addx	%g0,%g0,%o4		C save cy in register
98	addcc	n,-8,n
99	add	s1_ptr,32,s1_ptr
100	add	s2_ptr,32,s2_ptr
101	add	res_ptr,32,res_ptr
102	bge	L(loop1)
103	subcc	%g0,%o4,%g0		C restore cy
104
C Change the bias of n from -10 to -4: a pass of loope1 adds one pair and
C preloads the next, so it needs at least 4 unstored limbs.
105L(fin1):
106	addcc	n,8-2,n
107	blt	L(end1)
108	subcc	%g0,%o4,%g0		C restore cy
109C Add blocks of 2 limbs until less than 2 limbs remain
110L(loope1):
111	addxcc	%g4,%g2,%o4
112	ld	[s1_ptr+8],%g4
113	addxcc	%g1,%g3,%o5
114	ld	[s1_ptr+12],%g1
115	ldd	[s2_ptr+8],%g2
116	std	%o4,[res_ptr+0]
117	addx	%g0,%g0,%o4		C save cy in register
118	addcc	n,-2,n
119	add	s1_ptr,8,s1_ptr
120	add	s2_ptr,8,s2_ptr
121	add	res_ptr,8,res_ptr
122	bge	L(loope1)
123	subcc	%g0,%o4,%g0		C restore cy
C Drain the pipeline: add and store the last preloaded pair.  n is -2 or
C -1 here, so bit 0 of n tells whether one more limb follows the pair.
124L(end1):
125	addxcc	%g4,%g2,%o4
126	addxcc	%g1,%g3,%o5
127	std	%o4,[res_ptr+0]
128	addx	%g0,%g0,%o4		C save cy in register
129
130	andcc	n,1,%g0
131	be	L(ret1)
132	subcc	%g0,%o4,%g0		C restore cy
133C Add last limb
C The pointers were not advanced past the final pair, hence offset 8.
134	ld	[s1_ptr+8],%g4
135	ld	[s2_ptr+8],%g2
136	addxcc	%g4,%g2,%o4
137	st	%o4,[res_ptr+8]
138
C The addx in the delay slot of retl turns the carry flag into the 0/1
C return value.
139L(ret1):
140	retl
141	addx	%g0,%g0,%o0	C return carry-out from most sign. limb
142
C s2_ptr and res_ptr differ in bit 2; test s1_ptr against res_ptr instead.
143L(1):	xor	s1_ptr,res_ptr,%g1
144	andcc	%g1,4,%g0
145	bne	L(2)
146	nop
147C **  V1b  **
C s1_ptr matches res_ptr: exchange the two source pointers (the operands of
C the addition are interchangeable) and reuse V1a.  The second half of the
C swap executes in the delay slot of the branch.
148	mov	s2_ptr,%g1
149	mov	s1_ptr,s2_ptr
150	b	L(0)
151	mov	%g1,s1_ptr
152
153C **  V2  **
154C If we come here, the alignment of s1_ptr and res_ptr as well as the
155C alignment of s2_ptr and res_ptr differ.  Since there are only two ways
156C things can be aligned (that we care about) we now know that the alignment
157C of s1_ptr and s2_ptr are the same.
158
C n == 1 goes straight to the single-limb tail.  cmp computes n - 1, which
C is 0 without borrow in that case, so the addxcc at L(jone) sees cy=0.
159L(2):	cmp	n,1
160	be	L(jone)
161	nop
162	andcc	s1_ptr,4,%g0		C s1_ptr unaligned? Side effect: cy=0
163	be	L(v2)			C if no, branch
164	nop
165C Add least significant limb separately to align s1_ptr and s2_ptr
166	ld	[s1_ptr],%g4
167	add	s1_ptr,4,s1_ptr
168	ld	[s2_ptr],%g2
169	add	s2_ptr,4,s2_ptr
170	add	n,-1,n
171	addcc	%g4,%g2,%o4
172	st	%o4,[res_ptr]
173	add	res_ptr,4,res_ptr
174
C %o4 = carry so far (0, or the carry out of the alignment limb).  n is
C biased by -8: loop2 is not pipelined, each pass loads, adds and stores
C 8 limbs on its own.
175L(v2):	addx	%g0,%g0,%o4		C save cy in register
176	addcc	n,-8,n
177	blt	L(fin2)
178	subcc	%g0,%o4,%g0		C restore cy
179C Add blocks of 8 limbs until less than 8 limbs remain
C Both sources are read with ldd (s1 pair in %g2,%g3, s2 pair in %o4,%o5);
C the sums are written with two single-word st because res_ptr has the
C other alignment.
180L(loop2):
181	ldd	[s1_ptr+0],%g2
182	ldd	[s2_ptr+0],%o4
183	addxcc	%g2,%o4,%g2
184	st	%g2,[res_ptr+0]
185	addxcc	%g3,%o5,%g3
186	st	%g3,[res_ptr+4]
187	ldd	[s1_ptr+8],%g2
188	ldd	[s2_ptr+8],%o4
189	addxcc	%g2,%o4,%g2
190	st	%g2,[res_ptr+8]
191	addxcc	%g3,%o5,%g3
192	st	%g3,[res_ptr+12]
193	ldd	[s1_ptr+16],%g2
194	ldd	[s2_ptr+16],%o4
195	addxcc	%g2,%o4,%g2
196	st	%g2,[res_ptr+16]
197	addxcc	%g3,%o5,%g3
198	st	%g3,[res_ptr+20]
199	ldd	[s1_ptr+24],%g2
200	ldd	[s2_ptr+24],%o4
201	addxcc	%g2,%o4,%g2
202	st	%g2,[res_ptr+24]
203	addxcc	%g3,%o5,%g3
204	st	%g3,[res_ptr+28]
205	addx	%g0,%g0,%o4		C save cy in register
206	addcc	n,-8,n
207	add	s1_ptr,32,s1_ptr
208	add	s2_ptr,32,s2_ptr
209	add	res_ptr,32,res_ptr
210	bge	L(loop2)
211	subcc	%g0,%o4,%g0		C restore cy
212
C Change the bias of n from -8 to -2 for the pair loop.
213L(fin2):
214	addcc	n,8-2,n
215	blt	L(end2)
216	subcc	%g0,%o4,%g0		C restore cy
C Add blocks of 2 limbs while at least 2 limbs remain.
217L(loope2):
218	ldd	[s1_ptr+0],%g2
219	ldd	[s2_ptr+0],%o4
220	addxcc	%g2,%o4,%g2
221	st	%g2,[res_ptr+0]
222	addxcc	%g3,%o5,%g3
223	st	%g3,[res_ptr+4]
224	addx	%g0,%g0,%o4		C save cy in register
225	addcc	n,-2,n
226	add	s1_ptr,8,s1_ptr
227	add	s2_ptr,8,s2_ptr
228	add	res_ptr,8,res_ptr
229	bge	L(loope2)
230	subcc	%g0,%o4,%g0		C restore cy
C Shared tail, also entered from L(v1) when fewer than 2 limbs are left.
C Only bit 0 of n matters: set means exactly one limb is still to be added.
C The pointers already address that limb, so no offset is needed.
231L(end2):
232	andcc	n,1,%g0
233	be	L(ret2)
234	subcc	%g0,%o4,%g0		C restore cy
235C Add last limb
236L(jone):
237	ld	[s1_ptr],%g4
238	ld	[s2_ptr],%g2
239	addxcc	%g4,%g2,%o4
240	st	%o4,[res_ptr]
241
C The addx in the delay slot of retl turns the carry flag into the 0/1
C return value.
242L(ret2):
243	retl
244	addx	%g0,%g0,%o0	C return carry-out from most sign. limb
245EPILOGUE(mpn_add_n)
246