1dnl  Alpha ev6 mpn_add_n -- Add two limb vectors of the same length > 0 and
2dnl  store sum in a third limb vector.
3
4dnl  Copyright 2000, 2003, 2005 Free Software Foundation, Inc.
5
6dnl  This file is part of the GNU MP Library.
7
8dnl  The GNU MP Library is free software; you can redistribute it and/or modify
9dnl  it under the terms of the GNU Lesser General Public License as published
10dnl  by the Free Software Foundation; either version 3 of the License, or (at
11dnl  your option) any later version.
12
13dnl  The GNU MP Library is distributed in the hope that it will be useful, but
14dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
16dnl  License for more details.
17
18dnl  You should have received a copy of the GNU Lesser General Public License
19dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
20
21include(`../config.m4')
22
23C      cycles/limb
24C EV4:     ?
25C EV5:     5.4
26C EV6:     2.125
27
28C  INPUT PARAMETERS
29C  rp	r16
30C  up	r17
31C  vp	r18
32C  n	r19
33C  cy	r20   (for mpn_add_nc)
34
35C TODO
36C   Finish cleaning up cy registers r22, r23 (make them use cy0/cy1)
37C   Use multi-pronged feed-in.
38C   Perform additional micro-tuning
39
40C  This code was written in cooperation with ev6 pipeline expert Steve Root.
41
42C  Pair loads and stores where possible
43C  Store pairs oct-aligned where possible (didn't need it here)
44C  Stores are delayed every third cycle
45C  Loads and stores are delayed by fills
46C  U stays still, put code there where possible (note alternation of U1 and U0)
47C  L moves because of loads and stores
48C  Note dampers in L to limit damage
49
50C  This odd-looking optimization expects that we have random bits in our
51C  data, so that a pure zero result is unlikely.  So we penalize the unlikely
52C  case to help the common case.
53
C Register aliases used by the code below.
C   u0/u1, v0/v1  two alternating pairs of limbs loaded from up and vp
C   cy0/cy1       two of the carry registers; r22 and r23 are still used
C                 directly for the other carries (see TODO above)
C cy0 is r20, the same register that holds the mpn_add_nc carry-in.
C u0 is r0, which is also written with the carry-out at both return sites.
54define(`u0', `r0')  define(`u1', `r3')
55define(`v0', `r1')  define(`v1', `r4')
56
57define(`cy0', `r20')  define(`cy1', `r21')
58
59MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc)
60
61ASM_START()
C mpn_add_nc -- entry point for addition with a carry-in.  The carry-in
C arrives in r20 (see INPUT PARAMETERS), which is the register aliased as cy0.
C The only instruction here jumps to $entry inside mpn_add_n, which lies just
C past the instruction that clears cy0, so the supplied carry is kept and all
C remaining work is shared with mpn_add_n.
62PROLOGUE(mpn_add_nc)
63	br	r31,	$entry		C join mpn_add_n after its cy0 clear
64EPILOGUE()
C mpn_add_n -- add the n-limb vectors at up and vp, store the n-limb sum at
C rp, and return the carry-out in r0.
C
C Arguments are as listed under INPUT PARAMETERS: rp=r16, up=r17, vp=r18,
C n=r19.  This entry point zeroes cy0 (r20); mpn_add_nc joins at $entry with
C its carry-in already in cy0.
C
C Layout:
C   feed-in   loads limbs 0..4 and starts their adds (taken when n >= 5)
C   $Loop     software-pipelined main loop, 8 limbs per pass (needs n >= 13)
C   $Lend     wind-down: finishes and stores the 5 limbs still in flight
C   $Loop0    one-limb-per-pass loop, used for n < 5 and for leftovers
C   $Lend0    last limb of the simple loop, then return
C   $fix*     out-of-line carry fix-ups
C
C Carry scheme in the unrolled code: for each limb the carry-out is first
C taken from the raw sum (cmpult sum,v) and the carry-in is added afterwards.
C Adding the carry-in can wrap only when the result becomes exactly zero, so
C only a zero result branches away to a $fix label, which ORs the carry-in
C into the carry-out.  (This assumes every carry-in is 0 or 1, including the
C value passed to mpn_add_nc -- to be confirmed against callers.)
65PROLOGUE(mpn_add_n)
66	bis	r31,	r31,	cy0	C clear carry in
67$entry:	cmpult	r19,	5,	r22	C L1 r22 = (n < 5)
68	ldq	u1,	0(r17)		C L0 get next ones
69	ldq	v1,	0(r18)		C L1
70	bne	r22,	$Lsmall		C n < 5: simple loop only
71
C Feed-in.  Limbs 0..4 are loaded here; limbs 0 and 1 are completed into r5
C and r8, limb 2 is started in r7.
72	ldq	u0,	8(r17)		C L0 get next ones
73	ldq	v0,	8(r18)		C L1
74	addq	u1,	v1,	r5	C U0 add two data
75
76	cmpult	r5,	v1,	r23	C U0 did it carry
77	ldq	u1,	16(r17)		C L0 get next ones
78	ldq	v1,	16(r18)		C L1
79
80	addq	u0,	v0,	r8	C U1 add two data
81	addq	r5,	cy0,	r5	C U0 carry in
82
83	cmpult	r8,	v0,	r22	C U1 did it carry
84	beq	r5,	$fix5f		C U0 fix exact zero
85$ret5f:	ldq	u0,	24(r17)		C L0 get next ones
86	ldq	v0,	24(r18)		C L1
87
88	addq	r8,	r23,	r8	C U1 carry from last
89	addq	u1,	v1,	r7	C U0 add two data
90
91	beq	r8,	$fix6f		C U1 fix exact zero
92$ret6f:	cmpult	r7,	v1,	r23	C U0 did it carry
93	ldq	u1,	32(r17)		C L0 get next ones
94	ldq	v1,	32(r18)		C L1
95
C Step both source pointers past the 5 limbs loaded so far.
96	lda	r17,	40(r17)		C L0 move pointer
97	lda	r18,	40(r18)		C L1 move pointer
98
C rp is biased by one limb so that the first store below is at 8(r16).
C One loop pass loads 8 more limbs on top of the 5 already loaded, so the
C loop is entered only when n >= 13; otherwise go straight to the wind-down.
99	lda	r16,	-8(r16)		C bias rp by -8
100	lda	r19,	-13(r19)	C L1 move counter
101	blt	r19,	$Lend		C U1 loop control
102
103
104C Main loop.  8-way unrolled.
C State at the top of every pass: r5 and r8 are finished sums waiting to be
C stored, r7 is a raw sum with its raw carry in r23, r22 is the carry into
C r7, and u0/v0 and u1/v1 hold the next two limb pairs.
105	ALIGN(16)
106$Loop:	addq	u0,	v0,	r2	C U1 add two data
107	addq	r7,	r22,	r7	C U0 add in carry
108	stq	r5,	8(r16)		C L0 put an answer
109	stq	r8,	16(r16)		C L1 pair
110
111	cmpult	r2,	v0,	cy1	C U1 did it carry
112	beq	r7,	$fix7		C U0 fix exact 0
113$ret7:	ldq	u0,	0(r17)		C L0 get next ones
114	ldq	v0,	0(r18)		C L1
115
116	bis	r31,	r31,	r31	C L  damp out
117	addq	r2,	r23,	r2	C U1 carry from last
118	bis	r31,	r31,	r31	C L  moves in L !
119	addq	u1,	v1,	r5	C U0 add two data
120
121	beq	r2,	$fix0		C U1 fix exact zero
122$ret0:	cmpult	r5,	v1,	cy0	C U0 did it carry
123	ldq	u1,	8(r17)		C L0 get next ones
124	ldq	v1,	8(r18)		C L1
125
126	addq	u0,	v0,	r8	C U1 add two data
127	addq	r5,	cy1,	r5	C U0 carry from last
128	stq	r7,	24(r16)		C L0 store pair
129	stq	r2,	32(r16)		C L1
130
131	cmpult	r8,	v0,	r22	C U1 did it carry
132	beq	r5,	$fix1		C U0 fix exact zero
133$ret1:	ldq	u0,	16(r17)		C L0 get next ones
134	ldq	v0,	16(r18)		C L1
135
C rp advances mid-pass, so the remaining stores use offsets -24..0.
136	lda	r16,	64(r16)		C L0 move pointer
137	addq	r8,	cy0,	r8	C U1 carry from last
138	lda	r19,	-8(r19)		C L1 move counter
139	addq	u1,	v1,	r7	C U0 add two data
140
141	beq	r8,	$fix2		C U1 fix exact zero
142$ret2:	cmpult	r7,	v1,	r23	C U0 did it carry
143	ldq	u1,	24(r17)		C L0 get next ones
144	ldq	v1,	24(r18)		C L1
145
146	addq	u0,	v0,	r2	C U1 add two data
147	addq	r7,	r22,	r7	C U0 add in carry
148	stq	r5,	-24(r16)	C L0 put an answer
149	stq	r8,	-16(r16)	C L1 pair
150
151	cmpult	r2,	v0,	cy1	C U1 did it carry
152	beq	r7,	$fix3		C U0 fix exact 0
153$ret3:	ldq	u0,	32(r17)		C L0 get next ones
154	ldq	v0,	32(r18)		C L1
155
156	bis	r31,	r31,	r31	C L  damp out
157	addq	r2,	r23,	r2	C U1 carry from last
158	bis	r31,	r31,	r31	C L  moves in L !
159	addq	u1,	v1,	r5	C U0 add two data
160
161	beq	r2,	$fix4		C U1 fix exact zero
162$ret4:	cmpult	r5,	v1,	cy0	C U0 did it carry
163	ldq	u1,	40(r17)		C L0 get next ones
164	ldq	v1,	40(r18)		C L1
165
166	addq	u0,	v0,	r8	C U1 add two data
167	addq	r5,	cy1,	r5	C U0 carry from last
168	stq	r7,	-8(r16)		C L0 store pair
169	stq	r2,	0(r16)		C L1
170
171	cmpult	r8,	v0,	r22	C U1 did it carry
172	beq	r5,	$fix5		C U0 fix exact zero
173$ret5:	ldq	u0,	48(r17)		C L0 get next ones
174	ldq	v0,	48(r18)		C L1
175
176	ldl	r31, 256(r17)		C L0 prefetch
177	addq	r8,	cy0,	r8	C U1 carry from last
178	ldl	r31, 256(r18)		C L1 prefetch
179	addq	u1,	v1,	r7	C U0 add two data
180
181	beq	r8,	$fix6		C U1 fix exact zero
182$ret6:	cmpult	r7,	v1,	r23	C U0 did it carry
183	ldq	u1,	56(r17)		C L0 get next ones
184	ldq	v1,	56(r18)		C L1
185
186	lda	r17,	64(r17)		C L0 move pointer
187	bis	r31,	r31,	r31	C U
188	lda	r18,	64(r18)		C L1 move pointer
189	bge	r19,	$Loop		C U1 loop control
190C ==== main loop end
191
C Wind-down.  Same register state as at the top of $Loop, but no further
C loads are issued: the 5 limbs in flight are finished and stored at
C offsets 8..40 from the biased rp.  The carry out of the last of them is
C left in cy0.
192$Lend:	addq	u0,	v0,	r2	C U1 add two data
193	addq	r7,	r22,	r7	C U0 add in carry
194	stq	r5,	8(r16)		C L0 put an answer
195	stq	r8,	16(r16)		C L1 pair
196	cmpult	r2,	v0,	cy1	C U1 did it carry
197	beq	r7,	$fix7c		C U0 fix exact 0
198$ret7c:	addq	r2,	r23,	r2	C U1 carry from last
199	addq	u1,	v1,	r5	C U0 add two data
200	beq	r2,	$fix0c		C U1 fix exact zero
201$ret0c:	cmpult	r5,	v1,	cy0	C U0 did it carry
202	addq	r5,	cy1,	r5	C U0 carry from last
203	stq	r7,	24(r16)		C L0 store pair
204	stq	r2,	32(r16)		C L1
205	beq	r5,	$fix1c		C U0 fix exact zero
206$ret1c:	stq	r5,	40(r16)		C L0 put an answer
207	lda	r16,	48(r16)		C L0 move pointer
208
C The counter was kept 8 below the number of limbs not yet stored; adding
C 8 back gives the leftover count.  Zero means done, otherwise load the
C first leftover pair and fall into the simple loop with cy0 as carry-in.
209	lda	r19,	8(r19)		C r19 = leftover limbs
210	beq	r19,	$Lret		C none left: return cy0
211
212	ldq	u1,	0(r17)		C first leftover limb of up
213	ldq	v1,	0(r18)		C first leftover limb of vp
C Simple loop.  Entered with u1/v1 already loaded, r19 = limbs to do and
C cy0 = carry-in.  The last limb is peeled off into $Lend0 so that the loop
C body may load one limb ahead.
214$Lsmall:
215	lda	r19,	-1(r19)		C count all but the last limb
216	beq	r19,	$Lend0		C only one limb: no loop passes
217
C Per limb: r2 = u + v, r5 = r2 + cy0.  Carry-out is the OR of the carries
C of those two adds (r2 < v, r5 < r2).
218	ALIGN(8)
219$Loop0:	addq	u1,	v1,	r2	C main add
220	cmpult	r2,	v1,	r8	C compute cy from last add
221	ldq	u1,	8(r17)
222	ldq	v1,	8(r18)
223	addq	r2,	cy0,	r5	C carry add
224	lda	r17,	8(r17)
225	lda	r18,	8(r18)
226	stq	r5,	0(r16)
227	cmpult	r5,	r2,	cy0	C compute cy from last add
228	lda	r19,	-1(r19)		C decr loop cnt
229	bis	r8,	cy0,	cy0	C combine cy from the two adds
230	lda	r16,	8(r16)
231	bne	r19,	$Loop0
C Last limb: same arithmetic, no further loads, carry-out goes to r0.
232$Lend0:	addq	u1,	v1,	r2	C main add
233	addq	r2,	cy0,	r5	C carry add
234	cmpult	r2,	v1,	r8	C compute cy from last add
235	cmpult	r5,	r2,	cy0	C compute cy from last add
236	stq	r5,	0(r16)
237	bis	r8,	cy0,	r0	C combine cy from the two adds
238	ret	r31,(r26),1
239
C Return path used when the unrolled code consumed every limb.
240	ALIGN(8)
241$Lret:	lda	r0,	0(cy0)		C copy carry into return register
242	ret	r31,(r26),1
243
C Carry fix-ups.  Each is reached only when a limb sum plus its carry-in
C came out exactly zero.  It ORs that carry-in (second operand register
C pair member) into the carry-out register of the same limb and branches
C back to the matching $ret label.  The f suffix marks the feed-in copies
C and the c suffix the wind-down copies.
244$fix5f:	bis	r23,	cy0,	r23	C bring forward carry
245	br	r31,	$ret5f
246$fix6f:	bis	r22,	r23,	r22	C bring forward carry
247	br	r31,	$ret6f
248$fix0:	bis	cy1,	r23,	cy1	C bring forward carry
249	br	r31,	$ret0
250$fix1:	bis	cy0,	cy1,	cy0	C bring forward carry
251	br	r31,	$ret1
252$fix2:	bis	r22,	cy0,	r22	C bring forward carry
253	br	r31,	$ret2
254$fix3:	bis	r23,	r22,	r23	C bring forward carry
255	br	r31,	$ret3
256$fix4:	bis	cy1,	r23,	cy1	C bring forward carry
257	br	r31,	$ret4
258$fix5:	bis	cy1,	cy0,	cy0	C bring forward carry
259	br	r31,	$ret5
260$fix6:	bis	r22,	cy0,	r22	C bring forward carry
261	br	r31,	$ret6
262$fix7:	bis	r23,	r22,	r23	C bring forward carry
263	br	r31,	$ret7
264$fix0c:	bis	cy1,	r23,	cy1	C bring forward carry
265	br	r31,	$ret0c
266$fix1c:	bis	cy0,	cy1,	cy0	C bring forward carry
267	br	r31,	$ret1c
268$fix7c:	bis	r23,	r22,	r23	C bring forward carry
269	br	r31,	$ret7c
270
271EPILOGUE()
272ASM_END()
273