1dnl  Alpha ev6 mpn_sub_n -- Subtract two limb vectors of the same length > 0
2dnl  and store difference in a third limb vector.
3
4dnl  Copyright 2000, 2003, 2005 Free Software Foundation, Inc.
5
6dnl  This file is part of the GNU MP Library.
7
8dnl  The GNU MP Library is free software; you can redistribute it and/or modify
9dnl  it under the terms of the GNU Lesser General Public License as published
10dnl  by the Free Software Foundation; either version 3 of the License, or (at
11dnl  your option) any later version.
12
13dnl  The GNU MP Library is distributed in the hope that it will be useful, but
14dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
16dnl  License for more details.
17
18dnl  You should have received a copy of the GNU Lesser General Public License
19dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
20
21include(`../config.m4')
22
23C      cycles/limb
24C EV4:     ?
25C EV5:     5.4
26C EV6:     2.125
27
28C  INPUT PARAMETERS
29C  rp	r16
30C  up	r17
31C  vp	r18
32C  n	r19
33C  cy	r20   (for mpn_sub_nc)
34
35C TODO
36C   Finish cleaning up cy registers r22, r23 (make them use cy0/cy1)
37C   Use multi-pronged feed-in.
38C   Perform additional micro-tuning
39
40C  This code was written in cooperation with ev6 pipeline expert Steve Root.
41
42C  Pair loads and stores where possible
43C  Store pairs oct-aligned where possible (didn't need it here)
44C  Stores are delayed every third cycle
45C  Loads and stores are delayed by fills
46C  U stays still, put code there where possible (note alternation of U1 and U0)
47C  L moves because of loads and stores
48C  Note dampers in L to limit damage
49
50C  This odd-looking optimization (branching out of line to a $fix stub whenever
51C  a raw limb difference is exactly zero) expects that we're having random bits
52C  in our data, so that a pure zero result is unlikely.  So we penalize the
52C  unlikely case to help the common case.
53
C  Register aliases.  Two (u, v) limb pairs are kept in flight, numbered 0
C  and 1: pair 0 lives in r0/r1 and pair 1 in r3/r4.  Note that u0 is r0,
C  which is also the register written with the final borrow at both exits.
54define(`u0', `r0')  define(`u1', `r3')
55define(`v0', `r1')  define(`v1', `r4')
56
C  Two of the borrow registers have names.  cy0 is r20, the register listed
C  under INPUT PARAMETERS as the incoming cy; r22 and r23 are used as further
C  borrow registers but are still written by number (see TODO).
57define(`cy0', `r20')  define(`cy1', `r21')
58
C  This one file supplies both entry points named here.
C  NOTE(review): MULFUNC_PROLOGUE and ASM_START are not defined in this file
C  (presumably they come in through config.m4) -- confirm their expansion there.
59MULFUNC_PROLOGUE(mpn_sub_n mpn_sub_nc)
60
61ASM_START()
C  mpn_sub_nc -- subtract with a borrow-in.  Takes the same rp/up/vp/n
C  registers as mpn_sub_n plus the incoming borrow in cy0 (r20).  It has no
C  body of its own: it branches to $entry inside mpn_sub_n, which skips only
C  the instruction that clears cy0, so the rest of the code is shared.
62PROLOGUE(mpn_sub_nc)
63	br	r31,	$entry
64EPILOGUE()
C  mpn_sub_n -- {rp,n} = {up,n} - {vp,n}; the borrow out of the top limb is
C  returned in r0.  rp = r16, up = r17, vp = r18, n = r19 (n > 0, per the
C  description at the top of the file).
C
C  Per limb the code forms the raw difference d = u - v (subq) and the raw
C  borrow b = u < v (cmpult), then subtracts the incoming borrow from d.  The
C  incoming borrow can only ripple past this limb when d is exactly zero; that
C  case branches to an out-of-line $fix stub which ORs the incoming borrow
C  into b and then jumps back to the matching $ret label.
65PROLOGUE(mpn_sub_n)
66	bis	r31,	r31,	cy0	C clear carry in
67$entry:	cmpult	r19,	5,	r22	C L1 r22 = (n < 5)
68	ldq	u1,	0(r17)		C L0 get next ones
69	ldq	v1,	0(r18)		C L1
70	bne	r22,	$Lsmall
71
C  Feed-in for n >= 5: load limbs 0..4, finish limbs 0 and 1 into r24/r25
C  (not stored yet) and leave limb 2 half done (difference in r7, raw borrow
C  in r23).  That is the state the main loop and $Lend expect on entry.
72	ldq	u0,	8(r17)		C L0 get next ones
73	ldq	v0,	8(r18)		C L1
74	subq	u1,	v1,	r5	C U0 sub two data
75
76	cmpult	u1,	v1,	r23	C U0 did it borrow
77	ldq	u1,	16(r17)		C L0 get next ones
78	ldq	v1,	16(r18)		C L1
79
80	subq	u0,	v0,	r8	C U1 sub two data
81	subq	r5,	cy0,	r24	C U0 borrow in
82
83	cmpult	u0,	v0,	r22	C U1 did it borrow
84	beq	r5,	$fix5f		C U0 fix exact zero
85$ret5f:	ldq	u0,	24(r17)		C L0 get next ones
86	ldq	v0,	24(r18)		C L1
87
88	subq	r8,	r23,	r25	C U1 borrow from last
89	subq	u1,	v1,	r7	C U0 sub two data
90
91	beq	r8,	$fix6f		C U1 fix exact zero
92$ret6f:	cmpult	u1,	v1,	r23	C U0 did it borrow
93	ldq	u1,	32(r17)		C L0 get next ones
94	ldq	v1,	32(r18)		C L1
95
96	lda	r17,	40(r17)		C L0 move pointer
97	lda	r18,	40(r18)		C L1 move pointer
98
C  r19 = n - 13: five limbs are loaded already and one pass of the main loop
C  loads eight more, so a negative count means the loop must be skipped.
99	lda	r16,	-8(r16)		C bias rp by one limb: stores start at 8(r16)
100	lda	r19,	-13(r19)	C L1 move counter
101	blt	r19,	$Lend		C U1 loop control
102
103
104C Main loop.  8-way unrolled.
C  Each pass stores 8 result limbs and loads 8 new limb pairs.  The per-limb
C  borrow rotates through r23, cy1, cy0 and r22, and each $fixN stub merges
C  the previous register of that rotation into the current one.
105	ALIGN(16)
106$Loop:	subq	u0,	v0,	r2	C U1 sub two data
107	stq	r24,	8(r16)		C L0 put an answer
108	subq	r7,	r22,	r24	C U0 borrow from last
109	stq	r25,	16(r16)		C L1 pair
110
111	cmpult	u0,	v0,	cy1	C U1 did it borrow
112	beq	r7,	$fix7		C U0 fix exact 0
113$ret7:	ldq	u0,	0(r17)		C L0 get next ones
114	ldq	v0,	0(r18)		C L1
115
116	bis	r31,	r31,	r31	C L  damp out
117	subq	r2,	r23,	r25	C U1 borrow from last
118	bis	r31,	r31,	r31	C L  moves in L !
119	subq	u1,	v1,	r5	C U0 sub two data
120
121	beq	r2,	$fix0		C U1 fix exact zero
122$ret0:	cmpult	u1,	v1,	cy0	C U0 did it borrow
123	ldq	u1,	8(r17)		C L0 get next ones
124	ldq	v1,	8(r18)		C L1
125
126	subq	u0,	v0,	r8	C U1 sub two data
127	stq	r24,	24(r16)		C L0 store pair
128	subq	r5,	cy1,	r24	C U0 borrow from last
129	stq	r25,	32(r16)		C L1
130
131	cmpult	u0,	v0,	r22	C U1 did it borrow
132	beq	r5,	$fix1		C U0 fix exact zero
133$ret1:	ldq	u0,	16(r17)		C L0 get next ones
134	ldq	v0,	16(r18)		C L1
135
C  rp is advanced in the middle of the pass, so the four remaining stores of
C  this pass use offsets -24, -16, -8 and 0 from the new r16.
136	lda	r16,	64(r16)		C L0 move pointer
137	subq	r8,	cy0,	r25	C U1 borrow from last
138	lda	r19,	-8(r19)		C L1 move counter
139	subq	u1,	v1,	r7	C U0 sub two data
140
141	beq	r8,	$fix2		C U1 fix exact zero
142$ret2:	cmpult	u1,	v1,	r23	C U0 did it borrow
143	ldq	u1,	24(r17)		C L0 get next ones
144	ldq	v1,	24(r18)		C L1
145
146	subq	u0,	v0,	r2	C U1 sub two data
147	stq	r24,	-24(r16)	C L0 put an answer
148	subq	r7,	r22,	r24	C U0 borrow from last
149	stq	r25,	-16(r16)	C L1 pair
150
151	cmpult	u0,	v0,	cy1	C U1 did it borrow
152	beq	r7,	$fix3		C U0 fix exact 0
153$ret3:	ldq	u0,	32(r17)		C L0 get next ones
154	ldq	v0,	32(r18)		C L1
155
156	bis	r31,	r31,	r31	C L  damp out
157	subq	r2,	r23,	r25	C U1 borrow from last
158	bis	r31,	r31,	r31	C L  moves in L !
159	subq	u1,	v1,	r5	C U0 sub two data
160
161	beq	r2,	$fix4		C U1 fix exact zero
162$ret4:	cmpult	u1,	v1,	cy0	C U0 did it borrow
163	ldq	u1,	40(r17)		C L0 get next ones
164	ldq	v1,	40(r18)		C L1
165
166	subq	u0,	v0,	r8	C U1 sub two data
167	stq	r24,	-8(r16)		C L0 store pair
168	subq	r5,	cy1,	r24	C U0 borrow from last
169	stq	r25,	0(r16)		C L1
170
171	cmpult	u0,	v0,	r22	C U1 did it borrow
172	beq	r5,	$fix5		C U0 fix exact zero
173$ret5:	ldq	u0,	48(r17)		C L0 get next ones
174	ldq	v0,	48(r18)		C L1
175
176	ldl	r31, 256(r17)		C L0 prefetch
177	subq	r8,	cy0,	r25	C U1 borrow from last
178	ldl	r31, 256(r18)		C L1 prefetch
179	subq	u1,	v1,	r7	C U0 sub two data
180
181	beq	r8,	$fix6		C U1 fix exact zero
182$ret6:	cmpult	u1,	v1,	r23	C U0 did it borrow
183	ldq	u1,	56(r17)		C L0 get next ones
184	ldq	v1,	56(r18)		C L1
185
186	lda	r17,	64(r17)		C L0 move pointer
187	bis	r31,	r31,	r31	C U
188	lda	r18,	64(r18)		C L1 move pointer
189	bge	r19,	$Loop		C U1 loop control
190C ==== main loop end
191
C  Wind-down: no further loads.  Completes and stores the five limbs still in
C  flight (two finished in r24/r25, one half done in r7/r23, two only loaded
C  in u0/v0 and u1/v1).  The final borrow of this group ends up in cy0.
192$Lend:	subq	u0,	v0,	r2	C U1 sub two data
193	stq	r24,	8(r16)		C L0 put an answer
194	subq	r7,	r22,	r24	C U0 borrow from last
195	stq	r25,	16(r16)		C L1 pair
196	cmpult	u0,	v0,	cy1	C U1 did it borrow
197	beq	r7,	$fix7c		C U0 fix exact 0
198$ret7c:	subq	r2,	r23,	r25	C U1 borrow from last
199	subq	u1,	v1,	r5	C U0 sub two data
200	beq	r2,	$fix0c		C U1 fix exact zero
201$ret0c:	cmpult	u1,	v1,	cy0	C U0 did it borrow
202	stq	r24,	24(r16)		C L0 store pair
203	subq	r5,	cy1,	r24	C U0 borrow from last
204	stq	r25,	32(r16)		C L1
205	beq	r5,	$fix1c		C U0 fix exact zero
206$ret1c:	stq	r24,	40(r16)		C L0 put an answer
207	lda	r16,	48(r16)		C L0 move pointer
208
C  r19 is in -8..-1 here; adding 8 gives the number of limbs not yet handled
C  (0..7).  Any that remain go through the one-limb-at-a-time code below.
209	lda	r19,	8(r19)		C r19 = limbs still to do
210	beq	r19,	$Lret		C none left: return the borrow in cy0
211
212	ldq	u1,	0(r17)		C preload the next limb pair, as $Lsmall
213	ldq	v1,	0(r18)		C expects it in u1/v1
C  Simple code for n < 5 and for the leftover limbs.  Entered with the current
C  limb pair already in u1/v1, the incoming borrow in cy0 and the limb count
C  in r19.  $Loop0 handles all but the last limb and loads one limb ahead;
C  $Lend0 handles the last limb without loading past the end of the operands.
214$Lsmall:
215	lda	r19,	-1(r19)		C number of limbs before the last one
216	beq	r19,	$Lend0		C just one limb: skip the loop
217
218	ALIGN(8)
219$Loop0:	subq	u1,	v1,	r2	C main sub
220	cmpult	u1,	v1,	r8	C compute bw from last sub
221	ldq	u1,	8(r17)
222	ldq	v1,	8(r18)
223	subq	r2,	cy0,	r5	C borrow sub
224	lda	r17,	8(r17)
225	lda	r18,	8(r18)
226	stq	r5,	0(r16)
227	cmpult	r2,	cy0,	cy0	C compute bw from last sub
228	lda	r19,	-1(r19)		C decr loop cnt
229	bis	r8,	cy0,	cy0	C combine bw from the two subs
230	lda	r16,	8(r16)
231	bne	r19,	$Loop0
232$Lend0:	subq	u1,	v1,	r2	C main sub
233	subq	r2,	cy0,	r5	C borrow sub
234	cmpult	u1,	v1,	r8	C compute bw from last sub
235	cmpult	r2,	cy0,	cy0	C compute bw from last sub
236	stq	r5,	0(r16)
237	bis	r8,	cy0,	r0	C combine bw from the two subs
238	ret	r31,(r26),1
239
240	ALIGN(8)
241$Lret:	lda	r0,	0(cy0)		C copy borrow into return register
242	ret	r31,(r26),1
243
C  Out-of-line stubs for the unlikely case that a raw limb difference was
C  exactly zero.  Each one ORs the borrow coming into that limb into the
C  register holding the limb's own raw borrow, then resumes at the matching
C  $ret label.  The f-suffixed stubs serve the feed-in, the c-suffixed ones
C  serve $Lend, and the rest serve the main loop.
244$fix5f:	bis	r23,	cy0,	r23	C bring forward borrow
245	br	r31,	$ret5f
246$fix6f:	bis	r22,	r23,	r22	C bring forward borrow
247	br	r31,	$ret6f
248$fix0:	bis	cy1,	r23,	cy1	C bring forward borrow
249	br	r31,	$ret0
250$fix1:	bis	cy0,	cy1,	cy0	C bring forward borrow
251	br	r31,	$ret1
252$fix2:	bis	r22,	cy0,	r22	C bring forward borrow
253	br	r31,	$ret2
254$fix3:	bis	r23,	r22,	r23	C bring forward borrow
255	br	r31,	$ret3
256$fix4:	bis	cy1,	r23,	cy1	C bring forward borrow
257	br	r31,	$ret4
258$fix5:	bis	cy1,	cy0,	cy0	C bring forward borrow
259	br	r31,	$ret5
260$fix6:	bis	r22,	cy0,	r22	C bring forward borrow
261	br	r31,	$ret6
262$fix7:	bis	r23,	r22,	r23	C bring forward borrow
263	br	r31,	$ret7
264$fix0c:	bis	cy1,	r23,	cy1	C bring forward borrow
265	br	r31,	$ret0c
266$fix1c:	bis	cy0,	cy1,	cy0	C bring forward borrow
267	br	r31,	$ret1c
268$fix7c:	bis	r23,	r22,	r23	C bring forward borrow
269	br	r31,	$ret7c
270
271EPILOGUE()
272ASM_END()
273