1dnl  Alpha mpn_add_n -- Add two limb vectors of the same length > 0 and
2dnl  store sum in a third limb vector.
3
4dnl  Copyright 1995, 1999, 2000, 2005 Free Software Foundation, Inc.
5
6dnl  This file is part of the GNU MP Library.
7
8dnl  The GNU MP Library is free software; you can redistribute it and/or modify
9dnl  it under the terms of the GNU Lesser General Public License as published
10dnl  by the Free Software Foundation; either version 3 of the License, or (at
11dnl  your option) any later version.
12
13dnl  The GNU MP Library is distributed in the hope that it will be useful, but
14dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
16dnl  License for more details.
17
18dnl  You should have received a copy of the GNU Lesser General Public License
19dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
20
21include(`../config.m4')
22
23C      cycles/limb
24C EV4:     ?
25C EV5:     4.75
26C EV6:     3
27
28dnl  INPUT PARAMETERS
29dnl  res_ptr	r16
30dnl  s1_ptr	r17
31dnl  s2_ptr	r18
32dnl  size	r19
33
34ASM_START()
C mpn_add_n: add the size-limb vectors at s1_ptr and s2_ptr, store the
C size-limb sum at res_ptr, and return the carry out of the top limb in r0.
C
C Register use in this routine:
C   r16        res_ptr, advanced as sum limbs are stored
C   r17, r18   s1_ptr and s2_ptr, advanced as source limbs are loaded
C   r19        limb count still to load, kept biased by -4 during the 1st loop
C   r0-r3      limbs loaded from s2_ptr
C   r4-r7      limbs loaded from s1_ptr
C   r20-r23    finished sum limbs waiting to be stored
C   r28        raw sum of two source limbs, before the carry-in is added
C   r8         carry out of the main add (raw sum < one of its addends)
C   r25        carry out of the carry add, then the combined carry
C   r31        used as an all-zero source operand
C   r26        address that the final ret jumps through
C
C Each limb takes two adds: the main add of the two source limbs, then the
C carry add of the incoming carry.  Either add can wrap, but never both for
C the same limb, so the two cmpult results are merged with bis to form the
C carry into the next limb.
35PROLOGUE(mpn_add_n)
36	bis	r31,r31,r25		C clear cy
37	subq	r19,4,r19		C decr loop cnt
38	blt	r19,$Lend2		C if less than 4 limbs, goto 2nd loop
39C Start software pipeline for 1st loop
C Load the first 4 limbs of each source and compute sum limbs 1 and 2.  The
C carry-in is zero here, so the 1st limb needs no carry add.  s1_ptr is
C bumped before its last two loads, hence their negative offsets.
40	ldq	r0,0(r18)
41	ldq	r4,0(r17)
42	ldq	r1,8(r18)
43	ldq	r5,8(r17)
44	addq	r17,32,r17		C update s1_ptr
45	ldq	r2,16(r18)
46	addq	r0,r4,r20		C 1st main add
47	ldq	r3,24(r18)
48	subq	r19,4,r19		C decr loop cnt
49	ldq	r6,-16(r17)
50	cmpult	r20,r0,r25		C compute cy from last add
51	ldq	r7,-8(r17)
52	addq	r1,r5,r28		C 2nd main add
53	addq	r18,32,r18		C update s2_ptr
54	addq	r28,r25,r21		C 2nd carry add
55	cmpult	r28,r5,r8		C compute cy from last add
56	blt	r19,$Lend1		C if less than 4 limbs remain, jump
57C 1st loop handles groups of 4 limbs in a software pipeline
C On entry to each iteration r20 holds finished sum limb 1 and r21 holds sum
C limb 2, whose carry out is still to be derived from r21, r28 and r8.
C r2,r3 and r6,r7 hold source limbs 3 and 4 of the current group, and s1_ptr
C and s2_ptr already point at the next group.  The body finishes limbs 3 and
C 4, stores the whole group, loads the next group and starts its limbs 1 and 2.
58	ALIGN(16)
59$Loop:	cmpult	r21,r28,r25		C compute cy from last add
60	ldq	r0,0(r18)
61	bis	r8,r25,r25		C combine cy from the two adds
62	ldq	r1,8(r18)
63	addq	r2,r6,r28		C 3rd main add
64	ldq	r4,0(r17)
65	addq	r28,r25,r22		C 3rd carry add
66	ldq	r5,8(r17)
67	cmpult	r28,r6,r8		C compute cy from last add
68	cmpult	r22,r28,r25		C compute cy from last add
69	stq	r20,0(r16)
70	bis	r8,r25,r25		C combine cy from the two adds
71	stq	r21,8(r16)
72	addq	r3,r7,r28		C 4th main add
73	addq	r28,r25,r23		C 4th carry add
74	cmpult	r28,r7,r8		C compute cy from last add
75	cmpult	r23,r28,r25		C compute cy from last add
C The extra-indented addq lines are pointer updates slotted in between the
C adds.  Sum limbs 3 and 4 are stored after res_ptr is bumped, and source
C limbs 3 and 4 of the next group are loaded after s1_ptr is bumped, which is
C why those accesses use negative offsets.
76		addq	r17,32,r17		C update s1_ptr
77	bis	r8,r25,r25		C combine cy from the two adds
78		addq	r16,32,r16		C update res_ptr
79	addq	r0,r4,r28		C 1st main add
80	ldq	r2,16(r18)
81	addq	r25,r28,r20		C 1st carry add
82	ldq	r3,24(r18)
83	cmpult	r28,r4,r8		C compute cy from last add
84	ldq	r6,-16(r17)
85	cmpult	r20,r28,r25		C compute cy from last add
86	ldq	r7,-8(r17)
87	bis	r8,r25,r25		C combine cy from the two adds
88	subq	r19,4,r19		C decr loop cnt
89	stq	r22,-16(r16)
90	addq	r1,r5,r28		C 2nd main add
91	stq	r23,-8(r16)
92	addq	r25,r28,r21		C 2nd carry add
93		addq	r18,32,r18		C update s2_ptr
94	cmpult	r28,r5,r8		C compute cy from last add
95	bge	r19,$Loop
96C Finish software pipeline for 1st loop
C No further group is loaded here: finish limbs 3 and 4 of the group in
C flight and store all four of its sum limbs.  r25 is left holding the carry
C out of that group.
97$Lend1:	cmpult	r21,r28,r25		C compute cy from last add
98	bis	r8,r25,r25		C combine cy from the two adds
99	addq	r2,r6,r28		C 3rd main add
100	addq	r28,r25,r22		C 3rd carry add
101	cmpult	r28,r6,r8		C compute cy from last add
102	cmpult	r22,r28,r25		C compute cy from last add
103	stq	r20,0(r16)
104	bis	r8,r25,r25		C combine cy from the two adds
105	stq	r21,8(r16)
106	addq	r3,r7,r28		C 4th main add
107	addq	r28,r25,r23		C 4th carry add
108	cmpult	r28,r7,r8		C compute cy from last add
109	cmpult	r23,r28,r25		C compute cy from last add
110	bis	r8,r25,r25		C combine cy from the two adds
111	addq	r16,32,r16		C update res_ptr
112	stq	r22,-16(r16)
113	stq	r23,-8(r16)
C Reached either straight from the entry test or after the 1st loop.  Adding
C 4 back gives the number of limbs still to do, 0 to 3.  If none are left,
C return with the carry already in r25.
114$Lend2:	addq	r19,4,r19		C restore loop cnt
115	beq	r19,$Lret
116C Start software pipeline for 2nd loop
C Load the first remaining limb pair.  If it is the only one, go straight to
C the final add at $Lend0.
117	ldq	r0,0(r18)
118	ldq	r4,0(r17)
119	subq	r19,1,r19
120	beq	r19,$Lend0
121C 2nd loop handles remaining 1-3 limbs
C Each pass adds the pair loaded earlier while loading the next pair, so the
C last pair is left for $Lend0, which does no loads and therefore never reads
C beyond the last source limb.
122	ALIGN(16)
123$Loop0:	addq	r0,r4,r28		C main add
124	ldq	r0,8(r18)
125	cmpult	r28,r4,r8		C compute cy from last add
126	ldq	r4,8(r17)
127	addq	r28,r25,r20		C carry add
128	addq	r18,8,r18
129	addq	r17,8,r17
130	stq	r20,0(r16)
131	cmpult	r20,r28,r25		C compute cy from last add
132	subq	r19,1,r19		C decr loop cnt
133	bis	r8,r25,r25		C combine cy from the two adds
134	addq	r16,8,r16
135	bne	r19,$Loop0
C Final limb: same add and carry sequence, with no further loads or pointer
C updates.
136$Lend0:	addq	r0,r4,r28		C main add
137	addq	r28,r25,r20		C carry add
138	cmpult	r28,r4,r8		C compute cy from last add
139	cmpult	r20,r28,r25		C compute cy from last add
140	stq	r20,0(r16)
141	bis	r8,r25,r25		C combine cy from the two adds
142
C Copy the final carry, which is 0 or 1, into r0 and return through r26.
143$Lret:	bis	r25,r31,r0		C return cy
144	ret	r31,(r26),1
145EPILOGUE(mpn_add_n)
146ASM_END()
147