1dnl  Alpha ev6 nails mpn_addmul_4.
2
3dnl  Copyright 2002, 2005, 2006 Free Software Foundation, Inc.
4dnl
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or
8dnl  modify it under the terms of the GNU Lesser General Public License as
9dnl  published by the Free Software Foundation; either version 3 of the
10dnl  License, or (at your option) any later version.
11dnl
12dnl  The GNU MP Library is distributed in the hope that it will be useful,
13dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
14dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15dnl  Lesser General Public License for more details.
16dnl
17dnl  You should have received a copy of the GNU Lesser General Public License
18dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
20include(`../config.m4')
21
22C Runs at 2.5 cycles/limb.
23
24C We should go for 2-way unrolling over 17 cycles, for 2.125 c/l corresponding
25C to 3.24 insn/cycle.
26
27
28C  INPUT PARAMETERS
29define(`rp',`r16')
30define(`up',`r17')
31define(`n',`r18')
32define(`vp',`r19')
C  vp is only dereferenced for the four multiplier loads at the start of the
C  routine.  After those loads r19 is cleared and reused as the nail carry
C  that is passed from one result limb to the next.
33
34C  Useful register aliases
C  numb_mask: all ones shifted right by NAIL_BITS, used to keep the numb part
C  ulimb:     current source limb loaded from up
C  rlimb:     current destination limb loaded from rp
35define(`numb_mask',`r24')
36define(`ulimb',`r25')
37define(`rlimb',`r27')
38
C  mNa / mNb receive the mulq (low) and umulh (high) results of vN * ulimb.
C  m3a and m3b live in r12 and r13, which the routine saves and restores.
39define(`m0a',`r0')
40define(`m0b',`r1')
41define(`m1a',`r2')
42define(`m1b',`r3')
43define(`m2a',`r20')
44define(`m2b',`r21')
45define(`m3a',`r12')
46define(`m3b',`r13')
47
C  acc0..acc3 are the running partial sums; acc0 feeds the limb stored next.
C  acc3 lives in r14, which the routine saves and restores.
48define(`acc0',`r4')
49define(`acc1',`r5')
50define(`acc2',`r22')
51define(`acc3',`r14')
52
C  v0..v3 hold the four limbs loaded from vp, each shifted left by NAIL_BITS.
C  v3 lives in r15, which the routine saves and restores.
53define(`v0',`r6')
54define(`v1',`r7')
55define(`v2',`r23')
56define(`v3',`r15')
57
58C Used for temps: r8 r19 r28
C  r8:  a low product half shifted right by NAIL_BITS
C  r19: sum for the current result limb, then its bits above NUMB_BITS
C  r28: the masked limb value that is stored to rp
59
C  Short names for GMP_NAIL_BITS and GMP_NUMB_BITS.  Neither is defined in this
C  file; presumably they come from the included config.m4 -- TODO confirm.
60define(`NAIL_BITS',`GMP_NAIL_BITS')
61define(`NUMB_BITS',`GMP_NUMB_BITS')
62
63C  This declaration is munged by configure
64NAILS_SUPPORT(4-63)
65
66ASM_START()
67PROLOGUE(mpn_addmul_4)
C  mpn_addmul_4: multiply the n limbs at up by the four limbs at vp and add
C  the product to the n limbs at rp.
C
C  Arguments: rp = r16, up = r17, n = r18, vp = r19.
C  Memory:    rp[0] .. rp[n-1] are read and rewritten.  rp[n], rp[n+1] and
C             rp[n+2] are written without being read first.
C  Return:    r0 = acc3 plus the carry out of rp[n+2].
C  Stored limbs are masked with numb_mask, so their top NAIL_BITS are zero.
C  There is no test for n = 0: one limb of up is always loaded and multiplied,
C  so the caller must pass n >= 1 (presumably guaranteed by callers -- verify).

C  Reserve 240 bytes below r30 and save r12-r15 there.  These four registers
C  are overwritten below through the m3a, m3b, acc3 and v3 aliases and are
C  reloaded just before the return.
	lda	r30,	-240(r30)
	stq	r12,	32(r30)
	stq	r13,	40(r30)
	stq	r14,	48(r30)
	stq	r15,	56(r30)

C  numb_mask = -1 shifted right by NAIL_BITS (r31 reads as zero).
	lda	numb_mask,-1(r31)
	srl	numb_mask,NAIL_BITS,numb_mask

C  Load the four multiplier limbs.  vp (r19) is not referenced after this.
	ldq	v0,	0(vp)
	ldq	v1,	8(vp)
	ldq	v2,	16(vp)
	ldq	v3,	24(vp)

C  Clear the accumulators and shift each multiplier limb left by NAIL_BITS.
C  With that pre-shift the mulq result shifted right by NAIL_BITS is the low
C  part of v*u and the umulh result is everything above it.  (This split is at
C  bit NUMB_BITS only if NUMB_BITS + NAIL_BITS = 64 -- TODO confirm.)
	bis	r31,	r31,	acc0		C	zero acc0
	sll	v0,NAIL_BITS,	v0
	bis	r31,	r31,	acc1		C	zero acc1
	sll	v1,NAIL_BITS,	v1
	bis	r31,	r31,	acc2		C	zero acc2
	sll	v2,NAIL_BITS,	v2
	bis	r31,	r31,	acc3		C	zero acc3
	sll	v3,NAIL_BITS,	v3
C  From here on r19 is the nail carry between result limbs; it starts at zero.
	bis	r31,	r31,	r19

C  Prime the pipeline: start all eight products for up[0], advance up, and
C  decrement n so that it counts the limbs of up still to be loaded.
	ldq	ulimb,	0(up)
	lda	up,	8(up)
	mulq	v0,	ulimb,	m0a		C U1
	umulh	v0,	ulimb,	m0b		C U1
	mulq	v1,	ulimb,	m1a		C U1
	umulh	v1,	ulimb,	m1b		C U1
	lda	n,	-1(n)
	mulq	v2,	ulimb,	m2a		C U1
	umulh	v2,	ulimb,	m2b		C U1
	mulq	v3,	ulimb,	m3a		C U1
	umulh	v3,	ulimb,	m3b		C U1
C  If that was the only limb of up, go straight to the wind-down code.
	beq	n,	L(end)			C U0

C  Main loop.  Each pass consumes the products of the previous up limb, stores
C  one result limb, and starts the products of the next up limb.
C  On entry to every pass:
C    m0a..m3b  product halves of the previous up limb with v0..v3
C    acc0..acc3  partial sums carried over from earlier passes
C    r19  bits above NUMB_BITS of the previously stored limb
C  Each mulq or umulh that overwrites an mNa or mNb register is placed after
C  the instruction that reads the old value, so the order must be kept.
	ALIGN(16)
L(top):	bis	r31,	r31,	r31		C U1	nop
	ldq	rlimb,	0(rp)			C L0
	ldq	ulimb,	0(up)			C L1
	addq	r19,	acc0,	acc0		C U0	propagate nail

	bis	r31,	r31,	r31		C L0	nop
	bis	r31,	r31,	r31		C U1	nop
	bis	r31,	r31,	r31		C L1	nop
	bis	r31,	r31,	r31		C U0	nop

C  rp is advanced here, so the store of this limb further down uses -8(rp).
	lda	rp,	8(rp)			C L0
	srl	m0a,NAIL_BITS,	r8		C U0
	lda	up,	8(up)			C L1
	mulq	v0,	ulimb,	m0a		C U1

C  r19 = low part of v0*u + acc0; the accumulators then move down one place,
C  each picking up the high product half of the next multiplier limb.
	addq	r8,	acc0,	r19		C U0
	addq	m0b,	acc1,	acc0		C L0
	umulh	v0,	ulimb,	m0b		C U1
	bis	r31,	r31,	r31		C L1	nop

C  Add in the destination limb loaded at the top of the pass.
	addq	rlimb,	r19,	r19		C L0
	srl	m1a,NAIL_BITS,	r8		C U0
	bis	r31,	r31,	r31		C L1	nop
	mulq	v1,	ulimb,	m1a		C U1

	addq	r8,	acc0,	acc0		C U0
	addq	m1b,	acc2,	acc1		C L0
	umulh	v1,	ulimb,	m1b		C U1
	and	r19,numb_mask,	r28		C L1	extract numb part

	bis	r31,	r31,	r31		C L0	nop
	srl	m2a,NAIL_BITS,	r8		C U0
	lda	n,	-1(n)			C L1
	mulq	v2,	ulimb,	m2a		C U1

	addq	r8,	acc1,	acc1		C L1
	addq	m2b,	acc3,	acc2		C L0
	umulh	v2,	ulimb,	m2b		C U1
	srl	r19,NUMB_BITS,	r19		C U0	extract nail part

	bis	r31,	r31,	r31		C L0	nop
	srl	m3a,NAIL_BITS,	r8		C U0
	stq	r28,	-8(rp)			C L1
	mulq	v3,	ulimb,	m3a		C U1

C  acc3 is reloaded from m3b (bis with r31 is a register copy) before the
C  umulh below overwrites m3b.
	addq	r8,	acc2,	acc2		C L0
	bis	r31,	m3b,	acc3		C L1
	umulh	v3,	ulimb,	m3b		C U1
	bne	n,	L(top)			C U0

C  Wind-down: the same accumulation as one loop pass for the products of the
C  last up limb, but without loading another up limb or starting new products.
C  This stores rp[n-1] through -8(rp).
L(end):	ldq	rlimb,	0(rp)
	addq	r19,	acc0,	acc0		C	propagate nail
	lda	rp,	8(rp)			C FIXME: DELETE
	srl	m0a,NAIL_BITS,	r8		C U0
	addq	r8,	acc0,	r19
	addq	m0b,	acc1,	acc0
	addq	rlimb,	r19,	r19
	srl	m1a,NAIL_BITS,	r8		C U0
	addq	r8,	acc0,	acc0
	addq	m1b,	acc2,	acc1
	and	r19,numb_mask,	r28		C extract limb
	srl	m2a,NAIL_BITS,	r8		C U0
	addq	r8,	acc1,	acc1
	addq	m2b,	acc3,	acc2
	srl	r19,NUMB_BITS,	r19		C extract nail
	srl	m3a,NAIL_BITS,	r8		C U0
	stq	r28,	-8(rp)
	addq	r8,	acc2,	acc2
	bis	r31,	m3b,	acc3

C  Flush the accumulators.  Each one takes the carry from the limb below it,
C  is masked and stored, and passes its own bits above NUMB_BITS upwards.
C  These three stores write rp[n], rp[n+1] and rp[n+2].
	addq	r19,	acc0,	acc0		C propagate nail
	and	acc0,numb_mask,	r28
	stq	r28,	0(rp)
	srl	acc0,NUMB_BITS,	r19
	addq	r19,	acc1,	acc1

	and	acc1,numb_mask,	r28
	stq	r28,	8(rp)
	srl	acc1,NUMB_BITS,	r19
	addq	r19,	acc2,	acc2

C  The final sum is left unmasked in r0 as the return value.
	and	acc2,numb_mask,	r28
	stq	r28,	16(rp)
	srl	acc2,NUMB_BITS,	r19
	addq	r19,	acc3,	r0

C  Reload r12-r15 from the save area, release the 240 bytes, return via r26.
	ldq	r12,	32(r30)
	ldq	r13,	40(r30)
	ldq	r14,	48(r30)
	ldq	r15,	56(r30)
	lda	r30,	240(r30)
	ret	r31,	(r26),	1
198EPILOGUE()
199ASM_END()
200