1dnl  Alpha ev6 nails mpn_addmul_3.
2
3dnl  Copyright 2002, 2006 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33C Runs at 3.0 cycles/limb.
34
35C With 2-way unrolling, we could probably reach 2.25 c/l (3.33 i/c).
36
37
38C  INPUT PARAMETERS
C  rp  limb vector that is read, accumulated into and stored back
C  up  limb vector that is multiplied; one limb is loaded per iteration
C  n   number of up limbs; it is decremented before its first test, so the
C      code below needs n >= 1 on entry
C  vp  three multiplier limbs, loaded once from offsets 0, 8 and 16
39define(`rp',`r16')
40define(`up',`r17')
41define(`n',`r18')
42define(`vp',`r19')
43
44C  Useful register aliases
C  numb_mask  all ones shifted right by NAIL_BITS; masks a sum to its numb part
C  ulimb      limb most recently loaded from up
C  rlimb      limb most recently loaded from rp
45define(`numb_mask',`r24')
46define(`ulimb',`r25')
47define(`rlimb',`r27')
48
C  mXa / mXb receive the mulq (low half) / umulh (high half) of vX * ulimb.
49define(`m0a',`r0')
50define(`m0b',`r1')
51define(`m1a',`r2')
52define(`m1b',`r3')
53define(`m2a',`r20')
54define(`m2b',`r21')
55
C  Partial sums carried from one up limb to the next; acc0 is the column that
C  is retired next, acc2 the most significant one.
56define(`acc0',`r4')
57define(`acc1',`r5')
58define(`acc2',`r22')
59
C  The three multiplier limbs, held shifted left by NAIL_BITS for the whole
C  routine.
60define(`v0',`r6')
61define(`v1',`r7')
62define(`v2',`r23')
63
64C Used for temps: r8 r19 r28
C   r8   a low product half shifted right by NAIL_BITS
C   r19  current column sum, then the nail carry into the next column.  This
C        is the same register as vp; it is only reused after v0..v2 are loaded.
C   r28  numb part of a column sum, ready to be stored
65
66define(`NAIL_BITS',`GMP_NAIL_BITS')
67define(`NUMB_BITS',`GMP_NUMB_BITS')
68
69C  This declaration is munged by configure
70NAILS_SUPPORT(3-63)
71
72ASM_START()
C mpn_addmul_3 -- multiply the n limbs at up by the three limbs at vp and
C add the product into the limbs at rp, using the nails limb format.
C
C One rp limb is retired per up limb.  After the last up limb two further
C limbs are stored at the next two rp positions, and whatever is left above
C them is placed in m0a (r0) just before the ret -- presumably the function's
C return value; confirm against the C prototype.
C
C v0..v2 are shifted left by NAIL_BITS once, up front.  Assuming NAIL_BITS +
C NUMB_BITS is the 64-bit register width and that the up and vp limbs have
C clear nail bits (TODO confirm callers guarantee this), mulq >> NAIL_BITS
C is then the low NUMB_BITS of each limb product and umulh is everything
C above those bits.
C
C Precondition visible in the code: the first up limb is loaded
C unconditionally and n is decremented before it is tested, so n must be at
C least 1 on entry.
C
C The U0/U1/L0/L1 tags are the original author's; they appear to name the
C intended ev6 issue slot of each instruction.  Each product register is
C overwritten by the next mulq/umulh right after its last read, so the
C instruction order must not be changed casually.
73PROLOGUE(mpn_addmul_3)
C numb_mask = ~0 >> NAIL_BITS
74	lda	numb_mask,-1(r31)
75	srl	numb_mask,NAIL_BITS,numb_mask
76
C Load the three multiplier limbs.  vp (r19) is not read again after this.
77	ldq	v0,	0(vp)
78	ldq	v1,	8(vp)
79	ldq	v2,	16(vp)
80
C Clear the accumulators and the nail carry, and pre-shift the multipliers.
81	bis	r31,	r31,	acc0		C	zero acc0
82	sll	v0,NAIL_BITS,	v0
83	bis	r31,	r31,	acc1		C	zero acc1
84	sll	v1,NAIL_BITS,	v1
85	bis	r31,	r31,	acc2		C	zero acc2
86	sll	v2,NAIL_BITS,	v2
87	bis	r31,	r31,	r19
88
C Prime the loop: form the three products for up[0] and count that limb.
C If it was the only limb, go straight to the wind-down code.
89	ldq	ulimb,	0(up)
90	lda	up,	8(up)
91	mulq	v0,	ulimb,	m0a		C U1
92	umulh	v0,	ulimb,	m0b		C U1
93	mulq	v1,	ulimb,	m1a		C U1
94	umulh	v1,	ulimb,	m1b		C U1
95	lda	n,	-1(n)
96	mulq	v2,	ulimb,	m2a		C U1
97	umulh	v2,	ulimb,	m2b		C U1
98	beq	n,	L(end)			C U0
99
C On entry to L(top) (and to L(end)) mXa/mXb hold the products for the up
C limb loaded previously, acc0..acc2 the sums carried over, r19 the nail
C carry out of the limb stored last.  One pass computes
C   r19  = r19 + acc0 + (m0a >> NAIL_BITS) + rlimb   numb part stored to rp
C   acc0 = m0b + acc1 + (m1a >> NAIL_BITS)
C   acc1 = m1b + acc2 + (m2a >> NAIL_BITS)
C   acc2 = m2b
C   r19  = r19 >> NUMB_BITS                          carry for the next pass
C while the products for the newly loaded up limb are started.
100	ALIGN(16)
101L(top):	ldq	rlimb,	0(rp)			C L1
102	ldq	ulimb,	0(up)			C L0
103	bis	r31,	r31,	r31		C U0	nop
104	addq	r19,	acc0,	acc0		C U1	propagate nail
105
C rp and up are advanced here, so the store at the bottom uses -8(rp).
106	lda	rp,	8(rp)			C L1
107	srl	m0a,NAIL_BITS,	r8		C U0
108	lda	up,	8(up)			C L0
109	mulq	v0,	ulimb,	m0a		C U1
110
111	addq	r8,	acc0,	r19		C U0
112	addq	m0b,	acc1,	acc0		C L1
113	umulh	v0,	ulimb,	m0b		C U1
114	bis	r31,	r31,	r31		C L0	nop
115
116	addq	rlimb,	r19,	r19		C L1
117	srl	m1a,NAIL_BITS,	r8		C U0
118	bis	r31,	r31,	r31		C L0	nop
119	mulq	v1,	ulimb,	m1a		C U1
120
121	addq	r8,	acc0,	acc0		C U0
122	addq	m1b,	acc2,	acc1		C L1
123	umulh	v1,	ulimb,	m1b		C U1
124	and	r19,numb_mask,	r28		C L0	extract numb part
125
126	bis	r31,	r31,	r31		C L1	nop
127	srl	m2a,NAIL_BITS,	r8		C U0
128	lda	n,	-1(n)			C L0
129	mulq	v2,	ulimb,	m2a		C U1
130
131	addq	r8,	acc1,	acc1		C L0
132	bis	r31,	m2b,	acc2		C L1
133	umulh	v2,	ulimb,	m2b		C U1
134	srl	r19,NUMB_BITS,	r19		C U0	extract nail part
135
136	stq	r28,	-8(rp)			C L
137	bne	n,	L(top)			C U0
138
C Wind-down: the same recurrence as one loop pass for the last up limb, but
C with no further up load and no new products.
139L(end):	ldq	rlimb,	0(rp)
140	addq	r19,	acc0,	acc0		C	propagate nail
141	lda	rp,	8(rp)
142	srl	m0a,NAIL_BITS,	r8		C U0
143	addq	r8,	acc0,	r19
144	addq	m0b,	acc1,	acc0
145	addq	rlimb,	r19,	r19
146	srl	m1a,NAIL_BITS,	r8		C U0
147	addq	r8,	acc0,	acc0
148	addq	m1b,	acc2,	acc1
149	and	r19,numb_mask,	r28		C extract limb
150	srl	m2a,NAIL_BITS,	r8		C U0
151	addq	r8,	acc1,	acc1
152	bis	r31,	m2b,	acc2
153	srl	r19,NUMB_BITS,	r19		C extract nail
154	stq	r28,	-8(rp)
155
C Flush the accumulators.  acc0 plus carry is stored at 0(rp) without first
C loading that location, and its overflow above NUMB_BITS moves into acc1.
156	addq	r19,	acc0,	acc0		C propagate nail
157	and	acc0,numb_mask,	r28
158	stq	r28,	0(rp)
159	srl	acc0,NUMB_BITS,	r19
160	addq	r19,	acc1,	acc1
161
C acc1 is stored the same way at 8(rp); its overflow plus acc2 is left
C unmasked in m0a (r0).
162	and	acc1,numb_mask,	r28
163	stq	r28,	8(rp)
164	srl	acc1,NUMB_BITS,	r19
165	addq	r19,	acc2,	m0a
166
167	ret	r31,	(r26),	1
168EPILOGUE()
169ASM_END()
170