1dnl  Alpha ev6 nails mpn_addmul_3.
2
3dnl  Copyright 2002, 2006 Free Software Foundation, Inc.
4dnl
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or
8dnl  modify it under the terms of the GNU Lesser General Public License as
9dnl  published by the Free Software Foundation; either version 3 of the
10dnl  License, or (at your option) any later version.
11dnl
12dnl  The GNU MP Library is distributed in the hope that it will be useful,
13dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
14dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15dnl  Lesser General Public License for more details.
16dnl
17dnl  You should have received a copy of the GNU Lesser General Public License
18dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
20include(`../config.m4')
21
22C Runs at 3.0 cycles/limb.
23
24C With 2-way unrolling, we could probably reach 2.25 cycles/limb (3.33 insns/cycle).
25
26
27C  INPUT PARAMETERS
C  Roles below are as used by the routine body in this file.
28define(`rp',`r16')		C limbs loaded, summed into and stored back
29define(`up',`r17')		C source limbs, one loaded per loop pass
30define(`n',`r18')		C number of up limbs, counted down to zero
31define(`vp',`r19')		C three multiplier limbs at offsets 0, 8, 16
32
33C  Useful register aliases
34define(`numb_mask',`r24')	C all ones shifted right by NAIL_BITS
35define(`ulimb',`r25')		C current limb loaded through up
36define(`rlimb',`r27')		C current limb loaded through rp
37
C  Product halves: mNa receives mulq (low 64 bits) and mNb receives umulh
C  (high 64 bits) of vN times ulimb.
38define(`m0a',`r0')
39define(`m0b',`r1')
40define(`m1a',`r2')
41define(`m1b',`r3')
42define(`m2a',`r20')
43define(`m2b',`r21')
44
C  Running column sums; acc0 feeds the limb stored next, acc1 and acc2 the
C  two limbs after it.
45define(`acc0',`r4')
46define(`acc1',`r5')
47define(`acc2',`r22')
48
C  The three limbs loaded from vp, held shifted left by NAIL_BITS.
49define(`v0',`r6')
50define(`v1',`r7')
51define(`v2',`r23')
52
53C Used for temps: r8 r19 r28
C  r19 is the same register as vp; it is zeroed and reused only after all
C  three v limbs have been loaded.
54
C  Short names for the nail/numb widths.  GMP_NAIL_BITS and GMP_NUMB_BITS are
C  not defined in this file (presumably they come from config.m4).
55define(`NAIL_BITS',`GMP_NAIL_BITS')
56define(`NUMB_BITS',`GMP_NUMB_BITS')
57
58C  This declaration is munged by configure
C  NOTE(review): 3-63 is presumably the range of nail widths this code
C  supports; keep the line format unchanged since configure parses it.
59NAILS_SUPPORT(3-63)
60
61ASM_START()
C mpn_addmul_3 (nails build, Alpha ev6)
C
C For each of the n limbs read through up, multiplies it by the three limbs
C v0, v1, v2 read from vp, adds the limb already stored at rp, and stores the
C numb part of the column sum back through rp.  After the last u limb, two
C further limbs are stored (not added) at the next two rp slots, and the
C remaining high part is left in m0a (r0) when the routine returns.
C
C NOTE(review): n is decremented before it is first tested, so n = 0 is not
C handled here; presumably callers guarantee n >= 1 -- confirm.
C
C The loop is software pipelined: the six product halves consumed in a pass
C were issued in the previous pass (or in the setup code), and each old half
C is read just before the mulq/umulh that overwrites the same register.
C Instruction order is therefore significant.
C
C The U0/U1/L0/L1 tags on instructions are the original scheduling
C annotations (presumably ev6 issue slots) and are kept unchanged.
62PROLOGUE(mpn_addmul_3)
63	lda	numb_mask,-1(r31)		C all ones (r31 reads as zero)
64	srl	numb_mask,NAIL_BITS,numb_mask	C clear the top NAIL_BITS bits
65
C Load the three multiplier limbs while vp (r19) is still live.
66	ldq	v0,	0(vp)
67	ldq	v1,	8(vp)
68	ldq	v2,	16(vp)
69
C Zero the accumulators and pre-shift each v limb left by NAIL_BITS.
C NOTE(review): assuming NAIL_BITS + NUMB_BITS = 64 and v limbs with clear
C nail bits, the shift makes umulh return the product bits above the numb
C boundary, and mulq followed by srl NAIL_BITS the numb-sized low part.
70	bis	r31,	r31,	acc0		C	zero acc0
71	sll	v0,NAIL_BITS,	v0
72	bis	r31,	r31,	acc1		C	zero acc1
73	sll	v1,NAIL_BITS,	v1
74	bis	r31,	r31,	acc2		C	zero acc2
75	sll	v2,NAIL_BITS,	v2
76	bis	r31,	r31,	r19		C zero r19; from here on it holds the nail carry, not vp
77
C Prime the pipeline: issue all six product halves for the first u limb.
78	ldq	ulimb,	0(up)
79	lda	up,	8(up)
80	mulq	v0,	ulimb,	m0a		C U1
81	umulh	v0,	ulimb,	m0b		C U1
82	mulq	v1,	ulimb,	m1a		C U1
83	umulh	v1,	ulimb,	m1b		C U1
84	lda	n,	-1(n)			C count the u limb just loaded
85	mulq	v2,	ulimb,	m2a		C U1
86	umulh	v2,	ulimb,	m2b		C U1
87	beq	n,	L(end)			C U0	only one u limb: skip the loop
88
89	ALIGN(16)
C Main loop.  On entry m0a..m2b hold the products for the previous u limb
C and r19 holds the carry (bits above the numb part) of the previous column.
90L(top):	ldq	rlimb,	0(rp)			C L1
91	ldq	ulimb,	0(up)			C L0
92	bis	r31,	r31,	r31		C U0	nop
93	addq	r19,	acc0,	acc0		C U1	propagate nail
94
C r8 = low part of the pending v0 product; read m0a before mulq reissues
C into it for the new u limb.
95	lda	rp,	8(rp)			C L1
96	srl	m0a,NAIL_BITS,	r8		C U0
97	lda	up,	8(up)			C L0
98	mulq	v0,	ulimb,	m0a		C U1
99
C r19 = column sum so far.  acc0 is rebuilt for the next column from the
C high v0 half plus the old acc1 (m0b is read before umulh overwrites it).
100	addq	r8,	acc0,	r19		C U0
101	addq	m0b,	acc1,	acc0		C L1
102	umulh	v0,	ulimb,	m0b		C U1
103	bis	r31,	r31,	r31		C L0	nop
104
C Add in the rp limb; fetch the low part of the pending v1 product.
105	addq	rlimb,	r19,	r19		C L1
106	srl	m1a,NAIL_BITS,	r8		C U0
107	bis	r31,	r31,	r31		C L0	nop
108	mulq	v1,	ulimb,	m1a		C U1
109
C acc0 takes the low v1 part; acc1 is rebuilt from the high v1 half plus the
C old acc2; r28 is the limb that will be stored for this column.
110	addq	r8,	acc0,	acc0		C U0
111	addq	m1b,	acc2,	acc1		C L1
112	umulh	v1,	ulimb,	m1b		C U1
113	and	r19,numb_mask,	r28		C L0	extract numb part
114
C Fetch the low part of the pending v2 product; count the u limb just loaded.
115	bis	r31,	r31,	r31		C L1	nop
116	srl	m2a,NAIL_BITS,	r8		C U0
117	lda	n,	-1(n)			C L0
118	mulq	v2,	ulimb,	m2a		C U1
119
C acc1 takes the low v2 part; acc2 restarts from the high v2 half; r19 keeps
C the bits above the numb part as the carry into the next column.
120	addq	r8,	acc1,	acc1		C L0
121	bis	r31,	m2b,	acc2		C L1
122	umulh	v2,	ulimb,	m2b		C U1
123	srl	r19,NUMB_BITS,	r19		C U0	extract nail part
124
C rp was already advanced above, so the limb for this column goes to -8(rp).
125	stq	r28,	-8(rp)			C L
126	bne	n,	L(top)			C U0
127
C Wind-down: the same column computation as one loop pass, but without
C loading a new u limb or issuing new multiplies.
128L(end):	ldq	rlimb,	0(rp)
129	addq	r19,	acc0,	acc0		C	propagate nail
130	lda	rp,	8(rp)
131	srl	m0a,NAIL_BITS,	r8		C U0
132	addq	r8,	acc0,	r19
133	addq	m0b,	acc1,	acc0
134	addq	rlimb,	r19,	r19
135	srl	m1a,NAIL_BITS,	r8		C U0
136	addq	r8,	acc0,	acc0
137	addq	m1b,	acc2,	acc1
138	and	r19,numb_mask,	r28		C extract limb
139	srl	m2a,NAIL_BITS,	r8		C U0
140	addq	r8,	acc1,	acc1
141	bis	r31,	m2b,	acc2
142	srl	r19,NUMB_BITS,	r19		C extract nail
143	stq	r28,	-8(rp)
144
C Flush acc0: its numb part is stored at 0(rp) without reading the old
C contents, and its upper bits carry into acc1.
145	addq	r19,	acc0,	acc0		C propagate nail
146	and	acc0,numb_mask,	r28
147	stq	r28,	0(rp)
148	srl	acc0,NUMB_BITS,	r19
149	addq	r19,	acc1,	acc1
150
C Flush acc1 the same way to 8(rp); what remains (carry plus acc2) is left
C unmasked in m0a (r0).
151	and	acc1,numb_mask,	r28
152	stq	r28,	8(rp)
153	srl	acc1,NUMB_BITS,	r19
154	addq	r19,	acc2,	m0a
155
156	ret	r31,	(r26),	1		C return through the address in r26
157EPILOGUE()
158ASM_END()
159