1dnl  Alpha ev6 nails mpn_addmul_2.
2
3dnl  Copyright 2002, 2005, 2006 Free Software Foundation, Inc.
4dnl
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or
8dnl  modify it under the terms of the GNU Lesser General Public License as
9dnl  published by the Free Software Foundation; either version 3 of the
10dnl  License, or (at your option) any later version.
11dnl
12dnl  The GNU MP Library is distributed in the hope that it will be useful,
13dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
14dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15dnl  Lesser General Public License for more details.
16dnl
17dnl  You should have received a copy of the GNU Lesser General Public License
18dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
20include(`../config.m4')
21
22C Runs at 4.0 cycles/limb.
23
24C We could either go for 2-way unrolling over 11 cycles, or 2.75 c/l,
25C or 4-way unrolling over 20 cycles, for 2.5 c/l.
26
27
28C  INPUT PARAMETERS
C  rp  limbs that the code loads, adds the product into, and stores back
C  up  limbs that are only loaded; each one is multiplied by both v0 and v1
C  n   limb count, decremented once for every up limb taken
C  vp  address of the two multiplier limbs, read at offsets 0 and 8
29define(`rp',`r16')
30define(`up',`r17')
31define(`n',`r18')
32define(`vp',`r19')
33
34C  Useful register aliases
C  numb_mask  all ones shifted right by NAIL_BITS; ANDed into every limb
C             before it is stored
C  ulimb      limb most recently loaded from up
C  rlimb      limb most recently loaded from rp
35define(`numb_mask',`r24')
36define(`ulimb',`r25')
37define(`rlimb',`r27')
38
C  m0a/m0b hold the mulq/umulh results of v0 * ulimb, m1a/m1b those of
C  v1 * ulimb.  m0a is r0, which is also the register the routine writes
C  its final result to; that write comes after the last read of m0a.
39define(`m0a',`r0')
40define(`m0b',`r1')
41define(`m1a',`r2')
42define(`m1b',`r3')
43
C  acc0 collects the sum for the next rp limb; acc1 holds the umulh half
C  of the previous v1 product until it is folded into acc0.
44define(`acc0',`r4')
45define(`acc1',`r5')
46
C  v0/v1 are the two limbs loaded from vp, kept shifted left by NAIL_BITS.
47define(`v0',`r6')
48define(`v1',`r7')
49
50C Used for temps: r8 r19 r28
C  r19 is the same register as vp.  It is zeroed right after both vp limbs
C  have been loaded and from then on carries the bits shifted out above
C  NUMB_BITS from one limb to the next.
51
C  NOTE(review): GMP_NAIL_BITS and GMP_NUMB_BITS are not defined in this
C  file; presumably they come from the included config.m4 -- confirm.
52define(`NAIL_BITS',`GMP_NAIL_BITS')
53define(`NUMB_BITS',`GMP_NUMB_BITS')
54
55C  This declaration is munged by configure
C  NOTE(review): the 3-63 argument presumably is the range of nail widths
C  this file supports -- confirm against the configure machinery.
56NAILS_SUPPORT(3-63)
57
C mpn_addmul_2 (rp, up, n, vp) -- nails build
C
C Adds the product of the n limbs at up and the two limbs at vp into the
C limbs at rp:
C   - rp[0] ... rp[n-1] are loaded, summed with the matching product column
C     and stored back ANDed with numb_mask;
C   - rp[n] is stored without being loaded first;
C   - what remains above that is left in r0 when the routine returns.
C
C n must be at least 1: up[0] is loaded and multiplied unconditionally, and
C the count is decremented before its first test against zero.
C
C v0 and v1 are shifted left by NAIL_BITS up front.  With that pre-shift,
C mulq followed by a right shift of NAIL_BITS gives the low part of a
C product and umulh gives the part above it.  NOTE(review): this reading
C assumes NAIL_BITS + NUMB_BITS equals the 64-bit register width; both
C values come from outside this file, so confirm against the configuration.
C
C Register use inside the body:
C   m0a/m0b, m1a/m1b  mulq/umulh halves of v0*ulimb and v1*ulimb
C   acc0              sum being built for the next rp limb
C   acc1              umulh half of the previous v1 product
C   r19               vp at entry, afterwards the carry above NUMB_BITS
C   r8, r28           scratch
C
C The loop is software pipelined: every pass folds the four products that
C are already available into one rp limb and starts the four products for
C the up limb it has just loaded.  No stack is used.
C
C The U0/U1/L0/L1 tags in the trailing comments are kept from the original
C author.  NOTE(review): they presumably name the ev6 issue slot each
C instruction was scheduled for; treat the instruction order as significant
C and do not reorder without re-checking the schedule.
58ASM_START()
59PROLOGUE(mpn_addmul_2)
	lda	numb_mask,-1(r31)		C numb_mask = all ones
	srl	numb_mask,NAIL_BITS,numb_mask	C clear the top NAIL_BITS bits

	ldq	v0,	0(vp)			C first multiplier limb
	ldq	v1,	8(vp)			C second multiplier limb

	bis	r31,	r31,	acc0		C	zero acc0
	sll	v0,NAIL_BITS,	v0		C pre-shift, see header
	bis	r31,	r31,	acc1		C	zero acc1
	sll	v1,NAIL_BITS,	v1		C pre-shift, see header
	bis	r31,	r31,	r19		C vp no longer needed; carry = 0

C Prime the pipeline: start all four products for up[0].
	ldq	ulimb,	0(up)
	lda	up,	8(up)			C advance up by one limb
	mulq	v0,	ulimb,	m0a		C U1
	umulh	v0,	ulimb,	m0b		C U1
	mulq	v1,	ulimb,	m1a		C U1
	umulh	v1,	ulimb,	m1b		C U1
	lda	n,	-1(n)			C count the limb just taken
	beq	n,	L(end)			C U0	n was 1: nothing left to loop over

C Main loop.  On entry to each pass m0a/m0b/m1a/m1b hold the products for
C the up limb loaded on the previous pass (or in the priming code above).
	ALIGN(16)
L(top):	bis	r31,	r31,	r31		C U1	nop
	addq	r19,	acc0,	acc0		C U0	propagate nail
	ldq	rlimb,	0(rp)			C L0
	ldq	ulimb,	0(up)			C L1

	lda	rp,	8(rp)			C L1
	srl	m0a,NAIL_BITS,	r8		C U0	low part of the v0 product
	lda	up,	8(up)			C L0
	mulq	v0,	ulimb,	m0a		C U1	m0a was consumed by the srl above

	addq	r8,	acc0,	r19		C U0	r19 = column sum so far
	addq	m0b,	acc1,	acc0		C L1	start the sum for the next limb
	umulh	v0,	ulimb,	m0b		C U1	m0b was consumed by the addq above
	bis	r31,	r31,	r31		C L0	nop

	addq	rlimb,	r19,	r19		C L1	FINAL PROD-SUM
	srl	m1a,NAIL_BITS,	r8		C U0	low part of the v1 product
	lda	n,	-1(n)			C L0
	mulq	v1,	ulimb,	m1a		C U1

	addq	r8,	acc0,	acc0		C U0	v1 low part belongs to the next limb
	bis	r31,	m1b,	acc1		C L1	acc1 = m1b, kept for the pass after next
	umulh	v1,	ulimb,	m1b		C U1
	and	r19,numb_mask,	r28		C L0	extract numb part

	unop
	srl	r19,NUMB_BITS,	r19		C U1	extract nail part
	stq	r28,	-8(rp)			C L1	rp already advanced, so -8 is the limb loaded this pass
	bne	n,	L(top)			C U0

C Drain: the same column sum as one loop pass, but with no further up limb
C to load and no new products to start.
L(end):	ldq	rlimb,	0(rp)
	addq	r19,	acc0,	acc0		C	propagate nail
	lda	rp,	8(rp)
	srl	m0a,NAIL_BITS,	r8		C U0
	addq	r8,	acc0,	r19
	addq	m0b,	acc1,	acc0
	addq	rlimb,	r19,	r19
	srl	m1a,NAIL_BITS,	r8		C U0
	addq	r8,	acc0,	acc0
	bis	r31,	m1b,	acc1
	and	r19,numb_mask,	r28		C extract limb

	srl	r19,NUMB_BITS,	r19		C extract nail
	stq	r28,	-8(rp)

C Top limb: only the pending carry and acc0 remain; nothing is loaded from
C rp for this one.
	addq	r19,	acc0,	acc0		C propagate nail
	and	acc0,numb_mask,	r28
	stq	r28,	0(rp)			C one limb past the last limb loaded from rp
	srl	acc0,NUMB_BITS,	r19
	addq	r19,	acc1,	r0		C final result left in r0

	ret	r31,	(r26),	1		C return through the address in r26
EPILOGUE()
ASM_END()
136