1dnl  Alpha ev6 nails mpn_addmul_2.
2
3dnl  Copyright 2002, 2005, 2006 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33C Runs at 4.0 cycles/limb.
34
35C We could either go for 2-way unrolling over 11 cycles, or 2.75 c/l,
36C or 4-way unrolling over 20 cycles, for 2.5 c/l.
37
38
39C  INPUT PARAMETERS
C  rp: address of the limbs that are loaded into rlimb, summed and stored back.
C  up: address of the limbs that are loaded into ulimb, one per pass.
C  n:  pass counter, decremented once per limb of up.
C  vp: address of the two multiplier limbs, read once at entry.
40define(`rp',`r16')
41define(`up',`r17')
42define(`n',`r18')
43define(`vp',`r19')
44
45C  Useful register aliases
C  numb_mask: set at entry to -1 shifted right by NAIL_BITS.
C  ulimb / rlimb: the limb most recently loaded from up / rp.
46define(`numb_mask',`r24')
47define(`ulimb',`r25')
48define(`rlimb',`r27')
49
C  m0a / m0b: mulq / umulh results of v0 * ulimb.
C  m1a / m1b: mulq / umulh results of v1 * ulimb.
C  Note that m0a is r0, the same register that receives the final value
C  computed just before the ret.
50define(`m0a',`r0')
51define(`m0b',`r1')
52define(`m1a',`r2')
53define(`m1b',`r3')
54
C  acc0: partial sum for the next result limb.
C  acc1: umulh result of the previous v1 product, held for one more limb.
55define(`acc0',`r4')
56define(`acc1',`r5')
57
C  v0 / v1: the limbs at vp[0] / vp[1], shifted left by NAIL_BITS at entry.
58define(`v0',`r6')
59define(`v1',`r7')
60
61C Used for temps: r8 r19 r28
C r19 is also vp; it is zeroed and reused as the nail carry once v0 and v1
C have been loaded.  r8 holds a shifted mulq result, r28 the masked limb that
C is about to be stored.
62
C Short local names for the nail and numb bit counts, which are defined outside
C this file.
63define(`NAIL_BITS',`GMP_NAIL_BITS')
64define(`NUMB_BITS',`GMP_NUMB_BITS')
65
66C  This declaration is munged by configure
C  NOTE(review): presumably 3-63 is the range of nail bit counts this code is
C  declared to support -- confirm against the configure script.
67NAILS_SUPPORT(3-63)
68
69ASM_START()
C mpn_addmul_2, variant for limbs that carry nail bits.
C
C Arguments arrive in the registers aliased above as rp, up, n and vp:
C   rp  limbs rp[0..n-1] are read, summed with the products and written back;
C       rp[n] is written without being read first
C   up  n multiplicand limbs, read one per pass
C   n   limb count; it is decremented before it is first tested against zero,
C       so the code as written needs n >= 1
C   vp  two multiplier limbs, v0 = vp[0] and v1 = vp[1]
C
C Every limb stored to rp is masked with numb_mask.  The value left in r0 at
C the ret is the unmasked sum of the last nail carry and the umulh half of the
C last v1 product.  NOTE(review): r0 is presumably the integer return register
C of the Alpha calling standard -- confirm against the ABI in use.
C
C The loop is software pipelined: the four multiplies for the next up limb are
C issued while the products of the current limb are being summed.  Each of
C m0a, m0b, m1a and m1b is therefore read shortly before the multiply that
C overwrites it, so the instruction order matters for correctness as well as
C for speed.  The U0/U1/L0/L1 tags are the original scheduling annotations
C (presumably the ev6 integer issue slots).
70PROLOGUE(mpn_addmul_2)
C numb_mask = -1 >> NAIL_BITS: zeros in the top NAIL_BITS bits, ones below.
	lda	numb_mask,-1(r31)
	srl	numb_mask,NAIL_BITS,numb_mask

	ldq	v0,	0(vp)
	ldq	v1,	8(vp)

C Clear the accumulators and pre-shift both multiplier limbs by NAIL_BITS.
C NOTE(review): assuming NAIL_BITS + NUMB_BITS is the 64-bit register width,
C the pre-shift makes umulh return the product bits above the numb field and
C mulq >> NAIL_BITS the bits inside it -- confirm.
	bis	r31,	r31,	acc0		C	zero acc0
	sll	v0,NAIL_BITS,	v0
	bis	r31,	r31,	acc1		C	zero acc1
	sll	v1,NAIL_BITS,	v1
	bis	r31,	r31,	r19		C	zero nail carry (vp is dead from here)

C Prime the pipeline with the four products of up[0], then skip the loop when
C that was the only limb.
	ldq	ulimb,	0(up)
	lda	up,	8(up)
	mulq	v0,	ulimb,	m0a		C U1
	umulh	v0,	ulimb,	m0b		C U1
	mulq	v1,	ulimb,	m1a		C U1
	umulh	v1,	ulimb,	m1b		C U1
	lda	n,	-1(n)
	beq	n,	L(end)			C U0

C Main loop, one pass per remaining up limb.  On entry to each pass m0a/m0b and
C m1a/m1b hold the products of the limb loaded on the previous pass (or in the
C priming code), and r19 holds the nail carry of the limb stored last.
	ALIGN(16)
L(top):	bis	r31,	r31,	r31		C U1	nop
	addq	r19,	acc0,	acc0		C U0	propagate nail
	ldq	rlimb,	0(rp)			C L0
	ldq	ulimb,	0(up)			C L1

C r8 = mulq half of the pending v0 product moved down by NAIL_BITS; m0a is
C then free for the next v0 product.
	lda	rp,	8(rp)			C L1
	srl	m0a,NAIL_BITS,	r8		C U0
	lda	up,	8(up)			C L0
	mulq	v0,	ulimb,	m0a		C U1

C r19 = sum for the current limb so far; acc0 restarts as the sum for the next
C limb from the umulh half of the v0 product plus acc1.
	addq	r8,	acc0,	r19		C U0
	addq	m0b,	acc1,	acc0		C L1
	umulh	v0,	ulimb,	m0b		C U1
	bis	r31,	r31,	r31		C L0	nop

C Add the limb loaded from rp; r8 = mulq half of the pending v1 product moved
C down by NAIL_BITS.
	addq	rlimb,	r19,	r19		C L1	FINAL PROD-SUM
	srl	m1a,NAIL_BITS,	r8		C U0
	lda	n,	-1(n)			C L0
	mulq	v1,	ulimb,	m1a		C U1

C The v1 product feeds the next limb (acc0) and the one after it (acc1).
	addq	r8,	acc0,	acc0		C U0
	bis	r31,	m1b,	acc1		C L1
	umulh	v1,	ulimb,	m1b		C U1
	and	r19,numb_mask,	r28		C L0	extract numb part

C rp was already advanced, so -8(rp) is the limb that was loaded on this pass.
	unop
	srl	r19,NUMB_BITS,	r19		C U1	extract nail part
	stq	r28,	-8(rp)			C L1
	bne	n,	L(top)			C U0

C Drain the pipeline: the same summation as one loop pass, but with no further
C loads from up and no further multiplies.
L(end):	ldq	rlimb,	0(rp)
	addq	r19,	acc0,	acc0		C	propagate nail
	lda	rp,	8(rp)
	srl	m0a,NAIL_BITS,	r8		C U0
	addq	r8,	acc0,	r19
	addq	m0b,	acc1,	acc0
	addq	rlimb,	r19,	r19
	srl	m1a,NAIL_BITS,	r8		C U0
	addq	r8,	acc0,	acc0
	bis	r31,	m1b,	acc1
	and	r19,numb_mask,	r28		C extract limb

	srl	r19,NUMB_BITS,	r19		C extract nail
	stq	r28,	-8(rp)

C One more limb is stored past the last one that was read from rp; what is
C left above its numb field, plus acc1, becomes the value in r0.
	addq	r19,	acc0,	acc0		C propagate nail
	and	acc0,numb_mask,	r28
	stq	r28,	0(rp)
	srl	acc0,NUMB_BITS,	r19
	addq	r19,	acc1,	r0

	ret	r31,	(r26),	1
EPILOGUE()
146ASM_END()
147