1dnl  PowerPC-64 mpn_addmul_1 and mpn_submul_1 optimised for power6.
2
3dnl  Copyright 1999-2001, 2003-2006, 2008, 2010, 2011 Free Software Foundation,
4dnl  Inc.
5
6dnl  This file is part of the GNU MP Library.
7dnl
8dnl  The GNU MP Library is free software; you can redistribute it and/or modify
9dnl  it under the terms of either:
10dnl
11dnl    * the GNU Lesser General Public License as published by the Free
12dnl      Software Foundation; either version 3 of the License, or (at your
13dnl      option) any later version.
14dnl
15dnl  or
16dnl
17dnl    * the GNU General Public License as published by the Free Software
18dnl      Foundation; either version 2 of the License, or (at your option) any
19dnl      later version.
20dnl
21dnl  or both in parallel, as here.
22dnl
23dnl  The GNU MP Library is distributed in the hope that it will be useful, but
24dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
25dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
26dnl  for more details.
27dnl
28dnl  You should have received copies of the GNU General Public License and the
29dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
30dnl  see https://www.gnu.org/licenses/.
31
32include(`../config.m4')
33
34C               mpn_addmul_1    mpn_submul_1
35C               cycles/limb     cycles/limb
36C POWER3/PPC630     ?               ?
37C POWER4/PPC970     ?               ?
38C POWER5            ?               ?
39C POWER6           12.25           12.8
40C POWER7            ?               ?
41
42C TODO
43C  * Reduce register usage.
44C  * Schedule function entry code.
45C  * Unroll more.  8-way unrolling would bring us to 10 c/l, 16-way unrolling
46C    would bring us to 9 c/l.
47C  * Handle n = 1 and perhaps n = 2 separately, without saving any registers.
48
49C INPUT PARAMETERS
C Each name below is an m4 alias for the register that holds that argument.
50define(`rp',  `r3')	C limb vector that is loaded, updated and stored back
51define(`up',  `r4')	C limb vector that is only loaded and multiplied
52define(`n',   `r5')	C limb count; r5 is reused as scratch once ctr is set
53define(`v0',  `r6')	C single-limb multiplier used by every mulld/mulhdu
54
C Macros selected when OPERATION_addmul_1 is defined:
C   ADDSUB     addc: add two limbs, starting a new carry chain
C   ADDSUBC    adde: add two limbs plus the carry flag, continuing the chain
C   func       name given to PROLOGUE
C   AM(x)      expands to x (addmul-only code); SM(x) expands to nothing
C   CLRRSC(r)  addic r, r0, 0: copies r0 into r and clears the carry flag; its
C              only use is on the path where r0 = n & 3 is zero, so r becomes 0
55ifdef(`OPERATION_addmul_1',`
56  define(ADDSUBC,	adde)
57  define(ADDSUB,	addc)
58  define(func,		mpn_addmul_1)
59  define(func_nc,	mpn_addmul_1c)	C FIXME: not really supported
60  define(AM,		`$1')
61  define(SM,		`')
62  define(CLRRSC,	`addic	$1, r0, 0')
63')
C Macros selected when OPERATION_submul_1 is defined:
C   ADDSUB     subfc: third operand minus second operand, starting a new chain;
C              the carry flag ends up 1 when there was no borrow
C   ADDSUBC    subfe: the same, continuing the chain through the carry flag
C   func       name given to PROLOGUE
C   SM(x)      expands to x (submul-only code); AM(x) expands to nothing
C   CLRRSC(r)  subfc r, r0, r0: sets r to zero and sets the carry flag, which
C              in the subtract convention means no borrow
64ifdef(`OPERATION_submul_1',`
65  define(ADDSUBC,	subfe)
66  define(ADDSUB,	subfc)
67  define(func,		mpn_submul_1)
68  define(func_nc,	mpn_submul_1c)	C FIXME: not really supported
69  define(AM,		`')
70  define(SM,		`$1')
71  define(CLRRSC,	`subfc	$1, r0, r0')
72')
73
C NOTE(review): MULFUNC_PROLOGUE and ASM_START are defined outside this file;
C presumably the former lists the functions this source can be built as.
C Confirm against the m4 support files.
74MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
75
76ASM_START()
C func(rp, up, n, v0)
C
C For each of the n limbs: multiply the limb at up by v0, then add the product
C to (addmul_1) or subtract it from (submul_1) the limb at rp and store the
C result back at rp.  The high product limb and the carry or borrow are
C propagated into the next limb.  r3 returns the last high product limb plus
C the final carry or borrow.
C
C n = 0 is not handled: ctr would be loaded with 0 and the bdnz at the bottom
C of the loop would not terminate as intended.  Presumably callers guarantee
C n >= 1 -- confirm against the documented contract.
C
C Register use:
C   r10, r8, r7, r27   limbs loaded from up, then their high product limbs
C   r0, r5, r9, r11    low product limbs, then the values stored to rp
C   r28, r29, r30, r31 limbs loaded from rp
C   r12                limb carried from one pass into the next, and finally
C                      the basis of the return value
C   ctr                number of passes through the store sequence
C r5 is also the n argument; it is free for reuse once mtctr has copied the
C loop count.
77PROLOGUE(func)
C Save r27-r31 below the stack pointer r1; no stack frame is allocated.
78	std	r31, -8(r1)
79	std	r30, -16(r1)
80	std	r29, -24(r1)
81	std	r28, -32(r1)
82	std	r27, -40(r1)
83
C Dispatch on n mod 4.  ctr = (n + 3) / 4 passes are made; when n mod 4 is
C 3, 2 or 1 the first pass is a partial one that is set up below and enters
C the loop body at L(l3), L(l2) or L(l1).
84	rldicl.	r0, n, 0,62	C r0 = n & 3, set cr0
85	cmpdi	cr6, r0, 2
86	addi	n, n, 3		C compute count...
87	srdi	n, n, 2		C ...for ctr
88	mtctr	n		C copy loop count into ctr
89	beq	cr0, L(b0)
90	blt	cr6, L(b1)
91	beq	cr6, L(b2)
92
C n mod 4 = 3: process the first three limbs, then join the loop at L(l3).
C The pointers advance by two limbs so that offsets -16, -8 and 0 from rp
C address these three limbs and 8(up) is the next unread limb.
93L(b3):	ld	r8, 0(up)
94	ld	r7, 8(up)
95	ld	r27, 16(up)
96	addi	up, up, 16
97	addi	rp, rp, 16
98	mulld	r5,  r8, v0
99	mulhdu	r8,  r8, v0
100	mulld	r9,  r7, v0
101	mulhdu	r7,  r7, v0
102	mulld	r11, r27, v0
103	mulhdu	r27, r27, v0
104	ld	r29, -16(rp)
105	ld	r30, -8(rp)
106	ld	r31, 0(rp)
C Fold each high product limb into the next low product limb.
107	addc	r9, r9, r8
108	adde	r11, r11, r7
109	addze	r12, r27
C Start the add/subtract chain against the rp limbs; L(l3) continues it.
110	ADDSUB	r5, r5, r29
111	b	L(l3)
112
C n mod 4 = 2: process the first two limbs, then join the loop at L(l2).
113L(b2):	ld	r7, 0(up)
114	ld	r27, 8(up)
115	addi	up, up, 8
116	addi	rp, rp, 8
117	mulld	r9,  r7, v0
118	mulhdu	r7,  r7, v0
119	mulld	r11, r27, v0
120	mulhdu	r27, r27, v0
121	ld	r30, -8(rp)
122	ld	r31, 0(rp)
123	addc	r11, r11, r7
124	addze	r12, r27
125	ADDSUB	r9, r9, r30
126	b	L(l2)
127
C n mod 4 = 1: process the first limb, then join the loop at L(l1).  The
C pointers are left unchanged, so 8(up) is already the next unread limb.
128L(b1):	ld	r27, 0(up)
129	ld	r31, 0(rp)
130	mulld	r11, r27, v0
131	mulhdu	r12, r27, v0
132	ADDSUB	r11, r11, r31
133	b	L(l1)
134
C n mod 4 = 0: step both pointers back one limb so that the loop offsets
C (8..32 from up, -24..0 from rp after the increment) address the first four
C limbs, and start with a zero carried limb and no carry or borrow.
135L(b0):	addi	up, up, -8
136	addi	rp, rp, -8
137	CLRRSC(	r12)		C clear r12 and clr/set cy
138
139	ALIGN(32)
C Main loop: four limbs per pass.
140L(top):
C submul_1 only: the subfe chain leaves the carry flag at 1 when there was no
C borrow.  These two instructions invert the flag so that it is 1 exactly
C when there was a borrow, letting the adde below add the borrow into the
C product.  The value left in r11 is not used.
141SM(`	subfe	r11, r0, r0')	C complement...
142SM(`	addic	r11, r11, 1')	C ...carry flag
143	ld	r10, 8(up)
144	ld	r8, 16(up)
145	ld	r7, 24(up)
146	ld	r27, 32(up)
147	addi	up, up, 32
148	addi	rp, rp, 32
149	mulld	r0,  r10, v0
150	mulhdu	r10, r10, v0
151	mulld	r5,  r8, v0
152	mulhdu	r8,  r8, v0
153	mulld	r9,  r7, v0
154	mulhdu	r7,  r7, v0
155	mulld	r11, r27, v0
156	mulhdu	r27, r27, v0
C First chain: add the limb carried from the previous pass (r12) and the
C pending carry or borrow, then fold each high limb into the next low limb.
C The rp loads are interleaved with this chain.
157	ld	r28, -24(rp)
158	adde	r0, r0, r12
159	ld	r29, -16(rp)
160	adde	r5, r5, r10
161	ld	r30, -8(rp)
162	ld	r31, 0(rp)
163	adde	r9, r9, r8
164	adde	r11, r11, r7
165	addze	r12, r27
C Second chain: add to or subtract from the rp limbs and store the results.
C The partial first pass enters at one of the labels below with the chain
C already started.
166	ADDSUB	r0, r0, r28
167	std	r0, -24(rp)
168	ADDSUBC	r5, r5, r29
169L(l3):	std	r5, -16(rp)
170	ADDSUBC	r9, r9, r30
171L(l2):	std	r9, -8(rp)
172	ADDSUBC	r11, r11, r31
173L(l1):	std	r11, 0(rp)
174	bdnz	L(top)
175
C Return value in r3: r12 plus the final carry (addmul_1) or plus the final
C borrow (submul_1).  The register restores are interleaved with it.
176AM(`	addze	r3, r12')
177SM(`	subfe	r11, r0, r0')		C r11 = carry flag - 1: 0, or -1 on borrow
178	ld	r31, -8(r1)
179SM(`	subf	r3, r11, r12')	C r3 = r12 - r11
180	ld	r30, -16(r1)
181	ld	r29, -24(r1)
182	ld	r28, -32(r1)
183	ld	r27, -40(r1)
184	blr
185EPILOGUE()
186