1dnl  PowerPC-64 mpn_addmul_1 -- Multiply a limb vector with a limb and add
2dnl  the result to a second limb vector.
3
4dnl  Copyright 1999, 2000, 2001, 2003, 2004, 2005, 2006 Free Software
5dnl  Foundation, Inc.
6
7dnl  This file is part of the GNU MP Library.
8
9dnl  The GNU MP Library is free software; you can redistribute it and/or modify
10dnl  it under the terms of the GNU Lesser General Public License as published
11dnl  by the Free Software Foundation; either version 3 of the License, or (at
12dnl  your option) any later version.
13
14dnl  The GNU MP Library is distributed in the hope that it will be useful, but
15dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
16dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
17dnl  License for more details.
18
19dnl  You should have received a copy of the GNU Lesser General Public License
20dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
21
22include(`../config.m4')
23
24C		cycles/limb
25C POWER3/PPC630:    6-18
26C POWER4/PPC970:     8
27C POWER5:            8
28
29C TODO
30C  * Reduce the number of registers used.  Some mul destination registers could
31C    be coalesced.
32C  * Delay std for preserving registers, and suppress them for n=1.
33C  * Write faster feed-in code.  If nothing else, avoid one or two up pointer updates.
34
35C INPUT PARAMETERS
36define(`rp', `r3')	C vector that is loaded, added to and stored back in place
37define(`up', `r4')	C vector whose limbs are multiplied by the single limb
38define(`n', `r5')	C limb count; r5 is reused as scratch once ctr is loaded
39define(`vl', `r6')	C multiplier limb; the code below names r6 directly
40
41ASM_START()
C ---------------------------------------------------------------------------
C mpn_addmul_1
C
C For each of the n limbs: rp[i] = rp[i] + up[i] * r6, with carries
C propagated from limb to limb.  The final carry limb is returned in r3.
C
C In:   r3 = rp, r4 = up, r5 = n, r6 = multiplier limb
C Out:  r3 = carry limb
C Uses: r0, r5, r7-r12 as scratch, ctr, cr0, cr6, CA; r26-r31 are saved on
C       entry and restored before blr.
C
C Structure:
C   * n & 3 selects a feed-in block.  b11 handles 1 limb, b00 handles 2,
C     b01 handles 1 (and returns, when n is 1) or 3, b10 handles none.  After
C     feed-in the limbs still to do are always 2 plus a multiple of 4.
C   * ctr = (n + 3) >> 2.  One count is consumed at b01 or at fic, the rest
C     by the 4-limb loop at top; end always finishes the last 2 limbs.
C   * Invariant on entry to fic, top and end: r12 holds the pending high
C     limb and CA holds the pending carry bit of the last addition into rp.
C     The instructions executed between a carry producer and its consumer
C     (ld, std, addi, mulld, mulhdu, b, bdz, bdnz) leave CA unchanged.
C
C NOTE: n = 0 takes the b00 path, which unconditionally processes two
C limbs, so this code must be entered with n >= 1.
C ---------------------------------------------------------------------------
42PROLOGUE(mpn_addmul_1)
C Save r26-r31 at negative offsets from r1; r1 itself is not adjusted.
C NOTE(review): presumably relies on the ABI keeping the area just below the
C stack pointer intact for leaf code -- confirm for every target ABI.
43	std	r31, -8(r1)
44	std	r30, -16(r1)
45	std	r29, -24(r1)
46	std	r28, -32(r1)
47	std	r27, -40(r1)
48	std	r26, -48(r1)
49
50	rldicl.	r0, n, 0,62	C r0 = n & 3, set cr0
51	cmpdi	cr6, r0, 2	C cr6 = (n & 3) compared with 2
52	addi	n, n, 3		C compute count...
53	srdi	n, n, 2		C ...for ctr
54	mtctr	n		C copy count into ctr
55	beq	cr0, L(b00)	C n & 3 is 0
56	blt	cr6, L(b01)	C n & 3 is 1 (0 was taken above)
57	beq	cr6, L(b10)	C n & 3 is 2; fall through when it is 3
58
C Feed-in for n & 3 = 3: one limb.  Leaves r12 = high limb of the product and
C CA = carry of the addition into rp.
C NOTE(review): the nop here and in b00 is presumably scheduling padding --
C confirm before removing.
59L(b11):	ld	r26, 0(up)
60	ld	r28, 0(rp)
61	addi	up, up, 8
62	nop
63	mulld	r0, r26, r6
64	mulhdu	r12, r26, r6
65	addc	r0, r0, r28
66	std	r0, 0(rp)
67	addi	rp, rp, 8
68	b	L(fic)
69
C Feed-in for n & 3 = 0: two limbs.  The first addc/addze pair folds the high
C limb of product 0 into the low limb of product 1 and finalises r12; the
C following addc/adde pair adds the two rp limbs and leaves CA pending.
70L(b00):	ld	r26, 0(up)
71	ld	r27, 8(up)
72	ld	r28, 0(rp)
73	ld	r29, 8(rp)
74	addi	up, up, 16
75	nop
76	mulld	r0, r26, r6
77	mulhdu	r5, r26, r6	C r5 (n) is free as scratch: count is in ctr
78	mulld	r7, r27, r6
79	mulhdu	r8, r27, r6
80	addc	r7, r7, r5
81	addze	r12, r8
82	addc	r0, r0, r28
83	std	r0, 0(rp)
84	adde	r7, r7, r29
85	std	r7, 8(rp)
86	addi	rp, rp, 16
87	b	L(fic)
88
C Feed-in for n & 3 = 1.  bdnz consumes one count: ctr stays nonzero only when
C n > 1.  Otherwise n is exactly 1 and the single limb is finished here.
89L(b01):	bdnz	L(gt1)
90	ld	r26, 0(up)
91	ld	r28, 0(rp)
92	mulld	r0, r26, r6
93	mulhdu	r8, r26, r6
94	addc	r0, r0, r28
95	std	r0, 0(rp)
96	b	L(ret)		C r8 plus CA becomes the return value
C n & 3 = 1 with n > 1: three limbs, same two-chain scheme as b00.
97L(gt1):	ld	r26, 0(up)
98	ld	r27, 8(up)
99	mulld	r0, r26, r6
100	mulhdu	r5, r26, r6
101	ld	r26, 16(up)
102	ld	r28, 0(rp)
103	mulld	r7, r27, r6
104	mulhdu	r8, r27, r6
105	ld	r29, 8(rp)
106	ld	r30, 16(rp)
107	mulld	r9, r26, r6
108	mulhdu	r10, r26, r6
109	addc	r7, r7, r5
110	adde	r9, r9, r8
111	addze	r12, r10
112	addc	r0, r0, r28
113	std	r0, 0(rp)
114	adde	r7, r7, r29
115	std	r7, 8(rp)
116	adde	r9, r9, r30
117	std	r9, 16(rp)
118	addi	up, up, 24
119	addi	rp, rp, 24
120	b	L(fic)
121
C Feed-in for n & 3 = 2: nothing to pre-process, just establish the invariant.
122L(b10):	addic	r0, r0, 0	C clear CA: adding zero never carries
123	li	r12, 0		C cy_limb = 0
C Common entry: preload the next two source limbs.  bdz consumes one count and
C skips the loop when only the final two limbs remain.
124L(fic):	ld	r26, 0(up)
125	ld	r27, 8(up)
126	addi	up, up, 16
127	bdz	L(end)
C Main loop, four limbs per iteration.  r26/r27 arrive preloaded, are reloaded
C mid-iteration for products 2 and 3, and reloaded again for the next
C iteration (or for end).  First carry chain (adde ... addze r12) folds each
C high limb, the incoming r12 and the incoming CA into the low limbs and
C finalises the outgoing r12.  Second chain (addc ... adde) adds the rp limbs
C and leaves CA pending for the next iteration.
128				C registers dying
129L(top):	mulld	r0, r26, r6	C
130	mulhdu	r5, r26, r6	C 26
131	ld	r26, 0(up)	C
132	ld	r28, 0(rp)	C
133	mulld	r7, r27, r6	C
134	mulhdu	r8, r27, r6	C 27
135	ld	r27, 8(up)	C
136	ld	r29, 8(rp)	C
137	adde	r0, r0, r12	C 0 12
138	adde	r7, r7, r5	C 5 7
139	mulld	r9, r26, r6	C
140	mulhdu	r10, r26, r6	C 26
141	ld	r26, 16(up)	C
142	ld	r30, 16(rp)	C
143	mulld	r11, r27, r6	C
144	mulhdu	r12, r27, r6	C 27
145	ld	r27, 24(up)	C
146	ld	r31, 24(rp)	C
147	adde	r9, r9, r8	C 8 9
148	adde	r11, r11, r10	C 10 11
149	addze	r12, r12	C 12
150	addc	r0, r0, r28	C 0 28
151	std	r0, 0(rp)	C 0
152	adde	r7, r7, r29	C 7 29
153	std	r7, 8(rp)	C 7
154	adde	r9, r9, r30	C 9 30
155	std	r9, 16(rp)	C 9
156	adde	r11, r11, r31	C 11 31
157	std	r11, 24(rp)	C 11
158	addi	up, up, 32	C
159	addi	rp, rp, 32	C
160	bdnz	L(top)		C
161
C Wind-down: the last two limbs, already held in r26/r27.
162L(end):	mulld	r0, r26, r6
163	mulhdu	r5, r26, r6
164	ld	r28, 0(rp)
165	nop
166	mulld	r7, r27, r6
167	mulhdu	r8, r27, r6
168	ld	r29, 8(rp)
169	nop
170	adde	r0, r0, r12	C add pending high limb and pending CA
171	adde	r7, r7, r5
172	addze	r8, r8		C r8 = final high limb plus carry of the chain
173	addc	r0, r0, r28
174	std	r0, 0(rp)
175	adde	r7, r7, r29
176	std	r7, 8(rp)
177L(ret):	addze	r3, r8		C return value = high limb + last CA
C Restore the registers saved on entry.
178	ld	r31, -8(r1)
179	ld	r30, -16(r1)
180	ld	r29, -24(r1)
181	ld	r28, -32(r1)
182	ld	r27, -40(r1)
183	ld	r26, -48(r1)
184	blr
185EPILOGUE()
186