1dnl  PowerPC-32 mpn_addmul_1 -- Multiply a limb vector with a limb and add the
2dnl  result to a second limb vector.
3
4dnl  Copyright 1995, 1997, 1998, 2000-2003, 2005 Free Software Foundation, Inc.
5
6dnl  This file is part of the GNU MP Library.
7dnl
8dnl  The GNU MP Library is free software; you can redistribute it and/or modify
9dnl  it under the terms of either:
10dnl
11dnl    * the GNU Lesser General Public License as published by the Free
12dnl      Software Foundation; either version 3 of the License, or (at your
13dnl      option) any later version.
14dnl
15dnl  or
16dnl
17dnl    * the GNU General Public License as published by the Free Software
18dnl      Foundation; either version 2 of the License, or (at your option) any
19dnl      later version.
20dnl
21dnl  or both in parallel, as here.
22dnl
23dnl  The GNU MP Library is distributed in the hope that it will be useful, but
24dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
25dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
26dnl  for more details.
27dnl
28dnl  You should have received copies of the GNU General Public License and the
29dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
30dnl  see https://www.gnu.org/licenses/.
31
32include(`../config.m4')
33
34C                cycles/limb
35C 603e:            ?
36C 604e:            6.75
37C 75x (G3):        8.7-14.3
38C 7400,7410 (G4):  8.7-14.3
39C 744x,745x (G4+): 9.5
40C power4/ppc970:   6.25
41C power5:          6.25
42
43C INPUT PARAMETERS
44C rp	r3
45C up	r4
46C n	r5
47C vl	r6
48
49C This is optimized for the PPC604.  It has not been tuned for other
50C PowerPC processors.
51C
52C Loop Analysis for the 604:
53C 12 mem insn
54C 8 serializing insn
55C 8 int multiply
56C 25 int reg write
57C 9 int ops (8 of which serialize)
58C
59C The multiply insns need 16 cycles/4limb.
60C The integer register writes will need 13 cycles/4limb.
61C All-in-all, it should be possible to get to 4 or 5 cycles/limb on PPC604,
62C but that will require some clever FPNOPS and BNOPS for exact
63C issue control.
64
65
66ASM_START()
C mpn_addmul_1: multiply the n-limb vector at up by the limb vl, add the
C product into the n-limb vector at rp, and leave the carry-out limb in r3.
C
C On entry: r3 = rp, r4 = up, r5 = n, r6 = vl.
C
C Two paths:
C   n <= 9  one limb per iteration (loop); the store of each result limb
C           is deferred to the start of the following iteration.
C   n >  9  first limb handled up front, then a 4-limb unrolled loop
C           (loopU) followed by a one-limb cleanup loop (loopE).  This
C           path uses r30/r31 as scratch, so it builds a 16-byte stack
C           frame to save and restore them.
C
C In both paths the carry between limbs travels in XER[CA].  The loads,
C stores, multiplies and branches placed between the addc/adde/addze steps
C do not modify CA, so the relative order of the carry-using instructions
C must be preserved.
C
C NOTE(review): n = 0 is not special-cased; limb 0 is always loaded and the
C CTR-based loop would wrap -- presumably callers guarantee n >= 1.
67PROLOGUE(mpn_addmul_1)
68	cmpwi	cr0,r5,9	C more than 9 limbs?
69	bgt	cr0,L(big)	C branch if more than 9 limbs
70
C Small case (n <= 9).  Limb 0 is computed before entering the loop.
71	mtctr	r5		C CTR = n
72	lwz	r0,0(r4)	C r0 = up[0]
73	mullw	r7,r0,r6	C r7 = low word of up[0] * vl
74	mulhwu	r10,r0,r6	C r10 = high word of up[0] * vl
75	lwz	r9,0(r3)	C r9 = rp[0]
76	addc	r8,r7,r9	C r8 = low word + rp[0], CA = carry out
77	addi	r3,r3,-4	C bias rp so results are stored at 4(r3)
78	bdz	L(end)		C n == 1: only the final store remains
79L(loop):
80	lwzu	r0,4(r4)	C r0 = next up limb, r4 advanced by 4
81	stwu	r8,4(r3)	C store previous result limb, r3 advanced by 4
82	mullw	r8,r0,r6	C r8 = low word of product
83	adde	r7,r8,r10	C r7 = low word + previous high word + CA
84	mulhwu	r10,r0,r6	C r10 = high word of product
85	lwz	r9,4(r3)	C r9 = rp limb at this position
86	addze	r10,r10		C fold carry from the adde into the high word
87	addc	r8,r7,r9	C r8 = result limb, CA = carry out
88	bdnz	L(loop)
89L(end):	stw	r8,4(r3)	C store the last result limb
90	addze	r3,r10		C r3 = high word + CA = carry-out limb
91	blr
92
C Large case (n > 9).
93L(big):	stwu	r1,-16(r1)	C allocate 16-byte frame, old r1 stored at 0(r1)
94	addi	r5,r5,-1	C r5 = n - 1: limbs left after limb 0
95	stw	r30,8(r1)	C save r30 (scratch in loopU)
96	srwi	r0,r5,2		C r0 = (n - 1) / 4
97	stw	r31,12(r1)	C save r31 (scratch in loopU)
98	mtctr	r0		C CTR = number of 4-limb iterations
99
C Limb 0, done outside the unrolled loop.
100	lwz	r7,0(r4)	C r7 = up[0]
101	mullw	r8,r7,r6	C r8 = low word of up[0] * vl
102	mulhwu	r0,r7,r6	C r0 = high word = cy_limb
103	lwz	r7,0(r3)	C r7 = rp[0]
104	addc	r8,r8,r7	C r8 = low word + rp[0], CA = carry out
105	stw	r8,0(r3)	C rp[0] done
106
C Four limbs per iteration.  First carry chain (adde ... addze) sums each
C low word with the previous high word; second chain (addc ... adde) adds
C the rp limbs.  CA left by the second chain feeds the next first chain.
107L(loopU):
108	lwz	r7,4(r4)	C next four up limbs
109	lwz	r12,8(r4)
110	lwz	r30,12(r4)
111	lwzu	r31,16(r4)	C r4 advanced by 16
112	mullw	r8,r7,r6	C low words of the four products
113	mullw	r9,r12,r6
114	mullw	r10,r30,r6
115	mullw	r11,r31,r6
116	adde	r8,r8,r0	C add cy_limb
117	mulhwu	r0,r7,r6	C high word of product 0
118	lwz	r7,4(r3)	C r7 is free now: reuse for rp limb 0
119	adde	r9,r9,r0	C low 1 + high 0 + CA
120	mulhwu	r0,r12,r6	C high word of product 1
121	lwz	r12,8(r3)	C rp limb 1
122	adde	r10,r10,r0	C low 2 + high 1 + CA
123	mulhwu	r0,r30,r6	C high word of product 2
124	lwz	r30,12(r3)	C rp limb 2
125	adde	r11,r11,r0	C low 3 + high 2 + CA
126	mulhwu	r0,r31,r6	C high word of product 3
127	lwz	r31,16(r3)	C rp limb 3
128	addze	r0,r0		C new cy_limb
129	addc	r8,r8,r7	C second chain: add the rp limbs
130	stw	r8,4(r3)
131	adde	r9,r9,r12
132	stw	r9,8(r3)
133	adde	r10,r10,r30
134	stw	r10,12(r3)
135	adde	r11,r11,r31	C CA out is consumed by the next adde/addze
136	stwu	r11,16(r3)	C store and advance r3 by 16
137	bdnz	L(loopU)
138
C Cleanup: (n - 1) mod 4 limbs remain.
139	andi.	r31,r5,3	C r31 = (n - 1) mod 4, result tested via cr0
140	mtctr	r31
141	beq	cr0,L(endx)	C no leftover limbs
142
143L(loopE):
144	lwzu	r7,4(r4)	C r7 = next up limb, r4 advanced by 4
145	mullw	r8,r7,r6	C r8 = low word of product
146	adde	r8,r8,r0	C add cy_limb
147	mulhwu	r0,r7,r6	C r0 = high word of product
148	lwz	r7,4(r3)	C r7 = rp limb at this position
149	addze	r0,r0		C new cy_limb
150	addc	r8,r8,r7	C r8 = result limb, CA = carry out
151	stwu	r8,4(r3)	C store and advance r3 by 4
152	bdnz	L(loopE)
153L(endx):
154	addze	r3,r0		C r3 = cy_limb + CA = carry-out limb
155	lwz	r30,8(r1)	C restore r30
156	lwz	r31,12(r1)	C restore r31
157	addi	r1,r1,16	C release the 16-byte frame
158	blr
159EPILOGUE(mpn_addmul_1)
160