1dnl  PowerPC-32 mpn_addmul_1 -- Multiply a limb vector with a limb and add the
2dnl  result to a second limb vector.
3
4dnl  Copyright 1995, 1997, 1998, 2000, 2001, 2002, 2003, 2005 Free Software
5dnl  Foundation, Inc.
6
7dnl  This file is part of the GNU MP Library.
8
9dnl  The GNU MP Library is free software; you can redistribute it and/or modify
10dnl  it under the terms of the GNU Lesser General Public License as published
11dnl  by the Free Software Foundation; either version 3 of the License, or (at
12dnl  your option) any later version.
13
14dnl  The GNU MP Library is distributed in the hope that it will be useful, but
15dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
16dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
17dnl  License for more details.
18
19dnl  You should have received a copy of the GNU Lesser General Public License
20dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
21
22include(`../config.m4')
23
24C                cycles/limb
25C 603e:            ?
26C 604e:            6.75
27C 75x (G3):        8.7-14.3
28C 7400,7410 (G4):  8.7-14.3
29C 744x,745x (G4+): 9.5
30C power4/ppc970:   6.25
31C power5:          6.25
32
33C INPUT PARAMETERS
34C rp	r3
35C up	r4
36C n	r5
37C vl	r6
38
39C This is optimized for the PPC604.  It has not been tuned for other
40C PowerPC processors.
41C
42C Loop Analysis for the 604:
43C 12 mem insn
44C 8 serializing insn
45C 8 int multiply
46C 25 int reg write
47C 9 int ops (8 of which serialize)
48C
49C The multiply insns need 16 cycles/4limb.
50C The integer register writes will need 13 cycles/4limb.
51C All-in-all, it should be possible to get to 4 or 5 cycles/limb on PPC604,
52C but that will require some clever FPNOPS and BNOPS for exact
53C issue control.
54
55
56ASM_START()
C mpn_addmul_1 -- multiply the n-limb vector at up by the limb vl, add the
C product to the n-limb vector at rp in place, and return the carry-out limb.
C
C Entry:   r3 = rp, r4 = up, r5 = n, r6 = vl
C Return:  r3 = carry limb out of the most significant position
C Scratch: r0, r7-r12, ctr, cr0, XER[CA]; r30/r31 are used on the big path
C          only and are saved and restored there.
C
C The first limb is processed before any count test, so n >= 1 is assumed.
C
C Carry scheme shared by all loops: the high product word of limb i is kept
C in a register (the cy_limb) and is added to the low product word of limb
C i+1 with adde.  The carry of the addition into rp travels in XER[CA] from
C one limb to the next.  Only addc/adde/addze touch CA here; the loads,
C stores, multiplies, addi, andi. and the ctr branches leave it alone, which
C is why the instruction order inside the loops must not be disturbed.
PROLOGUE(mpn_addmul_1)
	cmpwi	cr0,r5,9	C more than 9 limbs?
	bgt	cr0,L(big)	C branch if more than 9 limbs

C Small operands (n <= 9): plain one-limb-per-iteration loop.
	mtctr	r5		C ctr = n
	lwz	r0,0(r4)	C r0 = up[0]
	mullw	r7,r0,r6	C r7 = low word of up[0] * vl
	mulhwu	r10,r0,r6	C r10 = high word, the running cy_limb
	lwz	r9,0(r3)	C r9 = rp[0]
	addc	r8,r7,r9	C r8 = result limb 0, CA = carry out
	addi	r3,r3,-4	C bias rp so 4(r3) addresses the pending limb
	bdz	L(end)		C n == 1: only the final store remains
L(loop):
	lwzu	r0,4(r4)	C r0 = next up limb, advance up
	stwu	r8,4(r3)	C store previous result limb, advance rp
	mullw	r8,r0,r6	C low product word
	adde	r7,r8,r10	C low + cy_limb + CA
	mulhwu	r10,r0,r6	C high product word becomes new cy_limb
	lwz	r9,4(r3)	C r9 = rp limb matching this product
	addze	r10,r10		C fold the carry of the adde into cy_limb
	addc	r8,r7,r9	C add the rp limb, CA goes to next pass
	bdnz	L(loop)
L(end):	stw	r8,4(r3)	C store the most significant result limb
	addze	r3,r10		C return cy_limb + CA
	blr

C Large operands (n > 9): first limb alone, then (n-1)/4 passes of a 4-way
C unrolled loop, then (n-1) mod 4 single-limb passes.
C
C r30 and r31 are callee-saved, so they are spilled to a real 16-byte stack
C frame.  Saving them below the stack pointer without moving r1 is unsafe
C where the ABI reserves no area under r1 (32-bit SysV): a signal handler
C could overwrite the save slots.  Offsets 8 and 12 keep clear of the back
C chain word at 0(r1) and the word at 4(r1).
L(big):	stwu	r1,-16(r1)	C allocate frame and store the back chain
	stw	r30,8(r1)	C save callee-saved r30
	stw	r31,12(r1)	C save callee-saved r31
	addi	r5,r5,-1	C r5 = n-1, limbs left after the first one
	srwi	r0,r5,2
	mtctr	r0		C ctr = (n-1)/4 unrolled passes, >= 2 here

	lwz	r7,0(r4)	C r7 = up[0]
	mullw	r8,r7,r6	C low word of up[0] * vl
	mulhwu	r0,r7,r6	C r0 = cy_limb
	lwz	r7,0(r3)	C r7 = rp[0]
	addc	r8,r8,r7	C result limb 0, CA = carry out
	stw	r8,0(r3)

C Each pass handles the four limbs at offsets 4..16 from the current
C pointers.  The last load and the last store are update forms, so both
C pointers advance by 16 bytes per pass.
L(loopU):
	lwz	r7,4(r4)
	lwz	r12,8(r4)
	lwz	r30,12(r4)
	lwzu	r31,16(r4)	C fourth up limb, advance up by 16
	mullw	r8,r7,r6	C four low product words in r8..r11
	mullw	r9,r12,r6
	mullw	r10,r30,r6
	mullw	r11,r31,r6
	adde	r8,r8,r0	C add cy_limb
	mulhwu	r0,r7,r6	C high word of limb 1
	lwz	r7,4(r3)	C up limb registers are reused for rp limbs
	adde	r9,r9,r0
	mulhwu	r0,r12,r6	C high word of limb 2
	lwz	r12,8(r3)
	adde	r10,r10,r0
	mulhwu	r0,r30,r6	C high word of limb 3
	lwz	r30,12(r3)
	adde	r11,r11,r0
	mulhwu	r0,r31,r6	C high word of limb 4
	lwz	r31,16(r3)
	addze	r0,r0		C new cy_limb
	addc	r8,r8,r7	C second chain: add the four rp limbs
	stw	r8,4(r3)
	adde	r9,r9,r12
	stw	r9,8(r3)
	adde	r10,r10,r30
	stw	r10,12(r3)
	adde	r11,r11,r31	C CA left here feeds the next adde r8
	stwu	r11,16(r3)	C fourth result limb, advance rp by 16
	bdnz	L(loopU)

	andi.	r31,r5,3	C r31 = (n-1) mod 4 leftover limbs, sets cr0
	mtctr	r31
	beq	cr0,L(endx)	C nothing left over

C Leftover limbs, one per pass, same carry scheme as above.
L(loopE):
	lwzu	r7,4(r4)	C next up limb, advance up
	mullw	r8,r7,r6
	adde	r8,r8,r0	C add cy_limb
	mulhwu	r0,r7,r6
	lwz	r7,4(r3)	C matching rp limb
	addze	r0,r0		C new cy_limb
	addc	r8,r8,r7
	stwu	r8,4(r3)	C store result limb, advance rp
	bdnz	L(loopE)
L(endx):
	addze	r3,r0		C return cy_limb + CA
	lwz	r30,8(r1)	C restore callee-saved registers
	lwz	r31,12(r1)
	addi	r1,r1,16	C release the frame
	blr
EPILOGUE(mpn_addmul_1)
146