1dnl  PowerPC-32 mpn_submul_1 -- Multiply a limb vector with a limb and subtract
2dnl  the result from a second limb vector.
3
4dnl  Copyright 1995, 1997, 1998, 2000, 2002, 2005 Free Software Foundation,
5dnl  Inc.
6
7dnl  This file is part of the GNU MP Library.
8
9dnl  The GNU MP Library is free software; you can redistribute it and/or modify
10dnl  it under the terms of the GNU Lesser General Public License as published
11dnl  by the Free Software Foundation; either version 3 of the License, or (at
12dnl  your option) any later version.
13
14dnl  The GNU MP Library is distributed in the hope that it will be useful, but
15dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
16dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
17dnl  License for more details.
18
19dnl  You should have received a copy of the GNU Lesser General Public License
20dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
21
22include(`../config.m4')
23
24C                cycles/limb
25C 603e:            ?
26C 604e:            7.5
27C 75x (G3):        9.3-15
28C 7400,7410 (G4):  9.3-15
29C 744x,745x (G4+): 10.5
30C power4/ppc970:   6.75
31C power5:          6.5
32
33C INPUT PARAMETERS
34C rp	r3
35C up	r4
36C n	r5
37C vl	r6
38
39C This is optimized for the PPC604.  See addmul_1.asm for additional comments.
40
ASM_START()
C mp_limb_t mpn_submul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t vl)
C
C Computes rp[0..n-1] -= up[0..n-1] * vl and returns the high limb
C (product high limb plus propagated borrow) in r3.
C
C Two code paths:
C   * n <= 9:  compact one-limb-per-iteration loop (no regs saved).
C   * n >  9:  peel the first limb, then a 4-way unrolled loop, then a
C              cleanup loop for the remaining (n-1) mod 4 limbs.  This
C              path clobbers r30/r31, saved with stmw/lmw on the stack.
C
C Carry discipline: subfc/subfe set CA = NOT borrow (PowerPC convention).
C Each iteration re-inverts CA so that CA = borrow, letting the adde
C chain fold the borrow into the next product sum (rp - up*vl - borrow).
PROLOGUE(mpn_submul_1)
	cmpwi	cr0,r5,9	C more than 9 limbs?
	bgt	cr0,L(big)	C branch if more than 9 limbs

C Small-n path: one limb per iteration.
	mtctr	r5		C ctr = n loop iterations
	lwz	r0,0(r4)	C load up[0]
	mullw	r7,r0,r6	C low(up[0] * vl)
	mulhwu	r10,r0,r6	C high(up[0] * vl) = cy_limb
	lwz	r9,0(r3)	C load rp[0]
	subfc	r8,r7,r9	C r8 = rp[0] - low; CA = not borrow
	addc	r7,r7,r8	C invert cy (r7 is junk)
	addi	r3,r3,-4	C bias rp for the stwu in the loop
	bdz	L(end)
L(loop):
	lwzu	r0,4(r4)	C load next up limb
	stwu	r8,4(r3)	C store previous difference limb
	mullw	r8,r0,r6	C low product
	adde	r7,r8,r10	C r7 = low + cy_limb + borrow
	mulhwu	r10,r0,r6	C high product = next cy_limb
	lwz	r9,4(r3)	C load next rp limb
	addze	r10,r10		C fold carry of the adde into cy_limb
	subfc	r8,r7,r9	C r8 = rp - r7; CA = not borrow
	addc	r7,r7,r8	C invert cy (r7 is junk)
	bdnz	L(loop)
L(end):	stw	r8,4(r3)	C store final difference limb
	addze	r3,r10		C return cy_limb + final borrow
	blr

C Large-n path: 4-way unrolled, uses r30/r31 as extra scratch.
L(big):	stmw	r30,-32(r1)	C save callee-saved r30,r31
	addi	r5,r5,-1	C n-1: first limb is peeled off below
	srwi	r0,r5,2		C (n-1)/4 unrolled iterations
	mtctr	r0

C Peel the first limb to prime cy_limb (r0) and the borrow in CA.
	lwz	r7,0(r4)	C load up[0]
	mullw	r8,r7,r6	C low(up[0] * vl)
	mulhwu	r0,r7,r6	C high(up[0] * vl) = cy_limb
	lwz	r7,0(r3)	C load rp[0]
	subfc	r7,r8,r7	C r7 = rp[0] - low; CA = not borrow
	addc	r8,r8,r7	C invert cy (r8 is junk)
	stw	r7,0(r3)	C store rp[0]

L(loopU):
C Load 4 up limbs, form 4 products, add cy_limb/carries through the
C product chain, then subtract all 4 from rp with a subfc/subfe chain.
	lwz	r7,4(r4)
	lwz	r12,8(r4)
	lwz	r30,12(r4)
	lwzu	r31,16(r4)	C up pointer advances 16 bytes/iteration
	mullw	r8,r7,r6	C 4 low products in r8,r9,r10,r11
	mullw	r9,r12,r6
	mullw	r10,r30,r6
	mullw	r11,r31,r6
	adde	r8,r8,r0	C add cy_limb
	mulhwu	r0,r7,r6
	lwz	r7,4(r3)	C rp limbs reload into the freed up regs
	adde	r9,r9,r0
	mulhwu	r0,r12,r6
	lwz	r12,8(r3)
	adde	r10,r10,r0
	mulhwu	r0,r30,r6
	lwz	r30,12(r3)
	adde	r11,r11,r0
	mulhwu	r0,r31,r6
	lwz	r31,16(r3)
	addze	r0,r0		C new cy_limb
	subfc	r7,r8,r7	C rp[i] - product, borrow rippling via CA
	stw	r7,4(r3)
	subfe	r12,r9,r12
	stw	r12,8(r3)
	subfe	r30,r10,r30
	stw	r30,12(r3)
	subfe	r31,r11,r31
	stwu	r31,16(r3)	C rp pointer advances 16 bytes/iteration
	subfe	r11,r11,r11	C invert ...  (r11 = CA - 1: 0 or -1)
	addic	r11,r11,1	C ... carry   (CA = borrow for next adde)
	bdnz	L(loopU)

	andi.	r31,r5,3	C (n-1) mod 4 limbs left over
	mtctr	r31
	beq	cr0,L(endx)	C none left: skip cleanup loop

L(loopE):
C Cleanup loop: same scheme as the small-n loop, one limb per pass.
	lwzu	r7,4(r4)	C load next up limb
	mullw	r8,r7,r6	C low product
	adde	r8,r8,r0	C add cy_limb
	mulhwu	r0,r7,r6	C high product
	lwz	r7,4(r3)	C load next rp limb
	addze	r0,r0		C new cy_limb
	subfc	r7,r8,r7	C r7 = rp - r8; CA = not borrow
	addc	r8,r8,r7	C invert cy (r8 is junk)
	stwu	r7,4(r3)	C store difference limb
	bdnz	L(loopE)
L(endx):
	addze	r3,r0		C return cy_limb + final borrow
	lmw	r30,-32(r1)	C restore r30,r31
	blr
EPILOGUE(mpn_submul_1)
137