1dnl  POWER9 mpn_addmul_1 and mpn_submul_1.
2
3dnl  Copyright 2018 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33C                   mpn_addmul_1    mpn_submul_1
34C                   cycles/limb     cycles/limb
35C POWER3/PPC630		 -		 -
36C POWER4/PPC970		 -		 -
37C POWER5		 -		 -
38C POWER6		 -		 -
39C POWER7		 -		 -
40C POWER8		 -		 -
41C POWER9		 2.63		 2.63
42
43C INPUT PARAMETERS
44define(`rp', `r3')	C limb vector that is loaded, combined and stored back
45define(`up', `r4')	C limb vector that is only loaded
46define(`n',  `r5')	C limb count
47define(`v0', `r6')	C single limb that every up limb is multiplied by
48
49
C Operation selection.  Exactly one of the OPERATION_* symbols is expected to
C be defined when this file is preprocessed.
C   ADDSUB    combines two limbs and sets ca (addc or subfc)
C   ADDSUBC   combines two limbs plus the incoming ca (adde or subfe)
C   func      the symbol name of the generated function
C   AM(x)     expands to x for addmul and to nothing for submul
C   SM(x)     expands to x for submul and to nothing for addmul
50ifdef(`OPERATION_addmul_1',`
51  define(`ADDSUBC',	adde)
52  define(`ADDSUB',	addc)
53  define(`func',	mpn_addmul_1)
54  define(`AM',		`$1')
55  define(`SM',		`')
56')
57ifdef(`OPERATION_submul_1',`
58  define(`ADDSUBC',	subfe)
59  define(`ADDSUB',	subfc)
60  define(`func',	mpn_submul_1)
61  define(`AM',		`')
62  define(`SM',		`$1')
63')
64
65MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
66
67ASM_START()
C func is mpn_addmul_1 or mpn_submul_1, depending on the OPERATION_* symbol.
C
C Multiplies each of the n limbs at up by the single limb v0 and adds the
C products to (addmul) or subtracts them from (submul) the n limbs at rp,
C storing the result back at rp.  The limb that propagates out of the most
C significant position is returned in r3.  The code loads up[0] and branches
C on n before any bounds check, so it assumes n >= 1.
C
C Two carry chains are kept live at the same time:
C   ov chain, via addex:    lo(up[i]*v0) + hi(up[i-1]*v0)
C   ca chain, via ADDSUBC:  that sum combined with the limb loaded from rp
C
C Register roles:
C   r8, r9    source limbs, used alternately
C   r0, r7    low product halves, used alternately
C   r5, r12   high product halves, used alternately.  r5 is also n, so n is
C             dead once r5 has been overwritten
C   r11       limb loaded from rp (up[0] during setup)
C   r10       result limb stored to rp (n >> 2 during setup)
C   ctr       n >> 2, counted down by bdnz
C   cr7       n compared with 3
C   cr0       low bit of n at first, then bit 1 of n
C
C The loop is unrolled four ways.  Depending on n mod 4 it is entered at
C L(lo0), L(lo1), L(lo2) or L(top), with rp and up biased so that the
C displacements used at the entry point address rp[0] and the next unread
C source limb.
68PROLOGUE(func)
69	cmpdi	cr7, n, 3		C cr7 = n compared with 3, for the short cases
70	srdi	r10, n, 2		C r10 = n >> 2
71	mtctr	r10			C ctr = n >> 2
72	rldicl.	r9, n, 0, 63		C cr0 = test of the low bit of n
73	ld	r11, 0(up)		C r11 = up[0]
74	bne	cr0, L(bx1)		C taken when n is odd
75
C n is even.  First product goes to r0 (low) and r5 (high); r12 = 0 stands in
C for the high half that would precede the first limb.
76L(bx0):	rldicl. r9, n, 63, 63		C cr0 = test of bit 1 of n
77AM(`	subfzeo	r12, n		')	C ov = 0, ca = 0
78AM(`	li	r12, 0		')	C r12 = 0
79SM(`	subfco	r12, r12, r12	')	C r12 = 0, ov = 0, ca = 1
80	ld	r9, 8(up)		C r9 = up[1]
81	mulld	r0, r11, v0		C r0 = low half of up[0] * v0
82	mulhdu	r5, r11, v0		C r5 = high half; n is overwritten here
83	blt	cr7, L(2)		C n < 3, which for even n >= 1 means n = 2
84	ld	r8, 16(up)		C r8 = up[2]
85	bne	cr0, L(b10)		C taken when n mod 4 = 2
86
87L(b00):	addi	rp, rp, -24		C n mod 4 = 0: make 24(rp) address rp[0]
88	b	L(lo0)
89L(b10):	addi	rp, rp, -8		C n mod 4 = 2: make 8(rp) address rp[0]
90	addi	up, up, 16		C make 8(up) address up[3]
91	b	L(lo2)
92
93L(2):	addi	rp, rp, -8		C n = 2: make 8(rp) address rp[0]
94	b	L(cj2)
95
C n is odd.  First product goes to r7 (low) and r12 (high); r5 = 0 stands in
C for the high half that would precede the first limb.
96L(bx1):	rldicl. r9, n, 63, 63		C cr0 = test of bit 1 of n
97AM(`	subfzeo	r5, n		')	C ov = 0, ca = 0
98AM(`	li	r5, 0		')	C r5 = 0; n is overwritten here
99SM(`	subfco	r5, r5, r5	')	C r5 = 0, ov = 0, ca = 1
100	blt	cr7, L(1)		C n < 3, which for odd n >= 1 means n = 1
101	ld	r8, 8(up)		C r8 = up[1]
102	mulld	r7, r11, v0		C r7 = low half of up[0] * v0
103	mulhdu	r12, r11, v0		C r12 = high half of up[0] * v0
104	ld	r9, 16(up)		C r9 = up[2]
105	bne	cr0, L(b11)		C taken when n mod 4 = 3
106
107L(b01):	addi	rp, rp, -16		C n mod 4 = 1: make 16(rp) address rp[0]
108	addi	up, up, 8		C make 16(up) address up[3]
109	b	L(lo1)
110
C n = 1: one product, one combine, no loop.
111L(1):	mulld	r7, r11, v0		C r7 = low half of up[0] * v0
112	mulhdu	r12, r11, v0		C r12 = high half of up[0] * v0
113	ld	r11, 0(rp)		C r11 = rp[0]
114	ADDSUB	r10, r7, r11		C combine low half with rp[0], setting ca
115	std	r10, 0(rp)
116AM(`	addze	r3, r12		')	C return high half + ca
117SM(`	subfe	r0, r0, r0	')	C r0 = ca - 1, so -1 after a borrow, else 0
118SM(`	sub	r3, r12, r0	')	C return high half + borrow
119	blr
120
121L(b11):	addi	up, up, 24		C n mod 4 = 3: make 0(up) address up[3]
122	ble	cr7, L(end)		C n = 3: ctr is 0, so skip the loop
123
124	ALIGN(16)
C Each of the four steps below does the same work with the register pairs
C swapped: start the next product, fold the previous high half into the
C current low half through ov, combine with the rp limb through ca, store.
125L(top):	ld	r11, 0(rp)		C r11 = current rp limb
126	mulld	r0, r8, v0		C low half of the next product
127	addex(	r7, r7, r5, 0)		C current low half + previous high half + ov
128	mulhdu	r5, r8, v0		C high half of the next product
129	ld	r8, 0(up)		C fetch the source limb used two steps on
130	ADDSUBC	r10, r7, r11		C combine with the rp limb and ca
131	std	r10, 0(rp)
132L(lo2):	ld	r11, 8(rp)		C entry point for n mod 4 = 2
133	mulld	r7, r9, v0
134	addex(	r0, r0, r12, 0)
135	mulhdu	r12, r9, v0
136	ld	r9, 8(up)
137	ADDSUBC	r10, r0, r11
138	std	r10, 8(rp)
139L(lo1):	ld	r11, 16(rp)		C entry point for n mod 4 = 1
140	mulld	r0, r8, v0
141	addex(	r7, r7, r5, 0)
142	mulhdu	r5, r8, v0
143	ld	r8, 16(up)
144	ADDSUBC	r10, r7, r11
145	std	r10, 16(rp)
146L(lo0):	ld	r11, 24(rp)		C entry point for n mod 4 = 0
147	mulld	r7, r9, v0
148	addex(	r0, r0, r12, 0)
149	mulhdu	r12, r9, v0
150	ld	r9, 24(up)
151	ADDSUBC	r10, r0, r11
152	std	r10, 24(rp)
153	addi	up, up, 32		C advance both pointers by four limbs
154	addi	rp, rp, 32
155	bdnz	L(top)			C decrement ctr, loop while it is nonzero
156
C Wind-down: the last three result limbs.  r8 and r9 already hold the final
C two source limbs, so nothing more is loaded from up.
157L(end):	ld	r11, 0(rp)
158	mulld	r0, r8, v0
159	addex(	r7, r7, r5, 0)
160	mulhdu	r5, r8, v0
161	ADDSUBC	r10, r7, r11
162	std	r10, 0(rp)
163L(cj2):	ld	r11, 8(rp)		C entry point for n = 2
164	mulld	r7, r9, v0		C low half of the last product
165	addex(	r0, r0, r12, 0)
166	mulhdu	r12, r9, v0		C high half of the last product
167	ADDSUBC	r10, r0, r11
168	std	r10, 8(rp)
169	ld	r11, 16(rp)		C most significant rp limb
170	addex(	r7, r7, r5, 0)
171	ADDSUBC	r10, r7, r11
172	std	r10, 16(rp)
173	li	r0, 0
174	addex(	r3, r12, r0, 0)		C r3 = last high half + ov
175AM(`	addze	r3, r3		')	C add ca to the return limb
176SM(`	subfe	r0, r0, r0	')	C r0 = ca - 1, so -1 after a borrow, else 0
177SM(`	sub	r3, r3, r0	')	C add the borrow to the return limb
178	blr
179EPILOGUE()
180