1dnl  ARM mpn_com optimised for A15.
2
3dnl  Contributed to the GNU project by Torbjörn Granlund.
4
5dnl  Copyright 2013 Free Software Foundation, Inc.
6
7dnl  This file is part of the GNU MP Library.
8dnl
9dnl  The GNU MP Library is free software; you can redistribute it and/or modify
10dnl  it under the terms of either:
11dnl
12dnl    * the GNU Lesser General Public License as published by the Free
13dnl      Software Foundation; either version 3 of the License, or (at your
14dnl      option) any later version.
15dnl
16dnl  or
17dnl
18dnl    * the GNU General Public License as published by the Free Software
19dnl      Foundation; either version 2 of the License, or (at your option) any
20dnl      later version.
21dnl
22dnl  or both in parallel, as here.
23dnl
24dnl  The GNU MP Library is distributed in the hope that it will be useful, but
25dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
27dnl  for more details.
28dnl
29dnl  You should have received copies of the GNU General Public License and the
30dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
31dnl  see https://www.gnu.org/licenses/.
32
33include(`../config.m4')
34
35C            cycles/limb
36C StrongARM	 ?
37C XScale	 ?
38C Cortex-A7	 ?
39C Cortex-A8	 ?
40C Cortex-A9	2.5
41C Cortex-A15	1.0
42
43C This is great A15 core register code, but it is a bit large.
44C We use FEEDIN_VARIANT 1 to save some space, but use 8-way unrolling.
45
46C Architecture requirements:
47C v5	-
48C v5t	-
49C v5te	ldrd strd
50C v6	-
51C v6t2	-
52C v7a	-
53
C Build-time selection of the code variants assembled into mpn_com.
C FEEDIN_VARIANT picks one of three alternative feed-in sequences.  Each one
C dispatches on n mod 4 and then enters the main loop at the matching point.
C UNROLL picks the loop body: 4 processes 4 limbs per iteration, 4x2 processes
C 8 limbs per iteration and has an extra exit in the middle of the loop.
54define(`FEEDIN_VARIANT', 1)	C alternatives: 0 1 2
55define(`UNROLL', 4x2)		C alternatives: 4 4x2
56
C Symbolic names for the three registers that carry the operands of mpn_com.
57define(`rp', `r0')	C pointer the complemented limbs are stored through
58define(`up', `r1')	C pointer the source limbs are loaded from
59define(`n',  `r2')	C limb count on entry, count of 4-limb groups after feed-in
60
C mpn_com -- read 32-bit limbs through up, store the bitwise complement of
C each one through rp.
C
C Register use:
C   r0 (rp)   destination pointer, biased downwards by the feed-in code so
C             that the fixed offsets used in the loop hit the right address
C   r1 (up)   source pointer, biased in the same way on some paths
C   r2 (n)    limb count on entry; the feed-in code shifts it right by 2, so
C             from then on it counts groups of 4 limbs
C   r4,r5     limb pair most recently loaded with ldrd
C   r8,r9     complemented pair waiting to be stored with strd
C   r12       n mod 4, used only by the feed-in code, not saved
C
C The loop is software pipelined: each strd writes the pair complemented one
C step earlier, and the last pending pair is written at L(end) or L(dne).
C r4, r5, r8 and r9 are pushed on entry and popped on every return path.
C Every return is via bx r14.
61ASM_START()
62PROLOGUE(mpn_com)
63	push	{ r4-r5,r8-r9 }
64
65ifelse(FEEDIN_VARIANT,0,`
C Feed-in variant 0, assembled only when FEEDIN_VARIANT is 0.
C NOTE: when n mod 4 is 0 this variant branches to L(b00a) without testing
C the group count, so an entry value of n = 0 is not guarded against here.
66	ands	r12, n, #3	C r12 = n mod 4, Z set when it is 0
67	mov	n, n, lsr #2	C n = number of 4-limb groups, flags untouched
68	beq	L(b00a)		C n mod 4 = 0: go straight to the 4-limb entry
69	tst	r12, #1
70	beq	L(bx0)		C bit 0 clear, so n mod 4 = 2
71	ldr	r5, [up], #4	C odd count: complement a single limb first
72	mvn	r9, r5
73	str	r9, [rp], #4
74	tst	r12, #2
75	beq	L(b00)		C n mod 4 = 1: only whole groups may remain
76L(bx0):	ldrd	r4, r5, [up, #0]	C load the leftover pair
77	sub	rp, rp, #8	C bias rp so the store at rp+8 hits this pair
78	b	L(lo)
79L(b00):	tst	n, n
80	beq	L(wd1)		C no whole group left: done
81L(b00a):ldrd	r4, r5, [up], #-8	C load first pair, then bias up by -8
82	sub	rp, rp, #16	C bias rp to match the +16 offset at L(mid)
83	b	L(mid)
84')
85ifelse(FEEDIN_VARIANT,1,`
C Feed-in variant 1, assembled only when FEEDIN_VARIANT is 1.
C Bit 0 and bit 1 of n mod 4 are handled one after the other.  The group
C count is tested at L(b00) on the only path that reaches L(mid), so an
C entry value of n = 0 falls through to L(wd1) and returns.
86	and	r12, n, #3	C r12 = n mod 4
87	mov	n, n, lsr #2	C n = number of 4-limb groups
88	tst	r12, #1
89	beq	L(bx0)		C even count: no single limb to peel off
90	ldr	r5, [up], #4	C odd count: complement a single limb first
91	mvn	r9, r5
92	str	r9, [rp], #4
93L(bx0):	tst	r12, #2
94	beq	L(b00)		C no leftover pair: only whole groups may remain
95	ldrd	r4, r5, [up, #0]	C load the leftover pair
96	sub	rp, rp, #8	C bias rp so the store at rp+8 hits this pair
97	b	L(lo)
98L(b00):	tst	n, n
99	beq	L(wd1)		C no whole group left: done
100	ldrd	r4, r5, [up], #-8	C load first pair, then bias up by -8
101	sub	rp, rp, #16	C bias rp to match the +16 offset at L(mid)
102	b	L(mid)
103')
104ifelse(FEEDIN_VARIANT,2,`
C Feed-in variant 2, assembled only when FEEDIN_VARIANT is 2.
C Four-way dispatch on n mod 4 with a separate sequence for each residue.
C NOTE: when n mod 4 is 0 this variant branches to L(b00) without testing
C the group count, so an entry value of n = 0 is not guarded against here.
105	ands	r12, n, #3	C r12 = n mod 4, Z set when it is 0
106	mov	n, n, lsr #2	C n = number of 4-limb groups, flags untouched
107	beq	L(b00)		C n mod 4 = 0
108	cmp	r12, #2
109	bcc	L(b01)		C n mod 4 = 1
110	beq	L(b10)		C n mod 4 = 2
111
C n mod 4 = 3: one single limb followed by one pair.
112L(b11):	ldr	r5, [up], #4
113	mvn	r9, r5
114	ldrd	r4, r5, [up, #0]	C load the pair that follows the single limb
115	str	r9, [rp], #-4	C store the limb, then bias rp by -4 for L(lo)
116	b	L(lo)
117
118L(b00):	ldrd	r4, r5, [up], #-8	C load first pair, then bias up by -8
119	sub	rp, rp, #16	C bias rp to match the +16 offset at L(mid)
120	b	L(mid)
121
122L(b01):	ldr	r5, [up], #-4	C load the single limb, then bias up by -4
123	mvn	r9, r5
124	str	r9, [rp], #-12	C store it, then bias rp by -12 for L(mid)
125	tst	n, n
126	beq	L(wd1)		C no whole group left: done
127L(gt1):	ldrd	r4, r5, [up, #8]	C first pair of the first group
128	b	L(mid)
129
130L(b10):	ldrd	r4, r5, [up]	C load the leftover pair
131	sub	rp, rp, #8	C bias rp so the store at rp+8 hits this pair
132	b	L(lo)
133')
134	ALIGN(16)
135ifelse(UNROLL,4,`
C Loop body for UNROLL = 4: one 4-limb group per iteration.
C Entry points: L(mid) with a freshly loaded pair in r4,r5 and at least one
C group to do, L(lo) with a loaded pair that belongs to no group.
136L(top):	ldrd	r4, r5, [up, #8]	C load next source pair
137	strd	r8, r9, [rp, #8]	C store the pending complemented pair
138L(mid):	mvn	r8, r4
139	mvn	r9, r5
140	ldrd	r4, r5, [up, #16]!	C load, and advance up by 16 bytes
141	strd	r8, r9, [rp, #16]!	C store, and advance rp by 16 bytes
142	sub	n, n, #1	C one group consumed
143L(lo):	mvn	r8, r4
144	mvn	r9, r5
145	tst	n, n
146	bne	L(top)		C more groups remain
147')
148ifelse(UNROLL,4x2,`
C Loop body for UNROLL = 4x2: two 4-limb groups per full iteration.
C Entry points are the same as for UNROLL = 4.  After the first group the
C count is reduced by 2; a negative result means that group was the last
C one and the loop is left through L(dne) before the pointers are advanced.
149L(top):	ldrd	r4, r5, [up, #8]	C load next source pair
150	strd	r8, r9, [rp, #8]	C store the pending complemented pair
151L(mid):	mvn	r8, r4
152	mvn	r9, r5
153	ldrd	r4, r5, [up, #16]
154	strd	r8, r9, [rp, #16]
155	mvn	r8, r4
156	mvn	r9, r5
157	sub	n, n, #2	C a full iteration consumes two groups
158	tst	n, n		C sub does not set flags, so test n for sign
159	bmi	L(dne)		C n went negative: only one group was left
160	ldrd	r4, r5, [up, #24]
161	strd	r8, r9, [rp, #24]
162	mvn	r8, r4
163	mvn	r9, r5
164	ldrd	r4, r5, [up, #32]!	C load, and advance up by 32 bytes
165	strd	r8, r9, [rp, #32]!	C store, and advance rp by 32 bytes
166L(lo):	mvn	r8, r4
167	mvn	r9, r5
168	tst	n, n
169	bne	L(top)		C more groups remain
170')
171
C Common exit: write the last pending pair, restore registers and return.
C L(wd1) is the exit used when no pair is pending.
172L(end):	strd	r8, r9, [rp, #8]
173L(wd1):	pop	{ r4-r5,r8-r9 }
174	bx	r14
175ifelse(UNROLL,4x2,`
C Exit from the middle of the 4x2 loop.  The pointers have not been advanced
C in this iteration, so the last pending pair belongs at rp+24.
176L(dne):	strd	r8, r9, [rp, #24]
177	pop	{ r4-r5,r8-r9 }
178	bx	r14
179')
180EPILOGUE()
181