1dnl  ARM mpn_add_n/mpn_sub_n optimised for A15.
2
3dnl  Copyright 2013 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33C	     cycles/limb		best
34C StrongARM:     -
35C XScale	 ?
36C Cortex-A7	 ?
37C Cortex-A8	 ?
38C Cortex-A9	 3.55			2.5
39C Cortex-A15	 1.27			this
40
41C This was a major improvement compared to the code we had before, but it might
42C not be the best 8-way code possible.  We've tried some permutations of auto-
43C increments and separate pointer updates, but they all ran at the same speed
44C on A15.
45
46C Architecture requirements:
47C v5	-
48C v5t	-
49C v5te	ldrd strd
50C v6	-
51C v6t2	-
52C v7a	-
53
C Symbolic names for the four argument registers, by the role each plays in
C the code below:
C   rp  r0  base address the result limbs are stored through
C   up  r1  base address the first source operand is loaded through
C   vp  r2  base address the second source operand is loaded through
C   n   r3  operand length in 32-bit limbs; later reused as the loop counter
54define(`rp', `r0')
55define(`up', `r1')
56define(`vp', `r2')
57define(`n',  `r3')
58
C Per-operation macros.  Each ifdef below defines the same set of names, once
C for addition and once for subtraction.
C NOTE(review): presumably the build defines exactly one of OPERATION_add_n
C and OPERATION_sub_n; nothing in this file enforces that.
C
C   ADDSUBC   flag-setting add-with-carry (adcs) or subtract-with-carry (sbcs).
C   IFADD     expands to its argument for add and to nothing for sub; it is
C             not used in the code visible in this file.
C   SETCY     primes the C flag from the carry-in register given as argument:
C               add: cmp reg, #1 sets C exactly when reg >= 1 (unsigned);
C               sub: rsbs computes 0 - reg, so C is set (no borrow) exactly
C                    when reg is 0.  This form overwrites reg.
C   RETVAL    converts the final C flag into the return value in r0:
C               add: r0 = n + C, used on exits where the counter n is zero;
C               sub: r0 = 1 when C is clear (borrow), otherwise 0.
C   RETVAL2   for add: r0 = n + 1 + C, used on the exit where the counter n
C             is -1;  for sub: identical to RETVAL.
C   func, func_nc   names of the plain and the carry-in entry points.
59ifdef(`OPERATION_add_n', `
60  define(`ADDSUBC',	adcs)
61  define(`IFADD',	`$1')
62  define(`SETCY',	`cmp	$1, #1')
63  define(`RETVAL',	`adc	r0, n, #0')
64  define(`RETVAL2',	`adc	r0, n, #1')
65  define(`func',	mpn_add_n)
66  define(`func_nc',	mpn_add_nc)')
67ifdef(`OPERATION_sub_n', `
68  define(`ADDSUBC',	sbcs)
69  define(`IFADD',	`')
70  define(`SETCY',	`rsbs	$1, $1, #0')
71  define(`RETVAL',	`sbc	r0, r0, r0
72			and	r0, r0, #1')
73  define(`RETVAL2',	`RETVAL')
74  define(`func',	mpn_sub_n)
75  define(`func_nc',	mpn_sub_nc)')
76
C The list on the next line names the four entry points this one source file
C can produce (add and sub, each with and without carry-in).
C NOTE(review): both macros used here are defined outside this file;
C presumably the build system reads the list to learn what the file provides.
77MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
78
79ASM_START()
C func_nc: entry point that takes a carry-in (add) or borrow-in (sub).
C It loads the word at [sp] into r12 and joins func at L(ent), bypassing the
C instruction there that zeroes r12.  The load happens before the push at
C L(ent), so sp is still the value the caller left.
C NOTE(review): [sp] on entry is presumably the fifth C argument under the ARM
C procedure call standard; confirm against the C prototype.
80PROLOGUE(func_nc)
81	ldr	r12, [sp]
82	b	L(ent)
83EPILOGUE()
C func: mpn_add_n or mpn_sub_n, depending on which OPERATION_* symbol is set.
C
C In:   rp (r0), up (r1), vp (r2), n (r3), as named by the define lines
C       earlier in this file.
C Out:  r0 = final carry (add) or borrow (sub), produced by RETVAL/RETVAL2.
C Uses: r12 holds the carry-in (zero on this entry; func_nc supplies its own),
C       r4-r9 are work registers, saved with push and restored with pop.
C
C Shape: n mod 4 selects one of four lead-in sequences.  Each handles zero or
C one limb, biases the pointers so the fixed offsets in the loop line up, and
C jumps into the unrolled loop at L(mid) or L(lo).  n is replaced by n >> 2.
C A full pass from L(top) to L(lo) performs four two-limb steps (8 limbs),
C advances all three pointers by 32 bytes through the writeback forms, and
C lowers the counter by 2.  The loads for the next step are issued before the
C store of the previous result.
C
C The carry chain lives in the C flag.  Between SETCY and the final RETVAL
C only ADDSUBC changes C: the tst instructions use an unshifted register
C operand, and the loads, stores and the plain sub do not set flags.
C
C NOTE(review): n = 0 is not special-cased; it takes the L(b00) path and
C accesses memory.  Presumably callers guarantee n >= 1.
84PROLOGUE(func)
85	mov	r12, #0
86L(ent):	push	{ r4-r9 }
87
C Dispatch on n mod 4.  The mov below does not set flags, so the beq still
C tests the result of the ands.  After the cmp: lower means 1, equal means 2,
C and falling through means 3.
88	ands	r6, n, #3
89	mov	n, n, lsr #2
90	beq	L(b00)
91	cmp	r6, #2
92	bcc	L(b01)
93	beq	L(b10)
94
C n mod 4 = 3: do one limb, preload the next pair, enter at L(lo).
C up and vp end up pointing at the preloaded pair.  The post-indexed store
C leaves rp 4 bytes below its entry value, so rp + 8 is where the result of
C the preloaded pair belongs.
95L(b11):	ldr	r5, [up], #4
96	ldr	r7, [vp], #4
97	SETCY(	r12)
98	ADDSUBC	r9, r5, r7
99	ldrd	r4, r5, [up, #0]
100	ldrd	r6, r7, [vp, #0]
101	str	r9, [rp], #-4
102	b	L(lo)
103
C n mod 4 = 0: preload the first pair, enter at L(mid).
C up and vp are moved back 8 bytes and rp 16 bytes, so that the offsets of 16
C used right after L(mid) address the second source pair and the first
C result pair.
104L(b00):	ldrd	r4, r5, [up], #-8
105	ldrd	r6, r7, [vp], #-8
106	SETCY(	r12)
107	sub	rp, rp, #16
108	b	L(mid)
109
C n mod 4 = 1: do one limb.  If no group of four remains (counter is zero),
C leave through L(wd1) with nothing further to store.  Otherwise preload the
C next pair and enter at L(mid).  up and vp are now 4 bytes below their entry
C values and rp 12 bytes below, matching the offsets used at L(mid).
110L(b01):	ldr	r5, [up], #-4
111	ldr	r7, [vp], #-4
112	SETCY(	r12)
113	ADDSUBC	r9, r5, r7
114	str	r9, [rp], #-12
115	tst	n, n
116	beq	L(wd1)
C L(gt1) is reached by fall-through only; no branch in this file targets it.
117L(gt1):	ldrd	r4, r5, [up, #8]
118	ldrd	r6, r7, [vp, #8]
119	b	L(mid)
120
C n mod 4 = 2: preload the first pair, enter at L(lo).
C up and vp stay put; rp is moved back 8 bytes so rp + 8 is the first result.
121L(b10):	ldrd	r4, r5, [up]
122	ldrd	r6, r7, [vp]
123	SETCY(	r12)
124	sub	rp, rp, #8
125	b	L(lo)
126
C Main loop.  Each step: load the next source pair, store the previous
C result pair, then ADDSUBC the low and high limbs of the loaded pair.
127	ALIGN(16)
128L(top):	ldrd	r4, r5, [up, #8]
129	ldrd	r6, r7, [vp, #8]
130	strd	r8, r9, [rp, #8]
131L(mid):	ADDSUBC	r8, r4, r6
132	ADDSUBC	r9, r5, r7
133	ldrd	r4, r5, [up, #16]
134	ldrd	r6, r7, [vp, #16]
135	strd	r8, r9, [rp, #16]
136	ADDSUBC	r8, r4, r6
137	ADDSUBC	r9, r5, r7
C Two groups of four limbs are accounted for per pass.  If only one group was
C left, the counter goes negative here and the loop is left half way through
C the pass via L(dne), with the last result pair still in r8, r9.
138	sub	n, n, #2
139	tst	n, n
140	bmi	L(dne)
141	ldrd	r4, r5, [up, #24]
142	ldrd	r6, r7, [vp, #24]
143	strd	r8, r9, [rp, #24]
144	ADDSUBC	r8, r4, r6
145	ADDSUBC	r9, r5, r7
C The pre-indexed writeback forms advance all three pointers by 32 bytes.
146	ldrd	r4, r5, [up, #32]!
147	ldrd	r6, r7, [vp, #32]!
148	strd	r8, r9, [rp, #32]!
149L(lo):	ADDSUBC	r8, r4, r6
150	ADDSUBC	r9, r5, r7
151	tst	n, n
152	bne	L(top)
153
C Normal exit with the counter at zero: store the last result pair, then
C build the return value.  L(end) is reached by fall-through only.  L(wd1) is
C also the target of the early exit in L(b01), which has nothing left to
C store.
154L(end):	strd	r8, r9, [rp, #8]
155L(wd1):	RETVAL
156	pop	{ r4-r9 }
157	bx	r14
C Mid-pass exit: store the last result pair at the offset the skipped store
C would have used.  The counter is -1 here when it was 1 at L(mid); for add,
C RETVAL2 adds 1 to cancel that.
158L(dne):	strd	r8, r9, [rp, #24]
159	RETVAL2
160	pop	{ r4-r9 }
161	bx	r14
162EPILOGUE()
163