1dnl  ARM mpn_addmul_1 optimised for A15.
2
3dnl  Copyright 2012, 2013 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33C	     cycles/limb		best
34C StrongARM:     -
35C XScale	 ?
36C Cortex-A7	 ?
37C Cortex-A8	 ?
38C Cortex-A9	 6			3.25
39C Cortex-A15	 2			this
40
41C This code uses umlal for adding in the rp[] data, keeping the recurrency path
42C separate from any multiply instructions.  It performs well on A15, at umlal's
43C bandwidth.
44C
45C An A9 variant should perhaps stick to 3-way unrolling, and use ldm and stm
46C for all loads and stores.  Alternatively, it could do 2-way or 4-way, but
47C then alignment aware code will be necessary (adding O(1) bookkeeping
48C overhead).
49C
50C We don't use r12 due to ldrd and strd limitations.
51
52C Architecture requirements:
53C v5	-
54C v5t	-
55C v5te	ldrd strd
56C v6	-
57C v6t2	-
58C v7a	-
59
C Argument registers.  rp is both loaded from and stored to below, up is only
C loaded from, n is the limb count, and v0 is the multiplier operand of every
C umlal.
60define(`rp', `r0')
61define(`up', `r1')
62define(`n',  `r2')
63define(`v0', `r3')
64
C Scratch pairs filled by the ldrd loads: w0/w1 receive limbs from rp[] and
C u0/u1 receive limbs from up[].  Each pair is an even/odd register couple
C (r10/r11 and r8/r9).
65define(`w0', `r10') define(`w1', `r11')
66define(`u0', `r8')  define(`u1', `r9')
67
68ASM_START()
C mpn_addmul_1 (rp, up, n, v0)
C
C For each of the n limbs, compute rp[i] + up[i] * v0 with umlal, add the
C high word left over from the previous limb, and store the low word back to
C rp[i].  The final high word plus carry is returned in r0.
C
C The loop at L(top) handles four limbs per iteration.  The code before it
C dispatches on n mod 4 and enters the pipeline at L(top), L(mid) or L(end).
C NOTE(review): n = 0 would take the L(b00) path and access memory, so n >= 1
C is presumably a caller guarantee -- confirm.
C
C Register use inside the function:
C   r4-r7    high words of the umlal products (zeroed before each umlal),
C            then summed with the following low word by adcs
C   r8-r11   u0, u1, w0, w1
C   carry    links consecutive adcs instructions, also across loop
C            iterations.  The counter and pointer updates use plain sub/add,
C            and tst with an unshifted register operand is expected to leave
C            the carry flag unchanged.
69PROLOGUE(mpn_addmul_1)
70	push	{ r4-r11 }		C save the registers used as scratch below
71
72	ands	r6, n, #3		C r6 = n mod 4, Z set when it is 0
73	sub	n, n, #3		C bias the count; no s suffix, flags kept from ands
74	beq	L(b00)			C n mod 4 = 0
75	cmp	r6, #2
76	bcc	L(b01)			C r6 < 2, i.e. n mod 4 = 1
77	beq	L(b10)			C n mod 4 = 2
78
C n mod 4 = 3: load limb 0, step both pointers back by 4 bytes so that the
C offsets used at L(mid) line up with limbs 0..2, and enter at L(mid).
79L(b11):	mov	r6, #0
80	cmn	r13, #0			C carry clear
81	ldr	u1, [up], #-4
82	ldr	w1, [rp], #-4
83	mov	r7, #0
84	b	L(mid)
85
C n mod 4 = 0: finish the low word of limb 0 here, keep its high word in r6,
C and enter at L(mid) with limb 1 loaded in u1/w1 but not yet multiplied.
86L(b00):	ldrd	u0, u1, [up]
87	ldrd	w0, w1, [rp]
88	mov	r6, #0
89	umlal	w0, r6, u0, v0
90	cmn	r13, #0			C carry clear
91	mov	r7, #0
92	str	w0, [rp]
93	b	L(mid)
94
C n mod 4 = 2: finish the low word of limb 0, advance both pointers by two
C limbs, and leave limb 1 pending with r4 = high word of limb 0 and w1:r5 =
C product for limb 1.  Go straight to L(end) when the biased count is
C negative, i.e. no further limbs remain.
95L(b10):	ldrd	u0, u1, [up], #8
96	ldrd	w0, w1, [rp]
97	mov	r4, #0
98	umlal	w0, r4, u0, v0
99	cmn	r13, #0			C carry clear
100	mov	r5, #0
101	str	w0, [rp], #8
102	umlal	w1, r5, u1, v0
103	tst	n, n
104	bmi	L(end)
105	b	L(top)
106
C n mod 4 = 1: start limb 0 with a zero carry-in word in r4.  No cmn is used
C here: this label is reached through bcc, so the carry flag is already
C clear.
107L(b01):	mov	r4, #0
108	ldr	u1, [up], #4
109	ldr	w1, [rp], #4
110	mov	r5, #0
111	umlal	w1, r5, u1, v0
112	tst	n, n
113	bmi	L(end)
114
C Main loop, four limbs per iteration.  On entry at L(top), r4 holds the
C pending high word and w1:r5 the newest product.  L(mid) is the second half,
C entered with r6 holding the pending high word, r7 = 0, and the next limb
C loaded in u1/w1 but not yet multiplied.
115	ALIGN(16)
116L(top):	ldrd	u0, u1, [up, #0]
117	adcs	r4, r4, w1		C finish the pending limb
118	ldrd	w0, w1, [rp, #0]
119	mov	r6, #0
120	umlal	w0, r6, u0, v0		C 1 2
121	adcs	r5, r5, w0
122	mov	r7, #0
123	strd	r4, r5, [rp, #-4]	C store two finished limbs
124L(mid):	umlal	w1, r7, u1, v0		C 2 3
125	ldrd	u0, u1, [up, #8]
126	adcs	r6, r6, w1
127	ldrd	w0, w1, [rp, #8]
128	mov	r4, #0
129	umlal	w0, r4, u0, v0		C 3 4
130	adcs	r7, r7, w0
131	mov	r5, #0
132	strd	r6, r7, [rp, #4]	C store two more finished limbs
133	umlal	w1, r5, u1, v0		C 0 1
134	sub	n, n, #4		C four limbs consumed
135	add	up, up, #16
136	add	rp, rp, #16
137	tst	n, n
138	bpl	L(top)			C loop while the biased count is >= 0
139
C Tail: finish the last pending limb and return the final high word plus
C carry.
140L(end):	adcs	r4, r4, w1
141	str	r4, [rp, #-4]		C last limb; rp points one limb past it
142	adc	r0, r5, #0		C return value
143	pop	{ r4-r11 }
144	bx	r14
145EPILOGUE()
146