dnl  ARM v6t2 mpn_divrem_1 and mpn_preinv_divrem_1.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  Copyright 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C		norm	unorm	frac
C StrongARM	 -	 -	 -
C XScale	 -	 -	 -
C Cortex-A7	 ?	 ?	 ?
C Cortex-A8	 ?	 ?	 ?
C Cortex-A9	13	14	13
C Cortex-A15	11.4	11.8	11.1

C TODO
C  * Optimise inner-loops better, they could likely run a cycle or two faster.
C  * Decrease register usage, streamline non-loop code.

C Incoming arguments.  The first four arrive in r0-r3.  The remaining ones
C are on the stack; their defines are byte offsets from sp as it is at
C function entry (the code adds 9*4 to skip the nine words it pushes).
define(`qp_arg',  `r0')
define(`fn',      `r1')
define(`up_arg',  `r2')
define(`n_arg',   `r3')
define(`d_arg',   `0')
define(`dinv_arg',`4')
define(`cnt_arg', `8')

C Working registers.  dinv lives in r0, the same register as qp_arg, so qp
C has to be derived from qp_arg before dinv is loaded or computed.  All the
C others are among the registers saved by the stmfd at function entry.
define(`n',       `r9')
define(`qp',      `r5')
define(`up',      `r6')
define(`cnt',     `r7')
define(`tnc',     `r10')
define(`dinv',    `r0')
define(`d',       `r4')

ASM_START()

C mpn_preinv_divrem_1 -- entry point for callers that supply the inverse and
C the shift count themselves.  It only sets up registers and then continues
C inside the body of mpn_divrem_1, at L(nent) or L(uent), which also holds
C the common exit code.
C
C In:	r0 = qp_arg, r1 = fn, r2 = up_arg, r3 = n_arg
C	stack at entry: [sp+0] = d, [sp+4] = dinv, [sp+8] = cnt
C Out:	r0 = remainder (produced by the shared code in mpn_divrem_1)
C
C d is taken as passed; the shared code at L(uent) shifts it left by cnt.
C NOTE(review): dinv is presumably the inverse of d << cnt, matching what
C mpn_divrem_1 computes for itself -- confirm against the callers.
PROLOGUE(mpn_preinv_divrem_1)
C	Nine words are pushed, hence the 9*4 bias on all stack argument loads.
	stmfd	sp!, {r4, r5, r6, r7, r8, r9, r10, r11, lr}
	ldr	d,    [sp, #9*4+d_arg]
	ldr	cnt,  [sp, #9*4+cnt_arg]
	str	r1, [sp, #9*4+d_arg]	C reuse d stack slot for fn
	sub	n, r3, #1		C n = n_arg - 1
	add	r3, r1, n		C r3 = fn + n_arg - 1
	cmp	d, #0			C flags stay live until the blt below
	add	qp, qp_arg, r3, lsl #2	C put qp at Q[] end
	add	up, up_arg, n, lsl #2	C put up at U[] end
C	dinv is r0, so this load must come after the last use of qp_arg.
	ldr	dinv, [sp, #9*4+dinv_arg]
	blt	L(nent)			C top bit of d set: normalised path
	b	L(uent)			C otherwise: unnormalised path
EPILOGUE()

C mpn_divrem_1 -- divide the n_arg-limb number at up_arg by the single limb
C d, then keep dividing the remainder (with zero limbs appended) to develop
C fn further quotient limbs.
C
C In:	r0 = qp_arg	quotient area; fn + n_arg limbs are written
C	r1 = fn		number of extra (fraction) quotient limbs
C	r2 = up_arg	dividend, least significant limb at the lowest address
C	r3 = n_arg	number of dividend limbs
C	stack at entry: [sp+0] = d, the divisor
C Out:	r0 = remainder
C
C Quotient limbs are stored from the highest address downwards, starting at
C qp_arg + 4*(fn + n_arg - 1).  No check is made for d = 0.
C
C Each division step has the same shape.  With r the running remainder, u
C the next dividend limb and dinv the precomputed inverse:
C	q1:q0 = (r + 1):u + r * dinv		(umlal)
C	r     = u - q1 * d			(mls, low 32 bits)
C	if r > q0:   q1 -= 1, r += d
C	if r >= d:   q1 += 1, r -= d
C
C The body is shared with mpn_preinv_divrem_1, which enters at L(nent) or
C L(uent) with d, cnt, dinv, n, qp and up already set up and fn stored in
C the stack slot of d.
PROLOGUE(mpn_divrem_1)
C	Nine words are pushed, hence the 9*4 bias on all stack argument loads.
	stmfd	sp!, {r4, r5, r6, r7, r8, r9, r10, r11, lr}
	sub	n, r3, #1		C n = n_arg - 1
	ldr	d, [sp, #9*4+d_arg]	C d
	str	r1, [sp, #9*4+d_arg]	C reuse d stack slot for fn
	add	r3, r1, n		C r3 = fn + n_arg - 1
	cmp	d, #0			C flags stay live until the blt below
	add	qp, qp_arg, r3, lsl #2	C put qp at Q[] end
	add	up, up_arg, n, lsl #2	C put up at U[] end
	blt	L(normalised)		C top bit of d set: no shifting needed

C Unnormalised divisor: shift d left until its top bit is set and shift the
C dividend by the same amount on the fly.
C NOTE(review): sp is nine words below its entry value at the bl, so it is
C not 8-byte aligned if it was at entry; confirm mpn_invert_limb does not
C depend on stack alignment.  The code relies on the callee preserving
C r4-r11 and takes the result from r0, which is dinv.
L(unnorm):
	clz	cnt, d
	mov	r0, d, lsl cnt		C pass d << cnt
	bl	mpn_invert_limb
L(uent):
	mov	d, d, lsl cnt		C d <<= cnt
	cmp	n, #0			C flags also feed the beq further down
	mov	r1, #0			C r
	blt	L(frac)			C n_arg = 0: no integer limbs at all

	ldr	r11, [up, #0]		C most significant dividend limb

	rsb	tnc, cnt, #32		C tnc = 32 - cnt
	mov	r1, r11, lsr tnc	C bits shifted out form the first r
	mov	r11, r11, lsl cnt
	beq	L(uend)			C n = 0 (from cmp above): single limb

	ldr	r3, [up, #-4]!		C next lower limb
	orr	r2, r11, r3, lsr tnc	C r2 = next shifted dividend limb
	b	L(mid)

C Main unnormalised loop.  On entry to L(mid): r1 = r, r2 = shifted limb u,
C r3 = raw limb most recently loaded.  After the umlal: r8 = q1, r2 = q0,
C r11 = u.  The second half of one step is interleaved with shifting in the
C next limb.
L(utop):
	mls	r1, d, r8, r11		C r = u - q1 * d
	mov	r11, r3, lsl cnt	C high part of the next shifted limb
	ldr	r3, [up, #-4]!
	cmp	r1, r2
	addhi	r1, r1, d		C r > q0: r += d
	subhi	r8, r8, #1		C	  q1 -= 1
	orr	r2, r11, r3, lsr tnc	C complete the next shifted limb
	cmp	r1, d
	bcs	L(ufx)			C r >= d: fix up out of line
L(uok):	str	r8, [qp], #-4
L(mid):	add	r8, r1, #1
	mov	r11, r2			C keep u, umlal overwrites r2
	umlal	r2, r8, r1, dinv	C r8:r2 = (r + 1):u + r * dinv
	subs	n, n, #1
	bne	L(utop)

C	Finish the step started at L(mid); no limb is left to load.
	mls	r1, d, r8, r11
	mov	r11, r3, lsl cnt	C last limb, zeros shifted in below
	cmp	r1, r2
	addhi	r1, r1, d
	subhi	r8, r8, #1
	cmp	r1, d
	rsbcs	r1, d, r1		C r >= d: r -= d
	addcs	r8, r8, #1		C	   q1 += 1
	str	r8, [qp], #-4

C	One complete step for the final shifted limb held in r11.
L(uend):add	r8, r1, #1
	mov	r2, r11
	umlal	r2, r8, r1, dinv
	mls	r1, d, r8, r11
	cmp	r1, r2
	addhi	r1, r1, d
	subhi	r8, r8, #1
	cmp	r1, d
	rsbcs	r1, d, r1
	addcs	r8, r8, #1
	str	r8, [qp], #-4

C Fraction limbs, shared by both paths.  r1 = r on entry.  Each step divides
C r:0, so the dividend limb is the constant zero.  r6 (up) and r8 are free
C by now and serve as q0 and as that zero limb.  Only the r > q0 adjustment
C is applied in this loop.
L(frac):
	ldr	r2, [sp, #9*4+d_arg]	C fn
	cmp	r2, #0
	beq	L(fend)

L(ftop):mov	r6, #0
	add	r3, r1, #1
	umlal	r6, r3, r1, dinv	C r3:r6 = (r + 1):0 + r * dinv
	mov	r8, #0
	mls	r1, d, r3, r8		C r = 0 - q1 * d
	cmp	r1, r6
	addhi	r1, r1, d
	subhi	r3, r3, #1
	subs	r2, r2, #1		C str leaves these flags alone for bne
	str	r3, [qp], #-4
	bne	L(ftop)

C	Undo the normalisation shift on the remainder and return it.
L(fend):mov	r11, r1, lsr cnt
L(rtn):	mov	r0, r11
	ldmfd	sp!, {r4, r5, r6, r7, r8, r9, r10, r11, pc}

C Normalised divisor (top bit set): dividend limbs are used unshifted.
C r11 holds r, r12 holds q1 and r1 holds q0 in this path.
L(normalised):
	mov	r0, d
	bl	mpn_invert_limb
L(nent):
	cmp	n, #0
	mov	r11, #0			C r
	blt	L(nend)			C n_arg = 0: no integer limbs at all

C	The top limb yields a quotient limb of 0 or 1 by plain comparison.
	ldr	r11, [up, #0]
	cmp	r11, d
	movlo	r2, #0			C hi q limb
	movhs	r2, #1			C hi q limb
	subhs	r11, r11, d

	str	r2, [qp], #-4
	cmp	n, #0			C flags were clobbered by cmp r11, d
	beq	L(nend)

L(ntop):ldr	r1, [up, #-4]!		C u, next lower limb
	add	r12, r11, #1
	umlal	r1, r12, r11, dinv	C r12:r1 = (r + 1):u + r * dinv
	ldr	r3, [up, #0]		C reload u, umlal overwrote r1
	mls	r11, d, r12, r3		C r = u - q1 * d
	cmp	r11, r1
	addhi	r11, r11, d
	subhi	r12, r12, #1
	cmp	d, r11
	bls	L(nfx)			C d <= r: fix up out of line
L(nok):	str	r12, [qp], #-4
	subs	n, n, #1
	bne	L(ntop)

C	Hand over to the fraction code: it expects r in r1, and cnt = 0 so
C	that the final remainder shift is a no-op.
L(nend):mov	r1, r11			C r
	mov	cnt, #0			C shift cnt
	b	L(frac)

C Out-of-line second adjustment (q1 += 1, r -= d) for the two main loops.
L(nfx):	add	r12, r12, #1
	rsb	r11, d, r11
	b	L(nok)
L(ufx):	rsb	r1, d, r1
	add	r8, r8, #1
	b	L(uok)
EPILOGUE()
