1dnl  ARM64 mpn_bdiv_q_1, mpn_pi1_bdiv_q_1 -- Hensel division by 1-limb divisor.
2
3dnl  Contributed to the GNU project by Torbjörn Granlund.
4
5dnl  Copyright 2012, 2017 Free Software Foundation, Inc.
6
7dnl  This file is part of the GNU MP Library.
8dnl
9dnl  The GNU MP Library is free software; you can redistribute it and/or modify
10dnl  it under the terms of either:
11dnl
12dnl    * the GNU Lesser General Public License as published by the Free
13dnl      Software Foundation; either version 3 of the License, or (at your
14dnl      option) any later version.
15dnl
16dnl  or
17dnl
18dnl    * the GNU General Public License as published by the Free Software
19dnl      Foundation; either version 2 of the License, or (at your option) any
20dnl      later version.
21dnl
22dnl  or both in parallel, as here.
23dnl
24dnl  The GNU MP Library is distributed in the hope that it will be useful, but
25dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
27dnl  for more details.
28dnl
29dnl  You should have received copies of the GNU General Public License and the
30dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
31dnl  see https://www.gnu.org/licenses/.
32
33include(`../config.m4')
34
35C               cycles/limb
36C               norm   unorm
37C Cortex-A53	12	15
38C Cortex-A57	12	12
39C Cortex-A72
40C Cortex-A73
41C X-Gene	11	11
42
43C TODO
44C  * Scheduling of umulh later in the unorm loop brings A53 time to 12 c/l.
45C    Unfortunately, that requires software pipelining.
46
C Symbolic register names used by both entry points below.
C mpn_pi1_bdiv_q_1 reads rp, up, n, d, di and cnt on entry; mpn_bdiv_q_1
C computes di and cnt from d itself and then branches to mpn_pi1_bdiv_q_1.
47define(`rp',  `x0')		C	result pointer, written with post-incremented stores
48define(`up',  `x1')		C	source pointer, read with post-incremented loads
49define(`n',   `x2')		C	limb count, counted down to zero by the loops
50define(`d',   `x3')		C	divisor limb
51define(`di',  `x4')		C	just mpn_pi1_bdiv_q_1
52define(`cnt', `x5')		C	just mpn_pi1_bdiv_q_1
53
54define(`cy',  `r7')		C	NOTE(review): not referenced in this file, and r7 is not an A64 register name -- confirm whether it can be dropped
55define(`tnc', `x8')		C	negated shift count, set and used only on the unorm path
56
57ASM_START()
C mpn_bdiv_q_1 -- entry point that is given only rp, up, n and d.
C
C Derives the two extra values that mpn_pi1_bdiv_q_1 reads and then
C tail-branches to it:
C   cnt = number of trailing zero bits of d, and d is shifted right by cnt
C   di  = result of three steps  x <- 2*x - x*x*d  starting from a byte
C         fetched from binvert_limb_table
C rp, up and n are not touched here.  Scratch registers: x6, x7.
C NOTE(review): presumably di ends up as the inverse of the shifted d
C modulo 2^64, each step doubling the number of valid low bits (8, 16, 32,
C 64) -- the table is defined elsewhere, confirm against it.
C NOTE(review): d = 0 would give cnt = 64; assumed to be excluded by callers.
58PROLOGUE(mpn_bdiv_q_1)
59
60	rbit	x6, d			C bit-reverse d so that clz counts its trailing zeros
61	clz	cnt, x6			C cnt = number of trailing zero bits in d
62	lsr	d, d, cnt		C drop the trailing zeros from d
63
64	LEA_HI(	x7, binvert_limb_table)	C first half of forming the table address in x7
65	ubfx	x6, d, 1, 7		C table index = bits 1..7 of d, placed between the two LEA halves
66	LEA_LO(	x7, binvert_limb_table)	C second half of forming the table address in x7
67	ldrb	w6, [x7, x6]		C x6 = starting byte from the table
68	ubfiz	x7, x6, 1, 8		C x7 = 2*x6
69	umull	x6, w6, w6		C x6 = x6*x6
70	msub	x6, x6, d, x7		C x6 = 2*x - x*x*d  (step 1)
71	lsl	x7, x6, 1		C x7 = 2*x6
72	mul	x6, x6, x6		C x6 = x6*x6
73	msub	x6, x6, d, x7		C x6 = 2*x - x*x*d  (step 2)
74	lsl	x7, x6, 1		C x7 = 2*x6
75	mul	x6, x6, x6		C x6 = x6*x6
76	msub	di, x6, d, x7		C di = 2*x - x*x*d  (step 3)
77
78	b	GSYM_PREFIX`'mpn_pi1_bdiv_q_1	C tail-branch: d, di and cnt are now set up, rp/up/n unchanged
79EPILOGUE()
80
C mpn_pi1_bdiv_q_1 -- entry point with di and cnt already in registers.
C
C For each limb the loops compute  q = (u - h - borrow) * di,  store q through
C rp, and form  h = high word of q*d  for use with the next limb.  Here u is
C the next source limb, taken from the source shifted right by cnt bits on the
C unorm path and taken unshifted on the norm path.
C
C The borrow is kept in the C flag from one sbcs to the next, so nothing in
C between may set flags; that is why the loop counter uses sub + cbnz rather
C than subs.  On AArch64 C = 1 means no borrow.
C
C Scratch registers: x6, x7, x8, x9, x12, plus x5 on the norm path.
C NOTE(review): no guard for n = 0 is visible and the first limb is loaded
C unconditionally, so n >= 1 is presumably a caller precondition.
81PROLOGUE(mpn_pi1_bdiv_q_1)
82	sub	n, n, #1		C n = limbs remaining after the first one
83	subs	x6, x6, x6		C x6 = 0 and C flag = 1, i.e. no pending borrow
84	ldr	x9, [up],#8		C x9 = first source limb, advance up
85	cbz	cnt, L(norm)		C cnt = 0: no shifting needed, use the simpler loop
86
87L(unorm):
88	lsr	x12, x9, cnt		C x12 = first limb shifted right by cnt
89	cbz	n, L(eu1)		C only one limb: go straight to the tail
90	sub	tnc, xzr, cnt		C tnc = -cnt; register shifts use the count mod 64, so this acts as 64-cnt
91
92L(tpu):	ldr	x9, [up],#8		C x9 = next source limb, advance up
93	lsl	x7, x9, tnc		C low cnt bits of the next limb moved to the top
94	orr	x7, x7, x12		C x7 = current limb of the source shifted right by cnt
95	sbcs	x6, x7, x6		C subtract previous high word and borrow; leaves new borrow in C
96	mul	x7, x6, di		C x7 = quotient limb
97	str	x7, [rp],#8		C store quotient limb, advance rp
98	lsr	x12, x9, cnt		C keep the remaining high bits of x9 for the next limb
99	umulh	x6, x7, d		C x6 = high word of quotient limb times d
100	sub	n, n, #1		C count down without touching the C flag
101	cbnz	n, L(tpu)
102
103L(eu1):	sbcs	x6, x12, x6		C last limb: only the shifted-down bits in x12 remain
104	mul	x6, x6, di		C final quotient limb
105	str	x6, [rp]
106	ret
107
108L(norm):
109	mul	x5, x9, di		C first quotient limb; x5 is cnt, known to be zero here, reused as scratch
110	str	x5, [rp],#8		C store it, advance rp
111	cbz	n, L(en1)		C only one limb: done
112
113L(tpn):	ldr	x9, [up],#8		C x9 = next source limb, advance up
114	umulh	x5, x5, d		C x5 = high word of previous quotient limb times d
115	sbcs	x5, x9, x5		C subtract it and the borrow; leaves new borrow in C
116	mul	x5, x5, di		C x5 = quotient limb
117	str	x5, [rp],#8		C store quotient limb, advance rp
118	sub	n, n, #1		C count down without touching the C flag
119	cbnz	n, L(tpn)
120
121L(en1):	ret
122EPILOGUE()
123