1dnl  ARM64 mpn_mod_34lsub1 -- remainder modulo 2^48-1.
2
3dnl  Copyright 2012-2014 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33C	     cycles/limb
34C Cortex-A53	 2
35C Cortex-A57	 1
36C X-Gene	 1.45
37
38define(`ap',	x0)
39define(`n',	x1)
40
41changecom(blah)
42
43C mp_limb_t mpn_mod_34lsub1 (mp_srcptr up, mp_size_t n)
44
45C TODO
46C  * An alternative inner loop which could run at 0.722 c/l on A57:
47C	adds	x8, x8, x2
48C	adcs	x9, x9, x3
49C	ldp	x2, x3, [ap, #-32]
50C	adcs	x10, x10, x4
51C	adc	x12, x12, xzr
52C	adds	x8, x8, x5
53C	ldp	x4, x5, [ap, #-16]
54C	sub	n, n, #6
55C	adcs	x9, x9, x6
56C	adcs	x10, x10, x7
57C	ldp	x6, x7, [ap], #48
58C	adc	x12, x12, xzr
59C	tbz	n, #63, L(top)
60
C Return a value congruent to {ap, n} modulo 2^48-1 (n >= 1 assumed, per
C the usual mpn conventions -- TODO confirm against callers).
C
C Method: since 2^64 = 2^16 (mod 2^48-1), limb j has weight 2^(16*(j mod 3)).
C The main loop therefore folds limbs three at a time into accumulators
C x2 (weight 2^0), x3 (weight 2^16) and x4 (weight 2^32), counting overflow
C carries in x8, then the accumulators are folded into a single word in x0.
ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_mod_34lsub1)
	subs	n, n, #3
	mov	x8, #0			C carry accumulator
	b.lt	L(le2)			C n <= 2

	ldp	x2, x3, [ap, #0]	C prime accumulators with limbs 0..2
	ldr	x4, [ap, #16]
	add	ap, ap, #24
	subs	n, n, #3
	b.lt	L(sum)			C n <= 5
	cmn	x0, #0			C clear carry (x0 + 0 cannot carry out)

C Main loop, 3 limbs/iteration.  Neither sub (no 's' form) nor tbz writes
C the flags, so the carry out of the last adcs feeds the next iteration's
C chain; at most one carry is pending when the loop exits.
L(top):	ldp	x5, x6, [ap, #0]
	ldr	x7, [ap, #16]
	add	ap, ap, #24
	sub	n, n, #3		C flags preserved for the adcs chain
	adcs	x2, x2, x5
	adcs	x3, x3, x6
	adcs	x4, x4, x7
	tbz	n, #63, L(top)		C loop while n >= 0 (sign bit clear)

	adc	x8, xzr, xzr		C x8 <= 1

C Tail.  Here n = (leftover limbs) - 3, i.e. -3, -2 or -1 for 0, 1 or 2
C leftover limbs.  cmn n, #2 sets C iff n >= -2 and Z iff n == -2, so
C b.lo skips both loads (0 left) and b.ls skips the second (1 left).
L(sum):	cmn	n, #2
	mov	x5, #0
	b.lo	1f
	ldr	x5, [ap], #8
1:	mov	x6, #0
	b.ls	1f
	ldr	x6, [ap], #8
1:	adds	x2, x2, x5
	adcs	x3, x3, x6
	adcs	x4, x4, xzr
	adc	x8, x8, xzr		C x8 <= 2

C Fold the accumulators mod 2^48-1: a 64-bit word w of weight 2^s
C contributes ((w << s) mod 2^48) + (w >> (48-s)), using 2^48 = 1.
C Each x8 carry is 2^192 = 1 (mod 2^48-1).  Every term is < 2^48 and
C there are at most 7 of them, so x0 cannot overflow.
L(sum2):
	and	x0, x2, #0xffffffffffff	C low 48 bits of x2 (weight 1)
	add	x0, x0, x2, lsr #48	C high 16 bits wrap around to weight 1
	add	x0, x0, x8		C carries count with weight 1

	lsl	x8, x3, #16		C x3 has weight 2^16
	and	x1, x8, #0xffffffffffff
	add	x0, x0, x1
	add	x0, x0, x3, lsr #32

	lsl	x8, x4, #32		C x4 has weight 2^32
	and	x1, x8, #0xffffffffffff
	add	x0, x0, x1
	add	x0, x0, x4, lsr #16
	ret

C Entry had n <= 2; here n = (original n) - 3, so n == -1 means 2 limbs,
C otherwise a single limb.
L(le2):	cmn	n, #1
	b.ne	L(1)
	ldp	x2, x3, [ap]		C 2 limbs: reuse the folding code above
	mov	x4, #0
	b	L(sum2)
L(1):	ldr	x2, [ap]		C 1 limb: fold it directly
	and	x0, x2, #0xffffffffffff
	add	x0, x0, x2, lsr #48
	ret
EPILOGUE()
125