dnl  mod_34lsub1.asm revision 1.1.1.1
dnl  ARM64 mpn_mod_34lsub1 -- remainder modulo 2^48-1.

dnl  Copyright 2012-2014 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
include(`../config.m4')

C	     cycles/limb
C Cortex-A53	 ?
C Cortex-A57	 ?

define(`ap',	x0)
define(`n',	x1)

changecom(@&*$)

C mp_limb_t mpn_mod_34lsub1 (mp_srcptr up, mp_size_t n)
C
C Sum the limbs of {up,n} in three 64-bit accumulators x2,x3,x4 (limb
C index mod 3 selects the accumulator), counting carry-outs in x8, then
C fold everything into a value congruent to {up,n} mod 2^48-1.  Each
C accumulator position is congruent to a distinct 16-bit rotation within
C the 48-bit residue, since 2^64 == 2^16 (mod 2^48-1).
C NOTE(review): callers are assumed to pass n >= 1, as elsewhere in GMP
C mpn code -- n == 0 would fall into the single-limb path and read *ap.

C TODO
C  * An alternative inner loop which could run at 0.722 c/l:
C	adds	x8, x8, x2
C	adcs	x9, x9, x3
C	ldp	x2, x3, [ap, #-32]
C	adcs	x10, x10, x4
C	adc	x12, x12, xzr
C	adds	x8, x8, x5
C	ldp	x4, x5, [ap, #-16]
C	sub	n, n, #6
C	adcs	x9, x9, x6
C	adcs	x10, x10, x7
C	ldp	x6, x7, [ap], #48
C	adc	x12, x12, xzr
C	tbz	n, #63, L(top)

ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_mod_34lsub1)
	subs	n, n, #3
	mov	x8, #0			C carry accumulator
	b.lt	L(le2)			C n <= 2

	ldp	x2, x3, [ap, #0]	C prime the 3 accumulators with the
	ldr	x4, [ap, #16]		C first limb triple
	add	ap, ap, #24
	subs	n, n, #3
	b.lt	L(sum)			C n <= 5
	cmn	x0, #0			C clear carry before the adc chain

L(top):	ldp	x5, x6, [ap, #0]
	ldr	x7, [ap, #16]
	add	ap, ap, #24
	sub	n, n, #3		C sub, not subs: must preserve carry
	adcs	x2, x2, x5		C carry flag chains across iterations
	adcs	x3, x3, x6
	adcs	x4, x4, x7
	tbz	n, #63, L(top)		C loop while n >= 0 (sign bit clear)

	adc	x8, xzr, xzr		C x8 <= 1

C Here n is -3, -2 or -1, i.e. 0, 1 or 2 straggler limbs remain.
L(sum):	cmn	n, #2			C flags of n + 2
	mov	x5, #0
	b.lo	1f			C no stragglers (n == -3)
	ldr	x5, [ap], #8
1:	mov	x6, #0
	b.ls	1f			C just one straggler (n == -2)
	ldr	x6, [ap], #8
1:	adds	x2, x2, x5
	adcs	x3, x3, x6
	adcs	x4, x4, xzr
	adc	x8, x8, xzr		C x8 <= 2

C Fold the accumulators into a 48+epsilon-bit result:
C   x2 contributes its low 48 bits plus its high 16 bits,
C   x3 is rotated 16 bits left within the 48-bit residue,
C   x4 is rotated 32 bits left within the 48-bit residue.
L(sum2):
	and	x0, x2, #0xffffffffffff
	add	x0, x0, x2, lsr #48
	add	x0, x0, x8

	lsl	x8, x3, #16
	and	x1, x8, #0xffffffffffff
	add	x0, x0, x1
	add	x0, x0, x3, lsr #32

	lsl	x8, x4, #32
	and	x1, x8, #0xffffffffffff
	add	x0, x0, x1
	add	x0, x0, x4, lsr #16
	ret

C Small operands: here n (biased by -3) is -2 or -1, i.e. 1 or 2 limbs.
L(le2):	cmn	n, #1
	b.ne	L(1)
	ldp	x2, x3, [ap]		C n == 2: two limbs, x4 = 0
	mov	x4, #0
	b	L(sum2)
L(1):	ldr	x2, [ap]		C n == 1: fold the single limb
	and	x0, x2, #0xffffffffffff
	add	x0, x0, x2, lsr #48
	ret
EPILOGUE()