rshift.asm revision 1.1.1.2
1dnl  ARM64 mpn_rshift.
2
3dnl  Copyright 2013, 2014, 2017 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of the GNU Lesser General Public License as published
9dnl  by the Free Software Foundation; either version 3 of the License, or (at
10dnl  your option) any later version.
11
12dnl  The GNU MP Library is distributed in the hope that it will be useful, but
13dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
15dnl  License for more details.
16
17dnl  You should have received a copy of the GNU Lesser General Public License
18dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
20include(`../config.m4')
21
22C	     cycles/limb   assumed optimal c/l
23C Cortex-A53	3.5-4.0		 3.25
24C Cortex-A57	 2.0		 2.0
25C X-Gene	 2.67		 2.5
26
27C TODO
28C  * The feed-in code used 1 ldr for odd sized and 2 ldr for even sizes.  These
29C    numbers should be 1 and 0, respectively.  The str in wind-down should also
30C    go.
31C  * Using extr and with 63 separate loops we might reach 1.25 c/l on A57.
32C  * A53's speed depends on alignment, but not as simply as for lshift/lshiftc.
33
34changecom(blah)
35
C Incoming arguments, in the registers they occupy on entry.
define(`rp_arg', `x0')    C destination pointer as passed in
define(`up',     `x1')    C source pointer, advanced by the write-back loads
define(`n',      `x2')    C limb count; x2 is reused as scratch after dispatch
define(`cnt',    `x3')    C shift count in bits

C Working registers.
define(`rp',     `x16')   C copy of rp_arg, since x0 is overwritten with the result
define(`tnc',    `x8')    C negated shift count, set to 0 - cnt at entry

C Shift mnemonics.  PSHIFT is the direction of the operation itself, NSHIFT
C the opposite direction, used to bring in the bits from the neighbouring limb.
define(`PSHIFT', `lsr')
define(`NSHIFT', `lsl')
47
48ASM_START()
C mpn_rshift -- shift the n-limb operand at up right by cnt bits, store the
C n-limb result at rp, and return the bits shifted out.
C
C In:    x0 = rp_arg, x1 = up, x2 = n, x3 = cnt
C Out:   x0 = up[0] << (64 - cnt), that is the bits shifted out at the low
C        end, left-justified in the returned limb.
C Uses:  x2, x4-x7, x8 (tnc), x10-x13, x16 (rp), x17 (block counter).  All of
C        these are caller-saved under AAPCS64, so nothing is spilled and the
C        stack is not touched.
C
C Assumes n >= 1 and 1 <= cnt <= 63, as the GMP manual requires of callers of
C mpn_rshift; neither condition is checked here.
C
C The block counter is kept in x17 and not in x18.  x18 is the AAPCS64
C platform register; it is reserved on Apple and Windows targets, where
C writing it can corrupt OS state or be silently undone.
C
C Invariant throughout: x2 holds (last consumed limb) >> cnt, the low part of
C the next result limb still waiting for its high bits.
C
C The instruction order below is tuned for the pipelines listed at the top of
C the file; keep it when editing.

PROLOGUE(mpn_rshift)
	mov	rp, rp_arg		C free x0 for the return value
	sub	tnc, xzr, cnt		C tnc = -cnt; register shifts use it mod 64
	lsr	x17, n, #2		C x17 = n >> 2, counts passes of the loop
	tbz	n, #0, L(bx0)		C bit 0 clear: n is even

C n odd: fetch the single leading limb.
L(bx1):	ldr	x5, [up]
	tbnz	n, #1, L(b11)

C n = 1 mod 4
L(b01):	NSHIFT	x0, x5, tnc		C return value, from up[0]
	PSHIFT	x2, x5, cnt
	cbnz	x17, L(gt1)
	str	x2, [rp]		C n = 1: single result limb
	ret
L(gt1):	ldp	x4, x5, [up,#8]
	sub	up, up, #8		C bias up and rp to match the pre-indexed
	sub	rp, rp, #32		C ldp/stp at lo2
	b	L(lo2)

C n = 3 mod 4
L(b11):	NSHIFT	x0, x5, tnc		C return value, from up[0]
	PSHIFT	x2, x5, cnt
	ldp	x6, x7, [up,#8]!
	sub	rp, rp, #16		C bias rp for the stores after lo3
	b	L(lo3)

C n even: fetch the two leading limbs.
L(bx0):	ldp	x4, x5, [up]
	tbz	n, #1, L(b00)

C n = 2 mod 4
L(b10):	NSHIFT	x0, x4, tnc		C return value, from up[0]
	PSHIFT	x13, x4, cnt
	NSHIFT	x10, x5, tnc
	PSHIFT	x2, x5, cnt
	cbnz	x17, L(gt2)
	orr	x10, x10, x13
	stp	x10, x2, [rp]		C n = 2: both result limbs at once
	ret
L(gt2):	ldp	x4, x5, [up,#16]
	orr	x10, x10, x13
	str	x10, [rp],#-24		C store rp[0], then bias rp for lo2
	b	L(lo2)

C n = 0 mod 4
L(b00):	NSHIFT	x0, x4, tnc		C return value, from up[0]
	PSHIFT	x13, x4, cnt
	NSHIFT	x10, x5, tnc
	PSHIFT	x2, x5, cnt
	ldp	x6, x7, [up,#16]!
	orr	x10, x10, x13
	str	x10, [rp],#-8		C store rp[0], then bias rp for lo0
	b	L(lo0)

C Main loop: four limbs per pass, as two ldp/stp pairs.  lo2, lo0 and lo3 are
C the feed-in entry points for the n mod 4 cases above.
	ALIGN(16)
L(top):	ldp	x4, x5, [up,#16]
	orr	x10, x10, x13
	orr	x11, x12, x2
	stp	x11, x10, [rp,#16]
	PSHIFT	x2, x7, cnt
L(lo2):	NSHIFT	x10, x5, tnc
	NSHIFT	x12, x4, tnc
	PSHIFT	x13, x4, cnt
	ldp	x6, x7, [up,#32]!	C advance up by four limbs
	orr	x10, x10, x13
	orr	x11, x12, x2
	stp	x11, x10, [rp,#32]!	C advance rp by four limbs
	PSHIFT	x2, x5, cnt
L(lo0):	sub	x17, x17, #1
L(lo3):	NSHIFT	x10, x7, tnc
	NSHIFT	x12, x6, tnc
	PSHIFT	x13, x6, cnt
	cbnz	x17, L(top)

C Wind-down: combine and store the last two full limbs, then the top limb,
C which receives only its own right-shifted bits (zeros come in from above).
L(end):	orr	x10, x10, x13
	orr	x11, x12, x2
	PSHIFT	x2, x7, cnt
	stp	x11, x10, [rp,#16]
	str	x2, [rp,#32]
	ret
EPILOGUE()
126