lshift.asm revision 1.1.1.1
1dnl  S/390-64 mpn_lshift.
2
3dnl  Copyright 2011, 2012 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of the GNU Lesser General Public License as published
9dnl  by the Free Software Foundation; either version 3 of the License, or (at
10dnl  your option) any later version.
11
12dnl  The GNU MP Library is distributed in the hope that it will be useful, but
13dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
15dnl  License for more details.
16
17dnl  You should have received a copy of the GNU Lesser General Public License
18dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
20include(`../config.m4')
21
22C            cycles/limb
23C z900		 7
24C z990           3
25C z9		 ?
26C z10		 ?
27C z196		 ?
28
29C NOTES
30C  * This uses discrete loads and stores in a software pipeline.  Using lmg and
31C    stmg is not faster.
32C  * One could assume more pipelining could approach 2.5 c/l, but we have not
33C    found any 8-way loop that runs better than the current 4-way loop.
34C  * Consider using the same feed-in code for 1 <= n <= 3 as for n mod 4,
35C    similarly to the x86_64 sqr_basecase feed-in.
36
37C INPUT PARAMETERS
C The four arguments are named after the registers %r2..%r5 they occupy; the
C rest of the file refers to them only through these m4 names.
38define(`rp',	`%r2')		C result pointer; %r2 is also loaded with the return value
39define(`up',	`%r3')		C source pointer
40define(`n',	`%r4')		C limb count; %r4 is reused as scratch once n is consumed
41define(`cnt',	`%r5')		C left shift count in bits
42
C Scratch register that holds -cnt, used as the right shift amount.  %r6 is
C saved to 48(%r15) before it is written and reloaded before each return.
43define(`tnc',	`%r6')
44
45ASM_START()
C mpn_lshift(rp, up, n, cnt)
C
C Shift the n-limb operand at up left by cnt bits and store the n result
C limbs at rp.  Limbs are 64 bits wide (all accesses are lg/stg at 8-byte
C strides).  Below, U[i] and R[i] denote limb i of the source and of the
C result, limb 0 being at offset 0.
C
C Each result limb is formed as (U[i] << cnt) | (U[i-1] >> tnc), with
C tnc = -cnt; the lowest limb is just U[0] << cnt.  The shift instructions
C use only the low six bits of the shift amount, so -cnt acts as 64-cnt.
C The value returned in %r2 is U[n-1] >> tnc, i.e. the bits shifted out at
C the top.
C
C Limbs are read and written from the most significant end downwards.
C
C NOTE(review): the code appears to rely on n >= 1 and 1 <= cnt <= 63, as the
C mpn_lshift contract presumably requires -- confirm.  For n = 0 the table
C dispatch would branch to L(tab)-4, which is not a table entry; for cnt = 0
C the right shifts would also shift by 0 and the ogr would merge whole limbs.
C
C Structure:
C   n <= 3   dispatch through the branch table at L(tab) to the straight-line
C            code at L(n1), L(n2), L(n3).
C   n >  3   L(gt1): a 4-way unrolled, software pipelined loop L(top) that is
C            entered at L(lm0)..L(lm3) according to n mod 4 by the feed-in
C            blocks L(b0)..L(b3), followed by the wind-down code at L(end).
C
C Register use in the n > 3 path:
C   %r0            loop counter, n / 4
C   %r7, %r8       source limbs loaded ahead of use
C   %r10,%r11,%r13 left-shifted halves of result limbs not yet stored
C   %r4, %r12      right-shifted halves to be ORed into the above
C   %r9            return value
46PROLOGUE(mpn_lshift)
47	cghi	n, 3
48	jh	L(gt1)			C n > 3: take the unrolled loop
49
50	stmg	%r6, %r7, 48(%r15)	C save r6 (tnc) and r7 (scratch for n = 2, 3)
51	larl	%r1, L(tab)-4		C table base, biased by one entry since n starts at 1
52	lcgr	tnc, cnt		C tnc = -cnt
53	sllg	n, n, 2			C scale n by 4, the size of one j table entry
54	b	0(n,%r1)		C dispatch on n = 1, 2, 3
55L(tab):	j	L(n1)
56	j	L(n2)
57	j	L(n3)
58
C n = 1: a single limb.
59L(n1):	lg	%r1, 0(up)		C r1 = U[0]
60	sllg	%r0, %r1, 0(cnt)
61	stg	%r0, 0(rp)		C R[0]; stored before %r2 (= rp) is overwritten
62	srlg	%r2, %r1, 0(tnc)	C return value = U[0] >> tnc
63	lg	%r6, 48(%r15)		C restoring r7 not needed
64	br	%r14
65
C n = 2: handle the top limb here, then share the tail of n = 3 at L(cj).
66L(n2):	lg	%r1, 8(up)		C r1 = U[1]
67	srlg	%r4, %r1, 0(tnc)	C r4 = return value (n is dead after the dispatch)
68	sllg	%r0, %r1, 0(cnt)	C r0 = U[1] << cnt, completed at L(cj)
69	j	L(cj)
70
C n = 3: produce R[2], then fall into the common tail.
71L(n3):	lg	%r1, 16(up)		C r1 = U[2]
72	srlg	%r4, %r1, 0(tnc)	C r4 = return value
73	sllg	%r0, %r1, 0(cnt)	C r0 = U[2] << cnt
74	lg	%r1, 8(up)		C r1 = U[1]
75	srlg	%r7, %r1, 0(tnc)
76	ogr	%r7, %r0		C r7 = (U[2] << cnt) | (U[1] >> tnc)
77	sllg	%r0, %r1, 0(cnt)	C r0 = U[1] << cnt
78	stg	%r7, 16(rp)		C R[2]
C Common tail: r0 = U[1] << cnt, r4 = return value.
79L(cj):	lg	%r1, 0(up)		C r1 = U[0]
80	srlg	%r7, %r1, 0(tnc)
81	ogr	%r7, %r0		C r7 = (U[1] << cnt) | (U[0] >> tnc)
82	sllg	%r0, %r1, 0(cnt)	C r0 = U[0] << cnt
83	stg	%r7, 8(rp)		C R[1]
84	stg	%r0, 0(rp)		C R[0]
85	lgr	%r2, %r4		C return value; rp is no longer needed
86	lmg	%r6, %r7, 48(%r15)	C restore r6 and r7
87	br	%r14
88
C n > 3: unrolled loop.
89L(gt1):	stmg	%r6, %r13, 48(%r15)	C save r6-r13, all of which are written below
90	lcgr	tnc, cnt		C tnc = -cnt
91
92	sllg	%r1, n, 3		C r1 = 8n, the operand size in bytes
93	srlg	%r0, n, 2		C loop count
94
95	agr	up, %r1			C point up at end of U
96	agr	rp, %r1			C point rp at end of R
97	aghi	up, -56			C bias so that 48(up) addresses U[n-1]
98	aghi	rp, -40			C bias; each feed-in block adds 16..40 to suit its entry
99
100	lghi	%r7, 3
101	ngr	%r7, n			C r7 = n mod 4; condition code feeds the je below
102	je	L(b0)
103	cghi	%r7, 2
104	jl	L(b1)			C n mod 4 = 1
105	je	L(b2)			C n mod 4 = 2; otherwise fall into L(b3)
106
C Feed-in for n mod 4 = 3: R[n-1] is completed here and stored at L(lm3).
107L(b3):	lg	%r7, 48(up)		C U[n-1]
108	srlg	%r9, %r7, 0(tnc)	C r9 = return value
109	sllg	%r11, %r7, 0(cnt)
110	lg	%r8, 40(up)		C U[n-2]
111	lg	%r7, 32(up)		C U[n-3]
112	srlg	%r4, %r8, 0(tnc)
113	sllg	%r13, %r8, 0(cnt)	C left half of R[n-2], completed at L(lm2)
114	ogr	%r11, %r4		C r11 = R[n-1]
115	la	rp, 16(rp)
116	j	L(lm3)
117
C Feed-in for n mod 4 = 2: only the left half of R[n-1] is prepared.
118L(b2):	lg	%r8, 48(up)		C U[n-1]
119	lg	%r7, 40(up)		C U[n-2]
120	srlg	%r9, %r8, 0(tnc)	C r9 = return value
121	sllg	%r13, %r8, 0(cnt)	C left half of R[n-1], completed at L(lm2)
122	la	rp, 24(rp)
123	la	up, 8(up)
124	j	L(lm2)
125
C Feed-in for n mod 4 = 1: R[n-1] is completed here and stored at L(lm1).
126L(b1):	lg	%r7, 48(up)		C U[n-1]
127	srlg	%r9, %r7, 0(tnc)	C r9 = return value
128	sllg	%r11, %r7, 0(cnt)
129	lg	%r8, 40(up)		C U[n-2]
130	lg	%r7, 32(up)		C U[n-3]
131	srlg	%r4, %r8, 0(tnc)
132	sllg	%r10, %r8, 0(cnt)	C left half of R[n-2], completed at L(lm0)
133	ogr	%r11, %r4		C r11 = R[n-1]
134	la	rp, 32(rp)
135	la	up, 16(up)
136	j	L(lm1)
137
C Feed-in for n mod 4 = 0: only the left half of R[n-1] is prepared.
138L(b0):	lg	%r8, 48(up)		C U[n-1]
139	lg	%r7, 40(up)		C U[n-2]
140	srlg	%r9, %r8, 0(tnc)	C r9 = return value
141	sllg	%r10, %r8, 0(cnt)	C left half of R[n-1], completed at L(lm0)
142	la	rp, 40(rp)
143	la	up, 24(up)
144	j	L(lm0)
145
C Main loop.  One full pass stores four result limbs, at 24(rp), 16(rp),
C 8(rp) and 0(rp), and loads four source limbs.  Both pointers step down by
C 32 bytes per pass.
146C	ALIGN(16)
147L(top):	srlg	%r4, %r8, 0(tnc)
148	sllg	%r13, %r8, 0(cnt)	C start the limb that is stored at 8(rp)
149	ogr	%r11, %r4		C complete the limb that is stored at 16(rp)
150	stg	%r10, 24(rp)		C limb completed at the end of the previous pass
151L(lm3):	stg	%r11, 16(rp)
152L(lm2):	srlg	%r12, %r7, 0(tnc)
153	sllg	%r11, %r7, 0(cnt)	C start the limb that is stored at 0(rp)
154	lg	%r8, 24(up)
155	lg	%r7, 16(up)
156	ogr	%r13, %r12		C complete the limb that is stored at 8(rp)
157	srlg	%r4, %r8, 0(tnc)
158	sllg	%r10, %r8, 0(cnt)	C start the limb stored at 24(rp) after rp steps down
159	ogr	%r11, %r4		C complete the limb that is stored at 0(rp)
160	stg	%r13, 8(rp)
161L(lm1):	stg	%r11, 0(rp)
162L(lm0):	srlg	%r12, %r7, 0(tnc)
163	aghi	rp, -32			C step rp down four limbs
164	sllg	%r11, %r7, 0(cnt)	C start the limb that is stored at 16(rp)
165	lg	%r8, 8(up)
166	lg	%r7, 0(up)
167	aghi	up, -32			C step up down four limbs
168	ogr	%r10, %r12		C complete the limb that is stored at 24(rp)
169	brctg	%r0, L(top)		C decrement r0, loop while it is nonzero
170
C Wind-down: finish and store the last four result limbs from the values
C already in registers.  No further loads are done, and the limb stored at
C 0(rp) gets no right-shifted half ORed in.
171L(end):	srlg	%r4, %r8, 0(tnc)
172	sllg	%r13, %r8, 0(cnt)
173	ogr	%r11, %r4
174	stg	%r10, 24(rp)
175	stg	%r11, 16(rp)
176	srlg	%r12, %r7, 0(tnc)
177	sllg	%r11, %r7, 0(cnt)
178	ogr	%r13, %r12
179	stg	%r13, 8(rp)
180	stg	%r11, 0(rp)
181	lgr	%r2, %r9		C return the bits shifted out of U[n-1]
182
183	lmg	%r6, %r13, 48(%r15)	C restore r6-r13
184	br	%r14
185EPILOGUE()
186