1dnl  S/390-64 mpn_lshift.
2
3dnl  Copyright 2011, 2012, 2014 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33C            cycles/limb
34C z900		 7
35C z990           3
36C z9		 ?
37C z10		 6
38C z196		 ?
39
40C NOTES
41C  * This uses discrete loads and stores in a software pipeline.  Using lmg and
42C    stmg is not faster.
43C  * One could assume more pipelining could approach 2.5 c/l, but we have not
44C    found any 8-way loop that runs better than the current 4-way loop.
45C  * Consider using the same feed-in code for 1 <= n <= 3 as for n mod 4,
46C    similarly to the x86_64 sqr_basecase feed-in.
47
48C INPUT PARAMETERS
49define(`rp',	`%r2')		C result pointer, limbs are stored through it
50define(`up',	`%r3')		C source pointer, limbs are loaded through it
51define(`n',	`%r4')		C limb count, r4 is reused as scratch once consumed
52define(`cnt',	`%r5')		C left shift count in bits
53
54define(`tnc',	`%r6')		C set to -cnt by the code, count for right shifts
55
56ASM_START()
C mpn_lshift -- shift the n-limb operand at up left by cnt bits and store the
C n-limb result at rp.  The bits shifted out of the most significant limb are
C left in r2 at return.
C
C Limbs are handled from the most significant end downwards.  Each result limb
C is (U[i] << cnt) | (U[i-1] >> tnc), with zeros shifted into the lowest limb.
C Right shifts use tnc = -cnt as count, which relies on the shift instructions
C using only the low six bits of the count, i.e. an effective count of 64-cnt.
C NOTE(review): this presumes 1 <= cnt <= 63 and n >= 1, neither is checked
C here -- confirm against the caller contract.
C
C n <= 3 is dispatched through a branch table to straight-line code.  Larger n
C uses the 4-way unrolled, software-pipelined loop at L(top), which is entered
C at one of L(lm0)..L(lm3) depending on n mod 4.
C
C Register use on the n > 3 path:
C   r0		loop counter, n / 4
C   r1		n * 8, used once to advance the pointers
C   r4, r12	bits shifted right out of a limb, ORed into the limb above
C   r7, r8	source limbs most recently loaded
C   r9		bits shifted out of the top limb, copied to r2 at the end
C   r10, r11, r13  left-shifted limbs waiting for their ogr and store
57PROLOGUE(mpn_lshift)
58	cghi	n, 3
59	jh	L(gt1)			C despite the label name, taken for n > 3
60
C Small operands, n <= 3.
61	stmg	%r6, %r7, 48(%r15)	C r6 (tnc) and r7 are written below
62	larl	%r1, L(tab)-4		C table base biased so n = 1 hits entry 0
63	lcgr	tnc, cnt
64	sllg	n, n, 2			C byte offset, 4 per table entry
65	b	0(n,%r1)
66L(tab):	j	L(n1)
67	j	L(n2)
68	j	L(n3)
69
C n = 1
70L(n1):	lg	%r1, 0(up)
71	sllg	%r0, %r1, 0(cnt)
72	stg	%r0, 0(rp)
73	srlg	%r2, %r1, 0(tnc)	C shifted-out bits; rp (r2) is dead by now
74	lg	%r6, 48(%r15)		C restoring r7 not needed, never written here
75	br	%r14
76
C n = 2: start with the top limb, then share the tail at L(cj)
77L(n2):	lg	%r1, 8(up)
78	srlg	%r4, %r1, 0(tnc)	C bits shifted out of the top limb
79	sllg	%r0, %r1, 0(cnt)
80	j	L(cj)
81
C n = 3: top and middle limb, then fall into the shared tail
82L(n3):	lg	%r1, 16(up)
83	srlg	%r4, %r1, 0(tnc)	C bits shifted out of the top limb
84	sllg	%r0, %r1, 0(cnt)
85	lg	%r1, 8(up)
86	srlg	%r7, %r1, 0(tnc)
87	ogr	%r7, %r0		C top result limb
88	sllg	%r0, %r1, 0(cnt)
89	stg	%r7, 16(rp)
C Shared tail for n = 2 and n = 3.  On entry r0 holds limb 1 shifted left and
C r4 holds the bits to be returned.
90L(cj):	lg	%r1, 0(up)
91	srlg	%r7, %r1, 0(tnc)
92	ogr	%r7, %r0
93	sllg	%r0, %r1, 0(cnt)
94	stg	%r7, 8(rp)
95	stg	%r0, 0(rp)		C lowest limb, zeros shifted in
96	lgr	%r2, %r4		C hand back the shifted-out bits
97	lmg	%r6, %r7, 48(%r15)
98	br	%r14
99
C General case, n > 3.
100L(gt1):	stmg	%r6, %r13, 48(%r15)
101	lcgr	tnc, cnt		C tnc = -cnt
102
103	sllg	%r1, n, 3		C n * 8, operand size in bytes
104	srlg	%r0, n, 2		C loop count
105
106	agr	up, %r1			C point up at end of U
107	agr	rp, %r1			C point rp at end of R
108	aghi	up, -56			C top limb of U is now at 48(up)
109	aghi	rp, -40
110
111	lghi	%r7, 3
112	ngr	%r7, n			C r7 = n mod 4, n itself is dead after this
113	je	L(b0)
114	cghi	%r7, 2
115	jl	L(b1)
116	je	L(b2)
117
C Feed-in code.  Each variant loads the top limbs, puts the bits shifted out of
C the top limb in r9, and biases rp (and up) so that the offsets used at its
C loop entry point address the right limbs.
C
C n mod 4 = 3
118L(b3):	lg	%r7, 48(up)
119	srlg	%r9, %r7, 0(tnc)	C bits shifted out of the top limb
120	sllg	%r11, %r7, 0(cnt)
121	lg	%r8, 40(up)
122	lg	%r7, 32(up)
123	srlg	%r4, %r8, 0(tnc)
124	sllg	%r13, %r8, 0(cnt)
125	ogr	%r11, %r4		C top result limb, stored at L(lm3)
126	la	rp, 16(rp)
127	j	L(lm3)
128
C n mod 4 = 2
129L(b2):	lg	%r8, 48(up)
130	lg	%r7, 40(up)
131	srlg	%r9, %r8, 0(tnc)	C bits shifted out of the top limb
132	sllg	%r13, %r8, 0(cnt)
133	la	rp, 24(rp)
134	la	up, 8(up)
135	j	L(lm2)
136
C n mod 4 = 1
137L(b1):	lg	%r7, 48(up)
138	srlg	%r9, %r7, 0(tnc)	C bits shifted out of the top limb
139	sllg	%r11, %r7, 0(cnt)
140	lg	%r8, 40(up)
141	lg	%r7, 32(up)
142	srlg	%r4, %r8, 0(tnc)
143	sllg	%r10, %r8, 0(cnt)
144	ogr	%r11, %r4		C top result limb, stored at L(lm1)
145	la	rp, 32(rp)
146	la	up, 16(up)
147	j	L(lm1)
148
C n mod 4 = 0
149L(b0):	lg	%r8, 48(up)
150	lg	%r7, 40(up)
151	srlg	%r9, %r8, 0(tnc)	C bits shifted out of the top limb
152	sllg	%r10, %r8, 0(cnt)
153	la	rp, 40(rp)
154	la	up, 24(up)
155	j	L(lm0)
156
C Main loop, four limbs per pass, both pointers stepping down by 32 bytes.
C Loads run ahead of the stores that consume them, so instruction order and
C register assignment matter here.
157	ALIGN(8)
158L(top):	srlg	%r4, %r8, 0(tnc)
159	sllg	%r13, %r8, 0(cnt)
160	ogr	%r11, %r4
161	stg	%r10, 24(rp)
162L(lm3):	stg	%r11, 16(rp)
163L(lm2):	srlg	%r12, %r7, 0(tnc)
164	sllg	%r11, %r7, 0(cnt)
165	lg	%r8, 24(up)
166	lg	%r7, 16(up)
167	ogr	%r13, %r12
168	srlg	%r4, %r8, 0(tnc)
169	sllg	%r10, %r8, 0(cnt)
170	ogr	%r11, %r4
171	stg	%r13, 8(rp)
172L(lm1):	stg	%r11, 0(rp)
173L(lm0):	srlg	%r12, %r7, 0(tnc)
174	aghi	rp, -32
175	sllg	%r11, %r7, 0(cnt)
176	lg	%r8, 8(up)		C next two limbs, used by the next pass
177	lg	%r7, 0(up)		C or by the wind-down code
178	aghi	up, -32
179	ogr	%r10, %r12
180	brctg	%r0, L(top)		C decrement r0, loop while nonzero
181
C Wind-down: store the last four result limbs from the pending r10 and r11 and
C the two limbs already loaded into r8 and r7.  No further loads are done.
182L(end):	srlg	%r4, %r8, 0(tnc)
183	sllg	%r13, %r8, 0(cnt)
184	ogr	%r11, %r4
185	stg	%r10, 24(rp)
186	stg	%r11, 16(rp)
187	srlg	%r12, %r7, 0(tnc)
188	sllg	%r11, %r7, 0(cnt)
189	ogr	%r13, %r12
190	stg	%r13, 8(rp)
191	stg	%r11, 0(rp)		C lowest limb, zeros shifted in
192	lgr	%r2, %r9		C hand back the bits saved by the feed-in
193
194	lmg	%r6, %r13, 48(%r15)
195	br	%r14
196EPILOGUE()
197