1dnl  AMD K6-2 mpn_lshift -- mpn left shift.
2
3dnl  Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
4dnl
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or
8dnl  modify it under the terms of the GNU Lesser General Public License as
9dnl  published by the Free Software Foundation; either version 3 of the
10dnl  License, or (at your option) any later version.
11dnl
12dnl  The GNU MP Library is distributed in the hope that it will be useful,
13dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
14dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15dnl  Lesser General Public License for more details.
16dnl
17dnl  You should have received a copy of the GNU Lesser General Public License
18dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
20include(`../config.m4')
21
22
23C K6-2: 1.75 cycles/limb
24
25
26C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
27C                       unsigned shift);
28C
29
dnl  Stack slots of the four arguments, named after the C prototype above
dnl  (dst, src, size, shift at 4, 8, 12, 16).
dnl  NOTE(review): defframe is defined outside this file; the numbers look
dnl  like byte offsets from %esp at function entry, with the return address
dnl  at offset 0, adjusted by the current FRAME value -- confirm there.
30defframe(PARAM_SHIFT,16)
31defframe(PARAM_SIZE, 12)
32defframe(PARAM_SRC,  8)
33defframe(PARAM_DST,  4)
34deflit(`FRAME',0)
35
36dnl  used after src has been fetched
dnl  VAR_RETVAL reuses the src argument slot: the code loads src into %ebx
dnl  first and only afterwards stores the return value into this slot, so
dnl  the unrolled path can use %edx for dst instead.
37define(VAR_RETVAL,`PARAM_SRC')
38
39dnl  minimum 9, because unrolled loop can't handle less
dnl  The code compares size-1 against UNROLL_THRESHOLD-1, so sizes of 9 or
dnl  more take the unrolled path and sizes 2 to 8 take the simple loop.
40deflit(UNROLL_THRESHOLD, 9)
41
42	TEXT
43	ALIGN(32)
44
C mpn_lshift -- shift the size-limb number at src left by shift bits.
C
C The low size limbs of the shifted value are stored at dst and the bits
C shifted out of the top limb are returned in %eax.  Limbs are 32 bits.
C Processing runs from the most significant limb down to src[0].
C
C Three code paths are used:
C   size == 1        plain integer code, no MMX
C   2 <= size <= 8   L(simple), one limb per iteration using MMX
C   size >= 9        L(unroll), four limbs per iteration using MMX
C
C %ebx is pushed on entry and popped on every exit.  The MMX paths execute
C femms before returning.
C
C NOTE(review): the code appears to assume size >= 1 and 1 <= shift <= 31
C (the return value is formed with shrl by 32-shift in %cl) -- confirm
C against the documented mpn_lshift contract.
45PROLOGUE(mpn_lshift)
46deflit(`FRAME',0)
47
48	C The 1 limb case can be done without the push %ebx, but it's then
49	C still the same speed.  The push is left as a free helping hand for
50	C the two_or_more code.
51
52	movl	PARAM_SIZE, %eax
53	pushl	%ebx			FRAME_pushl()
54
55	movl	PARAM_SRC, %ebx
56	decl	%eax			C eax = size-1, ZF set when size is 1
57
58	movl	PARAM_SHIFT, %ecx	C movl leaves the flags from decl intact
59	jnz	L(two_or_more)
60
61	movl	(%ebx), %edx		C src limb
62	movl	PARAM_DST, %ebx
63
	C eax is zero here (the decl above gave zero), so shifting it left
	C with bits supplied from the top of edx leaves just the bits that
	C fall out of the limb.  NOTE(review): assumes the shldl macro
	C expands to a plain shld instruction -- defined outside this file.
64	shldl(	%cl, %edx, %eax)	C return value
65
66	shll	%cl, %edx
67
68	movl	%edx, (%ebx)		C dst limb
69	popl	%ebx
70
71	ret				C no MMX register was touched on this path
72
73
74C -----------------------------------------------------------------------------
75	ALIGN(16)	C avoid offset 0x1f
76L(two_or_more):
77	C eax	size-1
78	C ebx	src
79	C ecx	shift
80	C edx
81
	C Compute the return value from the high limb, stash it, and pick
	C the simple or unrolled loop according to size.
82	movl	(%ebx,%eax,4), %edx	C src high limb
83	negl	%ecx
84
85	movd	PARAM_SHIFT, %mm6	C mm6 = shift, as an MMX shift count
86	addl	$32, %ecx		C 32-shift
87
88	shrl	%cl, %edx		C bits shifted out of the high limb
89	cmpl	$UNROLL_THRESHOLD-1, %eax
90
91	movl	%edx, VAR_RETVAL	C movl keeps the flags from cmpl
92	jae	L(unroll)		C taken when size >= UNROLL_THRESHOLD
93
94
95	movd	%ecx, %mm7		C mm7 = 32-shift
96	movl	%eax, %ecx		C ecx = size-1, loop counter
97
98	movl	PARAM_DST, %eax
99
100L(simple):
101	C eax	dst
102	C ebx	src
103	C ecx	counter, size-1 to 1
104	C edx	retval
105	C
106	C mm0	scratch
107	C mm6	shift
108	C mm7	32-shift
109
	C Each pass produces dst[ecx] from the qword src[ecx-1],src[ecx].
110	movq	-4(%ebx,%ecx,4), %mm0	C low dword src[ecx-1], high src[ecx]
111
112	psrlq	%mm7, %mm0		C low dword is now dst[ecx]
113
	C NOTE(review): Zdisp is defined outside this file; presumably it
	C emits the instruction with an explicit zero displacement.
114Zdisp(	movd,	%mm0, 0,(%eax,%ecx,4))
115	loop	L(simple)		C ecx--, repeat while ecx != 0
116
117
	C Only dst[0] remains, it has no lower limb to take bits from.
118	movd	(%ebx), %mm0		C src[0]
119	popl	%ebx
120
121	psllq	%mm6, %mm0
122
123	movd	%mm0, (%eax)		C dst[0]
124	movl	%edx, %eax		C return value, edx untouched since shrl
125
126	femms				C leave MMX state before returning
127	ret
128
129
130C -----------------------------------------------------------------------------
131	ALIGN(16)
132L(unroll):
133	C eax	size-1
134	C ebx	src
135	C ecx	32-shift
136	C edx	retval (but instead VAR_RETVAL is used)
137	C
138	C mm6	shift
139
140	addl	$32, %ecx		C 64-shift
141	movl	PARAM_DST, %edx
142
143	movd	%ecx, %mm7		C mm7 = 64-shift, for whole qwords
144	subl	$7, %eax			C size-8
145
146	leal	(%edx,%eax,4), %ecx		C alignment of dst
147
148	movq	32-8(%ebx,%eax,4), %mm2		C src high qword
	C The loop stores qwords at 40 and 32 bytes above (%edx,%eax,4)
	C after eax has dropped by 4, which is 24 and 16 bytes above the
	C address tested here.  So bit 2 of that address decides whether
	C those stores fall on 8 byte boundaries (given dst is limb aligned).
149	testb	$4, %cl
150
151	jz	L(dst_aligned)
	C Not aligned: emit the top dst limb on its own and move the qword
	C grid down one limb, so the stores in the loop become aligned.
152	psllq	%mm6, %mm2
153
154	psrlq	$32, %mm2			C low dword is now dst[size-1]
155	decl	%eax				C eax = size-9
156
157	movd	%mm2, 32(%edx,%eax,4)		C dst high limb
158	movq	32-8(%ebx,%eax,4), %mm2		C new src high qword
159L(dst_aligned):
160
161	movq	32-16(%ebx,%eax,4), %mm0	C src second highest qword
162
163
164	C This loop is the important bit, the rest is just support for it.
165	C Four src limbs are held at the start, and four more will be read.
166	C Four dst limbs will be written.  This schedule seems necessary for
167	C full speed.
168	C
169	C The use of size-8 lets the loop stop when %eax goes negative and
170	C leaves -4 to -1 which can be tested with test $1 and $2.
	C
	C Invariant at L(top), with E the value of eax on arrival:
	C mm2 holds src[E+6],src[E+7] and mm0 holds src[E+4],src[E+5].
	C One pass stores dst[E+6],dst[E+7] and dst[E+4],dst[E+5], and
	C reloads so the invariant holds again for E-4.
171
172L(top):
173	C eax	counter, size-8 step by -4 until <0
174	C ebx	src
175	C ecx
176	C edx	dst
177	C
178	C mm0	src next qword
179	C mm1	scratch
180	C mm2	src prev qword
181	C mm6	shift
182	C mm7	64-shift
183
184	psllq	%mm6, %mm2		C prev << shift
185	subl	$4, %eax		C borrow (carry set) on the last pass
186
187	movq	%mm0, %mm1		C unshifted copy of next
188	psrlq	%mm7, %mm0		C bits of next carried up into prev
189
190	por	%mm0, %mm2		C completed upper dst qword
191	movq	24(%ebx,%eax,4), %mm0	C load the following src qword
192
193	psllq	%mm6, %mm1		C next << shift
194	movq	%mm2, 40(%edx,%eax,4)	C store upper dst qword
195
196	movq	%mm0, %mm2		C becomes prev for the following pass
197	psrlq	%mm7, %mm0		C bits carried up into next
198
199	por	%mm0, %mm1		C completed lower dst qword
200	movq	16(%ebx,%eax,4), %mm0	C becomes next for the following pass
201
202	movq	%mm1, 32(%edx,%eax,4)	C store lower dst qword
203	jnc	L(top)			C carry is from subl, nothing since sets flags
204
205
206	C Now have four limbs in mm2 (prev) and mm0 (next), plus eax mod 4.
207	C
208	C 8(%ebx) is the next source, and 24(%edx) is the next destination.
209	C %eax is between -4 and -1, representing respectively 0 to 3 extra
210	C limbs that must be read.
	C
	C (Those two offsets are in addition to the %eax*4 index, ie.
	C 8(%ebx,%eax,4) and 24(%edx,%eax,4).)
211
212
213	testl	$2, %eax	C testl to avoid bad cache line crossing
214	jz	L(finish_nottwo)	C bit 1 is clear for eax of -4 or -3
215
216	C Two more limbs: lshift mm2, OR it with rshifted mm0, mm0 becomes
217	C new mm2 and a new mm0 is loaded.
218
219	psllq	%mm6, %mm2
220	movq	%mm0, %mm1
221
222	psrlq	%mm7, %mm0
223	subl	$2, %eax		C eax is now -4 or -3
224
225	por	%mm0, %mm2
226	movq	16(%ebx,%eax,4), %mm0
227
228	movq	%mm2, 32(%edx,%eax,4)
229	movq	%mm1, %mm2
230L(finish_nottwo):
231
232
233	C lshift mm2, OR with rshifted mm0, mm1 becomes lshifted mm0
	C
	C eax is -4 or -3 here.  The testb result is consumed by the jz
	C below, the MMX instructions and the store in between leave the
	C flags alone.
234
235	testb	$1, %al
236	psllq	%mm6, %mm2
237
238	movq	%mm0, %mm1
239	psrlq	%mm7, %mm0
240
241	por	%mm0, %mm2
242	psllq	%mm6, %mm1
243
244	movq	%mm2, 24(%edx,%eax,4)
245	jz	L(finish_even)		C eax is -4, only dst[0],dst[1] remain
246
247
248	C Size is odd, so mm1 and one extra limb to process.
	C
	C More precisely eax is -3, so an odd number of limbs remains:
	C mm1 holds src[1],src[2] shifted left and src[0] is still unread.
	C When the dst alignment step ran eax started at size-9, so this
	C path is then reached for an even size.
249
250	movd	(%ebx), %mm0		C src[0]
251	popl	%ebx
252deflit(`FRAME',0)
253
254	movq	%mm0, %mm2
255	psllq	$32, %mm0		C src[0] moved to the high dword
256
257	psrlq	%mm7, %mm0		C src[0] >> (32-shift), bits for dst[1]
258
259	psllq	%mm6, %mm2		C low dword is src[0] << shift
260	por	%mm0, %mm1
261
262	movq	%mm1, 4(%edx)		C dst[1,2]
263	movd	%mm2, (%edx)		C dst[0]
264
265	movl	VAR_RETVAL, %eax	C return value stashed in two_or_more
266
267	femms				C leave MMX state before returning
268	ret
269
270
271	nop	C avoid bad cache line crossing
272L(finish_even):
273deflit(`FRAME',4)
274	C Size is even, so only mm1 left to process.
	C
	C More precisely eax is -4, so an even number of limbs remained and
	C mm1 holds src[0],src[1] shifted left.  ebx is still on the stack
	C on this path, hence FRAME is 4 for the VAR_RETVAL access.
275
276	movq	%mm1, (%edx)		C dst[0,1]
277	movl	VAR_RETVAL, %eax	C return value stashed in two_or_more
278
279	popl	%ebx
280	femms				C leave MMX state before returning
281	ret
282
283EPILOGUE()
284