1dnl  AMD K6-2 mpn_lshift -- mpn left shift.
2
3dnl  Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33
34C K6-2: 1.75 cycles/limb
35
36
37C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
38C                       unsigned shift);
39C
40
dnl  Stack arguments, in the order of the C prototype (dst, src, size, shift),
dnl  at successive 4-byte offsets.
dnl  NOTE(review): defframe is defined outside this file; presumably each
dnl  offset is relative to the stack pointer at function entry and is adjusted
dnl  by the current FRAME value -- confirm in the shared m4 definitions.
41defframe(PARAM_SHIFT,16)
42defframe(PARAM_SIZE, 12)
43defframe(PARAM_SRC,  8)
44defframe(PARAM_DST,  4)
45deflit(`FRAME',0)
46
47dnl  used after src has been fetched
dnl  (the src pointer is loaded into ebx at entry, after which its stack slot
dnl  is reused to hold the return value on the multi-limb paths)
48define(VAR_RETVAL,`PARAM_SRC')
49
50dnl  minimum 9, because unrolled loop can't handle less
dnl  (compared against size-1 in the code, so sizes 2 to 8 take the simple
dnl  loop and sizes of 9 or more take the unrolled loop)
51deflit(UNROLL_THRESHOLD, 9)
52
53	TEXT
54	ALIGN(32)
55
56PROLOGUE(mpn_lshift)
57deflit(`FRAME',0)
58
	C Shift the size-limb number at src left by shift bits, store the
	C low size limbs of the result at dst, and return in eax the bits
	C shifted out of the top limb (held in the low bits of eax).
	C
	C Three paths, selected on size:
	C   size == 1                      integer shld/shl only, no MMX
	C   2 <= size < UNROLL_THRESHOLD   one limb per pass, at L(simple)
	C   size >= UNROLL_THRESHOLD       four limbs per pass, at L(top)
	C Every path works from the high limb down towards the low limb.
	C
	C ebx is the only register saved and restored.  The MMX paths end
	C with femms before returning.
	C
	C NOTE(review): size >= 1 and 1 <= shift <= 31 look to be assumed.
	C size == 0 would reach L(two_or_more) with eax == -1, and with
	C shift == 0 the integer shrl count 32-shift would be 32, which x86
	C masks to 0.  Confirm against the documented mpn_lshift contract.
	C
59	C The 1 limb case can be done without the push %ebx, but it's then
60	C still the same speed.  The push is left as a free helping hand for
61	C the two_or_more code.
62
63	movl	PARAM_SIZE, %eax	C eax = size
64	pushl	%ebx			FRAME_pushl()
65
66	movl	PARAM_SRC, %ebx		C ebx = src
67	decl	%eax			C eax = size-1, ZF set when size is 1
68
69	movl	PARAM_SHIFT, %ecx	C ecx = shift, flags from decl kept
70	jnz	L(two_or_more)		C taken when size is not 1
71
72	movl	(%ebx), %edx		C src limb
73	movl	PARAM_DST, %ebx		C ebx now reused as dst
74
75	shldl(	%cl, %edx, %eax)	C return value, eax was 0 and takes the top bits of edx
76
77	shll	%cl, %edx		C edx = src limb << shift
78
79	movl	%edx, (%ebx)		C dst limb
80	popl	%ebx
81
82	ret				C no MMX used on this path, so no femms
83
84
85C -----------------------------------------------------------------------------
86	ALIGN(16)	C avoid offset 0x1f
87L(two_or_more):
88	C eax	size-1
89	C ebx	src
90	C ecx	shift
91	C edx
	C
	C Compute the return value from the high limb, stash it in
	C VAR_RETVAL, then pick the simple or the unrolled loop.
92
93	movl	(%ebx,%eax,4), %edx	C src high limb
94	negl	%ecx			C -shift
95
96	movd	PARAM_SHIFT, %mm6	C mm6 = shift, used as MMX left shift count
97	addl	$32, %ecx		C 32-shift
98
99	shrl	%cl, %edx		C edx = bits shifted out of the high limb
100	cmpl	$UNROLL_THRESHOLD-1, %eax	C size-1 against threshold-1
101
102	movl	%edx, VAR_RETVAL	C save retval, movl keeps the cmpl flags
103	jae	L(unroll)		C taken when size >= UNROLL_THRESHOLD
104
105
106	movd	%ecx, %mm7		C mm7 = 32-shift
107	movl	%eax, %ecx		C ecx = size-1, loop counter
108
109	movl	PARAM_DST, %eax		C eax = dst
110
111L(simple):
112	C eax	dst
113	C ebx	src
114	C ecx	counter, size-1 to 1
115	C edx	retval
116	C
117	C mm0	scratch
118	C mm6	shift
119	C mm7	32-shift
	C
	C Each pass loads the limb pair src[ecx-1],src[ecx] as one qword
	C and shifts it right by 32-shift, which leaves
	C (src[ecx] << shift) | (src[ecx-1] >> (32-shift)) in the low dword.
	C That low dword is stored as dst[ecx].
120
121	movq	-4(%ebx,%ecx,4), %mm0	C low dword src[ecx-1], high dword src[ecx]
122
123	psrlq	%mm7, %mm0		C low dword is now the dst[ecx] value
124
125Zdisp(	movd,	%mm0, 0,(%eax,%ecx,4))
126	loop	L(simple)		C decrement ecx, repeat while nonzero
127
128
	C Lowest limb: nothing below it to shift in.
129	movd	(%ebx), %mm0		C src[0]
130	popl	%ebx
131
132	psllq	%mm6, %mm0		C src[0] << shift
133
134	movd	%mm0, (%eax)		C dst[0] = low dword
135	movl	%edx, %eax		C return value
136
137	femms
138	ret
139
140
141C -----------------------------------------------------------------------------
142	ALIGN(16)
143L(unroll):
144	C eax	size-1
145	C ebx	src
146	C ecx	32-shift
147	C edx	retval (but instead VAR_RETVAL is used)
148	C
149	C mm6	shift
	C
	C Set up for L(top): mm7 = 64-shift, eax = size-8, edx = dst.
	C If bit 2 of the address dst+4*(size-8) is set, one high dst limb
	C is written separately first and eax is decremented, which moves
	C the qword stores in the loop by one limb.
150
151	addl	$32, %ecx		C 64-shift
152	movl	PARAM_DST, %edx		C edx = dst, register copy of retval is dropped
153
154	movd	%ecx, %mm7		C mm7 = 64-shift, used as MMX right shift count
155	subl	$7, %eax			C size-8
156
157	leal	(%edx,%eax,4), %ecx		C alignment of dst
158
159	movq	32-8(%ebx,%eax,4), %mm2		C src high qword
160	testb	$4, %cl				C bit 2 of dst+4*(size-8)
161
162	jz	L(dst_aligned)
163	psllq	%mm6, %mm2			C high qword << shift
164
165	psrlq	$32, %mm2			C keep its high dword, the top dst limb
166	decl	%eax				C one limb done, eax = size-9
167
168	movd	%mm2, 32(%edx,%eax,4)		C dst high limb
169	movq	32-8(%ebx,%eax,4), %mm2		C new src high qword
170L(dst_aligned):
171
172	movq	32-16(%ebx,%eax,4), %mm0	C src second highest qword
173
174
175	C This loop is the important bit, the rest is just support for it.
176	C Four src limbs are held at the start, and four more will be read.
177	C Four dst limbs will be written.  This schedule seems necessary for
178	C full speed.
179	C
180	C The use of size-8 lets the loop stop when %eax goes negative and
181	C leaves -4 to -1 which can be tested with test $1 and $2.
	C
	C Each dst qword is formed as (prev << shift) | (next >> (64-shift)),
	C where prev is the higher src qword and next the one below it.
	C The loop branch uses the carry from the subl at the top.  The MMX
	C instructions in between do not modify the integer flags.
182
183L(top):
184	C eax	counter, size-8 step by -4 until <0
185	C ebx	src
186	C ecx
187	C edx	dst
188	C
189	C mm0	src next qword
190	C mm1	scratch
191	C mm2	src prev qword
192	C mm6	shift
193	C mm7	64-shift
194
195	psllq	%mm6, %mm2		C prev << shift
196	subl	$4, %eax		C counter -= 4, carry set on going below 0
197
198	movq	%mm0, %mm1		C keep a copy of next, it is prev for the second qword
199	psrlq	%mm7, %mm0		C next >> (64-shift)
200
201	por	%mm0, %mm2		C first dst qword complete
202	movq	24(%ebx,%eax,4), %mm0	C load the following lower src qword
203
204	psllq	%mm6, %mm1		C second prev << shift
205	movq	%mm2, 40(%edx,%eax,4)	C store first dst qword
206
207	movq	%mm0, %mm2		C copy becomes prev for the next pass
208	psrlq	%mm7, %mm0		C next >> (64-shift)
209
210	por	%mm0, %mm1		C second dst qword complete
211	movq	16(%ebx,%eax,4), %mm0	C load next for the following pass
212
213	movq	%mm1, 32(%edx,%eax,4)	C store second dst qword
214	jnc	L(top)			C repeat until the subl above borrowed
215
216
217	C Now have four limbs in mm2 (prev) and mm0 (next), plus eax mod 4.
218	C
219	C 8(%ebx) is the next source, and 24(%edx) is the next destination.
220	C %eax is between -4 and -1, representing respectively 0 to 3 extra
221	C limbs that must be read.
222
223
224	testl	$2, %eax	C testl to avoid bad cache line crossing
225	jz	L(finish_nottwo)
226
227	C Two more limbs: lshift mm2, OR it with rshifted mm0, mm0 becomes
228	C new mm2 and a new mm0 is loaded.
229
230	psllq	%mm6, %mm2
231	movq	%mm0, %mm1
232
233	psrlq	%mm7, %mm0
234	subl	$2, %eax		C step the index, bit 0 of eax is unchanged
235
236	por	%mm0, %mm2
237	movq	16(%ebx,%eax,4), %mm0
238
239	movq	%mm2, 32(%edx,%eax,4)
240	movq	%mm1, %mm2
241L(finish_nottwo):
242
243
244	C lshift mm2, OR with rshifted mm0, mm1 becomes lshifted mm0
	C
	C The ZF result of the testb is consumed by the jz at the end of
	C this group.  Only MMX instructions and a movq store come between.
245
246	testb	$1, %al			C one more odd limb remaining?
247	psllq	%mm6, %mm2
248
249	movq	%mm0, %mm1
250	psrlq	%mm7, %mm0
251
252	por	%mm0, %mm2
253	psllq	%mm6, %mm1
254
255	movq	%mm2, 24(%edx,%eax,4)
256	jz	L(finish_even)
257
258
259	C Size is odd, so mm1 and one extra limb to process.
	C
	C src[0] supplies the bits shifted into the low end of mm1, and
	C also produces dst[0] on its own.
260
261	movd	(%ebx), %mm0		C src[0]
262	popl	%ebx
	C ebx has been popped, hence FRAME is reset to 0 before VAR_RETVAL
	C is used below.
263deflit(`FRAME',0)
264
265	movq	%mm0, %mm2		C copy of src[0] for dst[0]
266	psllq	$32, %mm0		C move src[0] to the high dword
267
268	psrlq	%mm7, %mm0		C leaves src[0] >> (32-shift) in the low dword
269
270	psllq	%mm6, %mm2		C low dword = src[0] << shift
271	por	%mm0, %mm1		C bring the src[0] high bits into mm1
272
273	movq	%mm1, 4(%edx)		C dst[1,2]
274	movd	%mm2, (%edx)		C dst[0]
275
276	movl	VAR_RETVAL, %eax	C return value saved at L(two_or_more)
277
278	femms
279	ret
280
281
282	nop	C avoid bad cache line crossing
283L(finish_even):
	C Reached by a jump from a point where ebx is still pushed, hence
	C FRAME is 4 here and the popl comes after VAR_RETVAL is read.
284deflit(`FRAME',4)
285	C Size is even, so only mm1 left to process.
286
287	movq	%mm1, (%edx)		C dst[0,1]
288	movl	VAR_RETVAL, %eax	C return value saved at L(two_or_more)
289
290	popl	%ebx
291	femms
292	ret
293
294EPILOGUE()
295