1dnl  AMD K6-2 mpn_rshift -- mpn right shift.
2
3dnl  Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
4dnl
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or
8dnl  modify it under the terms of the GNU Lesser General Public License as
9dnl  published by the Free Software Foundation; either version 3 of the
10dnl  License, or (at your option) any later version.
11dnl
12dnl  The GNU MP Library is distributed in the hope that it will be useful,
13dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
14dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15dnl  Lesser General Public License for more details.
16dnl
17dnl  You should have received a copy of the GNU Lesser General Public License
18dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
20include(`../config.m4')
21
22
23C K6-2: 1.75 cycles/limb
24
25
26C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
27C                       unsigned shift);
28C
29
dnl  Names for the four stack arguments, in the same order as the prototype
dnl  above (dst, src, size, shift).  The second argument of each defframe is
dnl  the slot offset: 4, 8, 12, 16.  NOTE(review): presumably these are byte
dnl  offsets from %esp at function entry, and FRAME presumably tracks bytes
dnl  pushed since entry so the PARAM_ names stay valid after a pushl (see the
dnl  FRAME_pushl() use in the function) -- confirm against the defframe and
dnl  deflit macro definitions, which are not in this file.
dnl
30defframe(PARAM_SHIFT,16)
31defframe(PARAM_SIZE, 12)
32defframe(PARAM_SRC,  8)
33defframe(PARAM_DST,  4)
34deflit(`FRAME',0)
35
36dnl  Minimum 9, because the unrolled loop can't handle less.
dnl  The function compares size-1 against UNROLL_THRESHOLD-1 and takes the
dnl  unrolled path when size >= UNROLL_THRESHOLD.  That path may store one
dnl  limb to align dst, then always runs at least one 4-limb loop iteration
dnl  and stores two more qwords: 1+4+2+2 = 9 limbs in the misaligned case.
37dnl
38deflit(UNROLL_THRESHOLD, 9)
39
dnl  NOTE(review): TEXT and ALIGN are macros defined elsewhere; presumably
dnl  they select the code section and request 32-byte alignment for the
dnl  function entry -- confirm in the macro definitions.
40	TEXT
41	ALIGN(32)
42
43PROLOGUE(mpn_rshift)
44deflit(`FRAME',0)
45
	C Shift the size-limb number at src right by shift bits and store the
	C size-limb result at dst.  The value returned in %eax is src[0] shifted
	C left by 32-shift, i.e. the bits shifted out of the low limb, placed at
	C the top of the return word.
	C
	C Three paths, selected on size:
	C   size == 1      integer-only code, no MMX
	C   size 2 to 8    L(simple), one limb per iteration using MMX
	C   size >= 9      L(unroll), four limbs per iteration using MMX
	C
	C Registers: %ebx is pushed on entry and popped on every exit path.
	C %eax, %ecx, %edx and (on the MMX paths) mm0, mm1, mm2, mm6, mm7 are
	C overwritten.  The MMX paths execute femms before returning.
	C
	C NOTE(review): nothing here checks the arguments.  The code assumes
	C size >= 1 and 1 <= shift <= 31 (a size of 0 would fall into
	C L(two_or_more), and a shift of 0 would make the 32-shift count wrap to
	C a zero-bit shll) -- confirm this matches the documented mpn_rshift
	C contract.
	C
46	C The 1 limb case can be done without the push %ebx, but it's then
47	C still the same speed.  The push is left as a free helping hand for
48	C the two_or_more code.
49
50	movl	PARAM_SIZE, %eax
51	pushl	%ebx			FRAME_pushl()
52
53	movl	PARAM_SRC, %ebx
54	decl	%eax
55
	C The movl below does not change flags, so jnz still tests the decl
	C result: taken when size-1 != 0.
56	movl	PARAM_SHIFT, %ecx
57	jnz	L(two_or_more)
58
59	movl	(%ebx), %edx		C src limb
60	movl	PARAM_DST, %ebx
61
	C %eax is 0 here (size-1), so the double shift leaves just the low
	C shift bits of the src limb in the top of %eax.
62	shrdl(	%cl, %edx, %eax)	C return value
63
64	shrl	%cl, %edx
65
66	movl	%edx, (%ebx)		C dst limb
67	popl	%ebx
68
69	ret
70
71
72C -----------------------------------------------------------------------------
73	ALIGN(16)	C avoid offset 0x1f
74L(two_or_more):
75	C eax	size-1
76	C ebx	src
77	C ecx	shift
78	C edx
79
	C Compute the return value now, while the low limb is cheap to get:
	C edx = src[0] << (32-shift).  mm6 gets the shift count for the MMX
	C right shifts used by both paths below.
80	movl	(%ebx), %edx	C src low limb
81	negl	%ecx
82
83	addl	$32, %ecx
84	movd	PARAM_SHIFT, %mm6
85
86	shll	%cl, %edx
87	cmpl	$UNROLL_THRESHOLD-1, %eax
88
89	jae	L(unroll)
90
91
92	C eax	size-1
93	C ebx	src
94	C ecx	32-shift
95	C edx	retval
96	C
97	C mm6	shift
98
99	movl	PARAM_DST, %ecx
100	leal	(%ebx,%eax,4), %ebx
101
102	leal	-4(%ecx,%eax,4), %ecx
103	negl	%eax
104
105	C This loop runs at about 3 cycles/limb, which is the amount of
106	C decoding, and this is despite every second access being unaligned.
	C
	C Each iteration loads the qword src[i],src[i+1], shifts it right as a
	C 64-bit value and stores only the low 32 bits as dst[i].  The incl
	C sets the flags for the jnz; the psrlq and movd in between do not
	C change them.
107
108L(simple):
109	C eax	counter, -(size-1) to -1
110	C ebx	&src[size-1]
111	C ecx	&dst[size-2]
112	C edx	retval
113	C
114	C mm0	scratch
115	C mm6	shift
116
117Zdisp(	movq,	0,(%ebx,%eax,4), %mm0)
118	incl	%eax
119
120	psrlq	%mm6, %mm0
121
122Zdisp(	movd,	%mm0, 0,(%ecx,%eax,4))
123	jnz	L(simple)
124
125
	C mm0 still holds the last shifted qword, built from src[size-2] and
	C src[size-1].  Storing all 64 bits rewrites dst[size-2] with the same
	C value and supplies the top limb dst[size-1].
126	movq	%mm0, (%ecx)
127	movl	%edx, %eax
128
129	popl	%ebx
130
131	femms
132	ret
133
134
135C -----------------------------------------------------------------------------
136	ALIGN(16)
137L(unroll):
138	C eax	size-1
139	C ebx	src
140	C ecx	32-shift
141	C edx	retval
142	C
143	C mm6	shift
144
145	addl	$32, %ecx		C 64-shift
146	subl	$7, %eax		C size-8
147
148	movd	%ecx, %mm7		C mm7 = 64-shift, for the left shifts
149	movl	PARAM_DST, %ecx
150
151	movq	(%ebx), %mm2		C src low qword
152	leal	(%ebx,%eax,4), %ebx	C src end - 32
153
	C testb sets ZF from bit 2 of the dst address.  The leal and notl
	C that follow do not modify flags, so the jz still acts on that test:
	C it is taken when bit 2 is clear.
154	testb	$4, %cl
155	leal	(%ecx,%eax,4), %ecx	C dst end - 32
156
157	notl	%eax			C -(size-7)
158	jz	L(dst_aligned)
159
	C Bit 2 of dst is set: store one limb on its own first so that the
	C qword stores in the loop start one limb further on (8-byte aligned,
	C assuming dst is at least 4-byte aligned), and restart the pipeline
	C from src[1].
160	psrlq	%mm6, %mm2
161	incl	%eax
162
163Zdisp(	movd,	%mm2, 0,(%ecx,%eax,4))	C dst low limb
164	movq	4(%ebx,%eax,4), %mm2	C new src low qword
165L(dst_aligned):
166
167	movq	12(%ebx,%eax,4), %mm0	C src second lowest qword
168	nop	C avoid bad cache line crossing
169
170
171	C This loop is the important bit, the rest is just support for it.
172	C Four src limbs are held at the start, and four more will be read.
173	C Four dst limbs will be written.  This schedule seems necessary for
174	C full speed.
175	C
176	C The use of -(size-7) lets the loop stop when %eax becomes >= 0 and
177	C leaves 0 to 3 which can be tested with test $1 and $2.
	C
	C Each output qword is (prev >> shift) | (next << (64-shift)).  The
	C addl at the top is the only flag-setting instruction in the loop,
	C so the ja at the bottom tests its carry and zero flags.
178
179L(top):
180	C eax	counter, -(size-7) step by +4 until >=0
181	C ebx	src end - 32
182	C ecx	dst end - 32
183	C edx	retval
184	C
185	C mm0	src next qword
186	C mm1	scratch
187	C mm2	src prev qword
188	C mm6	shift
189	C mm7	64-shift
190
191	psrlq	%mm6, %mm2
192	addl	$4, %eax
193
194	movq	%mm0, %mm1
195	psllq	%mm7, %mm0
196
197	por	%mm0, %mm2
198	movq	4(%ebx,%eax,4), %mm0
199
200	psrlq	%mm6, %mm1
201	movq	%mm2, -12(%ecx,%eax,4)
202
203	movq	%mm0, %mm2
204	psllq	%mm7, %mm0
205
206	por	%mm0, %mm1
207	movq	12(%ebx,%eax,4), %mm0
208
209	movq	%mm1, -4(%ecx,%eax,4)
210	ja	L(top)		C jump if no carry and not zero
211
212
213
214	C Now have the four limbs in mm2 (low) and mm0 (high), and %eax is 0
215	C to 3 representing respectively 3 to 0 further limbs.
	C Bit 1 of %eax set (values 2 and 3) therefore means at most one
	C further limb, and the extra qword step below is skipped.
216
217	testl	$2, %eax	C testl to avoid bad cache line crossings
218	jnz	L(finish_nottwo)
219
220	C Two or three extra limbs: rshift mm2, OR it with lshifted mm0, mm0
221	C becomes new mm2 and a new mm0 is loaded.
222
223	psrlq	%mm6, %mm2
224	movq	%mm0, %mm1
225
226	psllq	%mm7, %mm0
227	addl	$2, %eax
228
229	por	%mm0, %mm2
230	movq	12(%ebx,%eax,4), %mm0
231
232	movq	%mm2, -4(%ecx,%eax,4)
233	movq	%mm1, %mm2
234L(finish_nottwo):
235
236
	C %eax is 2 or 3 here.  The testb result is consumed by the jnz below;
	C the MMX instructions and the store in between leave flags alone.
	C mm2 is combined with mm0 and stored; mm1 is left holding the high
	C qword shifted right, ready for whichever ending follows.
237	testb	$1, %al
238	psrlq	%mm6, %mm2
239
240	movq	%mm0, %mm1
241	psllq	%mm7, %mm0
242
243	por	%mm0, %mm2
244	psrlq	%mm6, %mm1
245
246	movq	%mm2, 4(%ecx,%eax,4)
247	jnz	L(finish_even)
248
249
250	C one further extra limb to process
	C movd loads it into the low half of mm0; shifted left by 64-shift it
	C supplies the bits that enter the top of the previous qword, and
	C shifted right by shift it gives the top dst limb.  %ebx is not used
	C after the load, so it is restored straight away.
251
252	movd	32-4(%ebx), %mm0	C src[size-1], most significant limb
253	popl	%ebx
254
255	movq	%mm0, %mm2
256	psllq	%mm7, %mm0
257
258	por	%mm0, %mm1
259	psrlq	%mm6, %mm2
260
261	movq	%mm1, 32-12(%ecx)	C dst[size-3,size-2]
262	movd	%mm2, 32-4(%ecx)	C dst[size-1]
263
264	movl	%edx, %eax		C retval
265
266	femms
267	ret
268
269
270	nop	C avoid bad cache line crossing
271L(finish_even):
272	C no further extra limbs
	C mm1 already holds the top src qword shifted right; nothing above it
	C to merge in, so it is stored as is.
273
274	movq	%mm1, 32-8(%ecx)	C dst[size-2,size-1]
275	movl	%edx, %eax		C retval
276
277	popl	%ebx
278
279	femms
280	ret
281
282EPILOGUE()
283