dnl  AMD K6 mpn_rshift -- mpn right shift.

dnl  Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C K6: 3.0 cycles/limb


C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                       unsigned shift);
C
C The loop runs at 3 cycles/limb, limited by decoding and by having 3 mmx
C instructions.  This is despite every second fetch being unaligned.


C Argument slots, in the same order as the prototype above.  defframe is
C defined outside this file; presumably the numbers are byte offsets from
C the stack pointer on entry, adjusted by FRAME -- TODO confirm.
defframe(PARAM_SHIFT,16)
defframe(PARAM_SIZE, 12)
defframe(PARAM_SRC,  8)
defframe(PARAM_DST,  4)
deflit(`FRAME',0)

	TEXT
	ALIGN(32)

PROLOGUE(mpn_rshift)
deflit(`FRAME',0)

	C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
	C                       unsigned shift);
	C
	C Shift the size-limb number at src right by shift bits and store the
	C result at dst.  The return value is the low limb of src shifted left
	C by 32-shift, ie. the bits shifted out, sitting at the top of the limb.
	C
	C Nothing here checks the arguments.  The code relies on size >= 1
	C (size == 0 would fall into the two_or_more path with a bogus count)
	C and presumably on 1 <= shift <= 31 -- TODO confirm against the
	C documented mpn_rshift requirements.
	C
	C ebx is saved on the stack and restored on both exit paths.  The MMX
	C path finishes with emms.

	C The 1 limb case can be done without the push %ebx, but it's then
	C still the same speed.  The push is left as a free helping hand for
	C the two_or_more code.

	movl	PARAM_SIZE, %eax
	pushl	%ebx			FRAME_pushl()

	movl	PARAM_SRC, %ebx
	decl	%eax			C eax = size-1, ZF set when size == 1

	movl	PARAM_SHIFT, %ecx	C movl leaves the flags from decl intact
	jnz	L(two_or_more)

	C One limb.  eax is zero here, so the double shift leaves just the
	C low "shift" bits of the src limb at the top of eax.

	movl	(%ebx), %edx		C src limb
	movl	PARAM_DST, %ebx

	shrdl(	%cl, %edx, %eax)	C return value

	shrl	%cl, %edx

	movl	%edx, (%ebx)		C dst limb
	popl	%ebx

	ret


	ALIGN(16)	C avoid offset 0x1f
L(two_or_more):
	C eax	size-1
	C ebx	src
	C ecx	shift
	C edx

	movl	(%ebx), %edx	C src low limb
	negl	%ecx

	addl	$32, %ecx	C 32-shift
	movd	PARAM_SHIFT, %mm6

	shll	%cl, %edx	C retval
	movl	PARAM_DST, %ecx

	leal	(%ebx,%eax,4), %ebx	C &src[size-1]

	leal	-4(%ecx,%eax,4), %ecx	C &dst[size-2]
	negl	%eax			C -(size-1)


L(simple):
	C eax	counter (negative), -(size-1) up to -1
	C ebx	&src[size-1]
	C ecx	&dst[size-2]
	C edx	retval
	C
	C mm0	scratch
	C mm6	shift
	C
	C Each pass loads the limb pair src[i],src[i+1] as one qword, shifts it
	C right and stores the low dword as dst[i], for i = 0 to size-2.  The
	C jnz tests the flags from incl; psrlq and movd leave them alone.
	C Zdisp is defined outside this file.

Zdisp(	movq,	0,(%ebx,%eax,4), %mm0)
	incl	%eax

	psrlq	%mm6, %mm0

Zdisp(	movd,	%mm0, 0,(%ecx,%eax,4))
	jnz	L(simple)


	C mm0 still holds the shifted pair src[size-2],src[size-1].  Its high
	C dword is src[size-1]>>shift, so one qword store writes dst[size-1]
	C and rewrites dst[size-2] with the value the loop already stored.

	movq	%mm0, (%ecx)
	movl	%edx, %eax

	popl	%ebx

	emms
	ret

EPILOGUE()
