dnl  AMD K6 mpn_lshift -- mpn left shift.

dnl  Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

dnl  Build configuration; presumably the source of the m4 macros used in this
dnl  file -- confirm in config.m4.
include(`../config.m4')


C K6: 3.0 cycles/limb


C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                       unsigned shift);
C
C The loop runs at 3 cycles/limb, limited by decoding and by having 3 mmx
C instructions.  This is despite every second fetch being unaligned.


C Argument slots, four bytes apart starting at offset 4, in the reverse of
C the C prototype order so that dst is lowest and shift is highest.
C Presumably these are offsets from the stack pointer at function entry,
C tracked through FRAME as the code pushes -- confirm in the macro definition.
defframe(PARAM_SHIFT,16)
defframe(PARAM_SIZE, 12)
defframe(PARAM_SRC,  8)
defframe(PARAM_DST,  4)

	TEXT
	ALIGN(32)

C mpn_lshift: shift the size-limb number at src left by shift bits, store
C the low size limbs of the result at dst, and return in eax the bits shifted
C out of the most significant limb, i.e. src[size-1] >> (32-shift).
C
C Arguments are read from the stack through PARAM_DST, PARAM_SRC, PARAM_SIZE
C and PARAM_SHIFT.  Nothing here validates them; the code presumably relies
C on the caller passing size >= 1 and 1 <= shift <= 31 -- confirm against
C the mpn_lshift contract.
C
C Limbs are written from the most significant end downwards.
C
C Registers: ebx is saved and restored; eax, ecx, edx and the flags are
C overwritten.  The size >= 2 path also overwrites mm0, mm6 and mm7 and
C finishes with emms.  The 1 limb path uses no mmx registers.
C
C The instruction order and the padding ahead of two_or_more appear to be
C deliberate K6 scheduling, so avoid reordering without re-measuring.

PROLOGUE(mpn_lshift)
deflit(`FRAME',0)

	C The 1 limb case can be done without the push %ebx, but it's then
	C still the same speed.  The push is left as a free helping hand for
	C the two_or_more code.

	movl	PARAM_SIZE, %eax
	pushl	%ebx			FRAME_pushl()

	movl	PARAM_SRC, %ebx
	decl	%eax			C eax = size-1, ZF set when size == 1

	movl	PARAM_SHIFT, %ecx	C movl leaves the flags from decl intact
	jnz	L(two_or_more)

	C size == 1: eax is zero here, so the double shift below leaves just
	C the top shift bits of the limb in eax.

	movl	(%ebx), %edx		C src limb
	movl	PARAM_DST, %ebx		C src pointer is no longer needed

	shldl(	%cl, %edx, %eax)	C return value

	shll	%cl, %edx

	movl	%edx, (%ebx)		C dst limb
	popl	%ebx

	ret


	ALIGN(16)	C avoid offset 0x1f
	nop		C avoid bad cache line crossing
L(two_or_more):
	C eax	size-1
	C ebx	src
	C ecx	shift
	C edx

	movl	(%ebx,%eax,4), %edx	C src high limb
	negl	%ecx

	movd	PARAM_SHIFT, %mm6	C mm6 = shift, used for the lowest limb
	addl	$32, %ecx		C 32-shift

	shrl	%cl, %edx		C retval: bits shifted out of the top limb

	movd	%ecx, %mm7
	movl	PARAM_DST, %ecx		C ecx becomes dst, 32-shift lives on in mm7

L(top):
	C eax	counter, size-1 to 1
	C ebx	src
	C ecx	dst
	C edx	retval
	C
	C mm0	scratch
	C mm6	shift
	C mm7	32-shift
	C
	C Each pass forms dst[eax].  The 8 byte load picks up src[eax-1] in the
	C low half and src[eax] in the high half; shifting that right by
	C 32-shift leaves (src[eax] << shift) | (src[eax-1] >> (32-shift)) in
	C the low 32 bits.

	movq	-4(%ebx,%eax,4), %mm0
	decl	%eax

	psrlq	%mm7, %mm0

	movd	%mm0, 4(%ecx,%eax,4)	C +4 compensates for the decl above
	jnz	L(top)			C flags are still those of decl


	C The lowest limb has nothing below it: dst[0] = src[0] << shift.

	movd	(%ebx), %mm0
	popl	%ebx

	psllq	%mm6, %mm0
	movl	%edx, %eax		C return value

	movd	%mm0, (%ecx)

	emms
	ret

EPILOGUE()