1dnl  Intel Pentium mpn_lshift -- mpn left shift.
2
3dnl  Copyright 1992, 1994, 1995, 1996, 1999, 2000, 2002 Free Software
4dnl  Foundation, Inc.
5dnl
6dnl  This file is part of the GNU MP Library.
7dnl
8dnl  The GNU MP Library is free software; you can redistribute it and/or
9dnl  modify it under the terms of the GNU Lesser General Public License as
10dnl  published by the Free Software Foundation; either version 3 of the
11dnl  License, or (at your option) any later version.
12dnl
13dnl  The GNU MP Library is distributed in the hope that it will be useful,
14dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
15dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16dnl  Lesser General Public License for more details.
17dnl
18dnl  You should have received a copy of the GNU Lesser General Public License
19dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
20
21include(`../config.m4')
22
23
24C         cycles/limb
25C P5,P54:    6.0
26C P55:       5.375
27
28
29C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
30C                       unsigned shift);
31C
32C The main shift-by-N loop should run at 5.375 c/l and that's what P55 does,
33C but P5 and P54 run only at 6.0 c/l, which is 4 cycles lost somewhere.
34
C Stack slots of the four arguments, in the order of the C prototype above.
C defframe is defined outside this file, presumably by the m4 definitions
C pulled in through the include; it appears to make each name an operand
C addressing the given offset adjusted by FRAME -- confirm there.
35defframe(PARAM_SHIFT,16)
36defframe(PARAM_SIZE, 12)
37defframe(PARAM_SRC,  8)
38defframe(PARAM_DST,  4)
39
40	TEXT
41	ALIGN(8)
42PROLOGUE(mpn_lshift)
43
C Shift the size-limb number at src left by shift bits, store the result
C at dst, and return in %eax the bits shifted out of the top limb.
C
C Two code paths follow:
C   L(normal)   any shift count; walks from the most significant limb
C               downwards, combining neighbouring limbs with shldl.
C   L(special)  shift count 1 only; walks from the least significant limb
C               upwards, doubling limbs with an add/adc carry chain.
C
C Register use once the parameters are loaded:
C   %edi  dst pointer		%esi  src pointer
C   %ebp  limb count		%ecx  shift count, used through %cl
C   %eax, %ebx, %edx  limbs in flight
C
C The four pushes save %edi, %esi, %ebx and %ebp, and FRAME is set to 16
C to match the 16 bytes pushed.
44	pushl	%edi
45	pushl	%esi
46	pushl	%ebx
47	pushl	%ebp
48deflit(`FRAME',16)
49
50	movl	PARAM_DST,%edi
51	movl	PARAM_SRC,%esi
52	movl	PARAM_SIZE,%ebp
53	movl	PARAM_SHIFT,%ecx
54
55C We can use faster code for shift-by-1 under certain conditions.
C The special path is taken when shift is 1 and either src + 4 >= dst or
C dst >= src + 4*size, both compared as unsigned addresses.  Anything
C else falls through to L(normal).
56	cmp	$1,%ecx
57	jne	L(normal)
58	leal	4(%esi),%eax
59	cmpl	%edi,%eax
60	jnc	L(special)		C jump if s_ptr + 1 >= res_ptr
61	leal	(%esi,%ebp,4),%eax
62	cmpl	%eax,%edi
63	jnc	L(special)		C jump if res_ptr >= s_ptr + size
64
65L(normal):
66	leal	-4(%edi,%ebp,4),%edi	C %edi = address of dst limb size-1
67	leal	-4(%esi,%ebp,4),%esi	C %esi = address of src limb size-1
68
69	movl	(%esi),%edx		C %edx = most significant src limb
70	subl	$4,%esi
71	xorl	%eax,%eax
72	shldl(	%cl, %edx, %eax)	C compute carry limb
73	pushl	%eax			C push carry limb onto stack
74
75	decl	%ebp			C size-1 source limbs are still to be loaded
76	pushl	%ebp			C keep size-1 for the tail count at L(end)
77	shrl	$3,%ebp			C %ebp = number of 8-limb loop passes
78	jz	L(end)
79
80	movl	(%edi),%eax		C fetch destination cache line
81
C Main loop.  On entry to each pass %edx holds the source limb loaded last,
C whose shifted value has not been stored yet.  A pass loads 8 further
C source limbs and stores 8 destination limbs, rotating the limbs through
C %eax, %ebx and %edx.  Each shldl shifts its last operand left by %cl and
C fills the vacated low bits from the top of the next lower limb.
82	ALIGN(4)
83L(oop):	movl	-28(%edi),%eax		C fetch destination cache line
84	movl	%edx,%ebx
85
86	movl	(%esi),%eax
87	movl	-4(%esi),%edx
88	shldl(	%cl, %eax, %ebx)
89	shldl(	%cl, %edx, %eax)
90	movl	%ebx,(%edi)
91	movl	%eax,-4(%edi)
92
93	movl	-8(%esi),%ebx
94	movl	-12(%esi),%eax
95	shldl(	%cl, %ebx, %edx)
96	shldl(	%cl, %eax, %ebx)
97	movl	%edx,-8(%edi)
98	movl	%ebx,-12(%edi)
99
100	movl	-16(%esi),%edx
101	movl	-20(%esi),%ebx
102	shldl(	%cl, %edx, %eax)
103	shldl(	%cl, %ebx, %edx)
104	movl	%eax,-16(%edi)
105	movl	%edx,-20(%edi)
106
107	movl	-24(%esi),%eax
108	movl	-28(%esi),%edx
109	shldl(	%cl, %eax, %ebx)
110	shldl(	%cl, %edx, %eax)
111	movl	%ebx,-24(%edi)
112	movl	%eax,-28(%edi)
113
114	subl	$32,%esi
115	subl	$32,%edi
116	decl	%ebp
117	jnz	L(oop)
118
119L(end):	popl	%ebp			C %ebp = size-1 again
120	andl	$7,%ebp			C limbs left over after the 8-limb passes
121	jz	L(end2)
C Tail loop, one limb per pass, with %edx again holding the pending limb.
122L(oop2):
123	movl	(%esi),%eax
124	shldl(	%cl,%eax,%edx)
125	movl	%edx,(%edi)
126	movl	%eax,%edx
127	subl	$4,%esi
128	subl	$4,%edi
129	decl	%ebp
130	jnz	L(oop2)
131
132L(end2):
133	shll	%cl,%edx		C compute least significant limb
134	movl	%edx,(%edi)		C store it
135
136	popl	%eax			C pop carry limb
137
C Restore the saved registers in reverse push order; %eax is the return value.
138	popl	%ebp
139	popl	%ebx
140	popl	%esi
141	popl	%edi
142	ret
143
144
145C This path loops from the least significant end of the arrays, which is
146C only permissible for the operand layouts accepted by the address checks
147C at entry, since the function is documented to work for overlapping
C source and destination.
148
149L(special):
150	movl	(%esi),%edx		C %edx = least significant src limb
151	addl	$4,%esi
152
153	decl	%ebp
154	pushl	%ebp			C keep size-1 for the tail count at L(Lend)
155	shrl	$3,%ebp			C %ebp = number of 8-limb loop passes
156
157	addl	%edx,%edx		C double the first limb, carry out in CF
158	incl	%ebp			C incl then decl leaves %ebp unchanged and
159	decl	%ebp			C keeps CF, while setting ZF if %ebp is zero
160	jz	L(Lend)
161
162	movl	(%edi),%eax		C fetch destination cache line
163
C Main shift-by-1 loop.  On entry to each pass %edx holds an already
C doubled limb waiting to be stored and CF holds the bit shifted out of
C it.  A pass loads and doubles 8 further source limbs with adcl and
C stores 8 destination limbs.  Nothing between the adcl instructions may
C change CF, hence only movl, leal and decl are used around them.
164	ALIGN(4)
165L(Loop):
166	movl	28(%edi),%eax		C fetch destination cache line
167	movl	%edx,%ebx
168
169	movl	(%esi),%eax
170	movl	4(%esi),%edx
171	adcl	%eax,%eax
172	movl	%ebx,(%edi)
173	adcl	%edx,%edx
174	movl	%eax,4(%edi)
175
176	movl	8(%esi),%ebx
177	movl	12(%esi),%eax
178	adcl	%ebx,%ebx
179	movl	%edx,8(%edi)
180	adcl	%eax,%eax
181	movl	%ebx,12(%edi)
182
183	movl	16(%esi),%edx
184	movl	20(%esi),%ebx
185	adcl	%edx,%edx
186	movl	%eax,16(%edi)
187	adcl	%ebx,%ebx
188	movl	%edx,20(%edi)
189
190	movl	24(%esi),%eax
191	movl	28(%esi),%edx
192	adcl	%eax,%eax
193	movl	%ebx,24(%edi)
194	adcl	%edx,%edx
195	movl	%eax,28(%edi)
196
197	leal	32(%esi),%esi		C use leal not to clobber carry
198	leal	32(%edi),%edi
199	decl	%ebp
200	jnz	L(Loop)
201
202L(Lend):
203	popl	%ebp			C %ebp = size-1 again
204	sbbl	%eax,%eax		C save carry in %eax
205	andl	$7,%ebp			C limbs left over; andl overwrites CF
206	jz	L(Lend2)
207	addl	%eax,%eax		C restore carry from eax
C Tail loop, one limb per pass, same pending-limb and CF convention as above.
208L(Loop2):
209	movl	%edx,%ebx
210	movl	(%esi),%edx
211	adcl	%edx,%edx
212	movl	%ebx,(%edi)
213
214	leal	4(%esi),%esi		C use leal not to clobber carry
215	leal	4(%edi),%edi
216	decl	%ebp
217	jnz	L(Loop2)
218
219	jmp	L(L1)
220L(Lend2):
221	addl	%eax,%eax		C restore carry from eax
222L(L1):	movl	%edx,(%edi)		C store last limb
223
224	sbbl	%eax,%eax		C %eax = -1 if CF is set, else 0
225	negl	%eax			C return 1 if a bit was shifted out, else 0
226
C Restore the saved registers in reverse push order; %eax is the return value.
227	popl	%ebp
228	popl	%ebx
229	popl	%esi
230	popl	%edi
231	ret
232
233EPILOGUE()
234