/* Pentium optimized __mpn_rshift --
   Copyright (C) 1992, 94, 95, 96, 97, 98, 2000 Free Software Foundation, Inc.
   This file is part of the GNU MP Library.

   The GNU MP Library is free software; you can redistribute it and/or modify
   it under the terms of the GNU Lesser General Public License as published by
   the Free Software Foundation; either version 2.1 of the License, or (at your
   option) any later version.

   The GNU MP Library is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
   License for more details.

   You should have received a copy of the GNU Lesser General Public License
   along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
   the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
   MA 02111-1307, USA. */

#include "sysdep.h"
#include "asm-syntax.h"
#include "bp-sym.h"
#include "bp-asm.h"

#define PARMS	LINKAGE+16		/* space for 4 saved regs */
#define RES	PARMS
#define S	RES+PTR_SIZE
#define SIZE	S+PTR_SIZE
#define CNT	SIZE+4
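
/* In C terms, the routine below implements (a sketch of the mpn interface;
   types as in GMP's gmp.h):

     mp_limb_t __mpn_rshift (mp_ptr res_ptr, mp_srcptr s_ptr,
                             mp_size_t size, unsigned int cnt);

   It shifts {s_ptr, size} right by cnt bits (0 < cnt < 32), stores the
   result at {res_ptr, size}, and returns the bits shifted out, placed in
   the most significant positions of a limb.  */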

	.text
ENTRY (BP_SYM (__mpn_rshift))
	ENTER

	pushl	%edi
	pushl	%esi
	pushl	%ebp
	pushl	%ebx

	movl	RES(%esp),%edi
	movl	S(%esp),%esi
	movl	SIZE(%esp),%ebx
	movl	CNT(%esp),%ecx
#if __BOUNDED_POINTERS__
	shll	$2, %ebx		/* convert limbs to bytes */
	CHECK_BOUNDS_BOTH_WIDE (%edi, RES(%esp), %ebx)
	CHECK_BOUNDS_BOTH_WIDE (%esi, S(%esp), %ebx)
	shrl	$2, %ebx
#endif

/* We can use faster code for shift-by-1 under certain conditions.  */
	cmp	$1,%ecx
	jne	L(normal)
	leal	4(%edi),%eax
	cmpl	%esi,%eax
	jnc	L(special)		/* jump if res_ptr + 1 >= s_ptr */
	leal	(%edi,%ebx,4),%eax
	cmpl	%eax,%esi
	jnc	L(special)		/* jump if s_ptr >= res_ptr + size */
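
/* The rcrl-based code at L(special) walks from the most significant limb
   downwards, so it is safe when the destination starts no lower than one
   limb below the source (first test) or when the operands are completely
   disjoint (second test); otherwise fall through to the general shrdl
   loop, which walks upwards.  */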

L(normal):
	movl	(%esi),%edx
	addl	$4,%esi
	xorl	%eax,%eax
	shrdl	%cl,%edx,%eax		/* compute carry limb */
	pushl	%eax			/* push carry limb onto stack */
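
/* With %eax zeroed, shrdl %cl,%edx,%eax leaves %eax = %edx << (32 - cnt):
   the low cnt bits of the least significant limb, which are shifted out of
   the result and become the return value.  In C, roughly:

     carry_limb = s_ptr[0] << (32 - cnt);  */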

	decl	%ebx
	pushl	%ebx
	shrl	$3,%ebx
	jz	L(end)

	movl	(%edi),%eax		/* fetch destination cache line */

	ALIGN	(2)
L(oop):	movl	28(%edi),%eax		/* fetch destination cache line */
	movl	%edx,%ebp

	movl	(%esi),%eax
	movl	4(%esi),%edx
	shrdl	%cl,%eax,%ebp
	shrdl	%cl,%edx,%eax
	movl	%ebp,(%edi)
	movl	%eax,4(%edi)

	movl	8(%esi),%ebp
	movl	12(%esi),%eax
	shrdl	%cl,%ebp,%edx
	shrdl	%cl,%eax,%ebp
	movl	%edx,8(%edi)
	movl	%ebp,12(%edi)

	movl	16(%esi),%edx
	movl	20(%esi),%ebp
	shrdl	%cl,%edx,%eax
	shrdl	%cl,%ebp,%edx
	movl	%eax,16(%edi)
	movl	%edx,20(%edi)

	movl	24(%esi),%eax
	movl	28(%esi),%edx
	shrdl	%cl,%eax,%ebp
	shrdl	%cl,%edx,%eax
	movl	%ebp,24(%edi)
	movl	%eax,28(%edi)

	addl	$32,%esi
	addl	$32,%edi
	decl	%ebx
	jnz	L(oop)
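
/* Each pass through L(oop) produces eight result limbs, keeping one source
   limb of lookahead live in a register so every shrdl has both halves of
   the 64-bit window it needs.  In C terms each step is roughly:

     res_ptr[i] = (s_ptr[i] >> cnt) | (s_ptr[i + 1] << (32 - cnt));

   The loop below mops up the remaining size % 8 limbs one at a time.  */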

L(end):	popl	%ebx
	andl	$7,%ebx
	jz	L(end2)
L(oop2):
	movl	(%esi),%eax
	shrdl	%cl,%eax,%edx		/* compute result limb */
	movl	%edx,(%edi)
	movl	%eax,%edx
	addl	$4,%esi
	addl	$4,%edi
	decl	%ebx
	jnz	L(oop2)

L(end2):
	shrl	%cl,%edx		/* compute most significant limb */
	movl	%edx,(%edi)		/* store it */

	popl	%eax			/* pop carry limb */

	popl	%ebx
	popl	%ebp
	popl	%esi
	popl	%edi

	LEAVE
	ret

/* The normal code above loops from the least significant end of the arrays,
   which is only permissible for some overlaps of source and destination.
   Since the function is documented to work for overlapping operands, the
   shift-by-1 code below loops from the most significant end instead.  */

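/* The trick in the shift-by-1 code: rcrl $1 rotates a limb right through
   the carry flag, so the bit shifted out of one limb is carried into the
   top of the next, avoiding shrdl entirely.  This only works for cnt == 1,
   which is why the case is special-cased.  In C terms, each step is
   roughly:

     res = (s >> 1) | (carry << 31);  carry = s & 1;  */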
L(special):
	leal	-4(%edi,%ebx,4),%edi
	leal	-4(%esi,%ebx,4),%esi

	movl	(%esi),%edx
	subl	$4,%esi

	decl	%ebx
	pushl	%ebx
	shrl	$3,%ebx

	shrl	$1,%edx
	incl	%ebx
	decl	%ebx
	jz	L(Lend)
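
/* The incl/decl pair above is a flags trick: it leaves %ebx unchanged but
   sets ZF according to its value, while preserving the carry produced by
   shrl $1 (incl and decl do not modify CF).  A testl or orl would have
   cleared that carry, which the rcrl chain below still needs.  */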

	movl	(%edi),%eax		/* fetch destination cache line */

	ALIGN	(2)
L(Loop):
	movl	-28(%edi),%eax		/* fetch destination cache line */
	movl	%edx,%ebp

	movl	(%esi),%eax
	movl	-4(%esi),%edx
	rcrl	$1,%eax
	movl	%ebp,(%edi)
	rcrl	$1,%edx
	movl	%eax,-4(%edi)

	movl	-8(%esi),%ebp
	movl	-12(%esi),%eax
	rcrl	$1,%ebp
	movl	%edx,-8(%edi)
	rcrl	$1,%eax
	movl	%ebp,-12(%edi)

	movl	-16(%esi),%edx
	movl	-20(%esi),%ebp
	rcrl	$1,%edx
	movl	%eax,-16(%edi)
	rcrl	$1,%ebp
	movl	%edx,-20(%edi)

	movl	-24(%esi),%eax
	movl	-28(%esi),%edx
	rcrl	$1,%eax
	movl	%ebp,-24(%edi)
	rcrl	$1,%edx
	movl	%eax,-28(%edi)

	leal	-32(%esi),%esi		/* use leal not to clobber carry */
	leal	-32(%edi),%edi
	decl	%ebx
	jnz	L(Loop)

L(Lend):
	popl	%ebx
	sbbl	%eax,%eax		/* save carry in %eax */
	andl	$7,%ebx
	jz	L(Lend2)
	addl	%eax,%eax		/* restore carry from eax */
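/* Carry save/restore around the flag-clobbering andl: sbbl %eax,%eax turns
   CF into %eax = 0 or 0xffffffff, and addl %eax,%eax regenerates CF from
   bit 31 of that mask (0xffffffff doubled carries out a 1).  */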
L(Loop2):
	movl	%edx,%ebp
	movl	(%esi),%edx
	rcrl	$1,%edx
	movl	%ebp,(%edi)

	leal	-4(%esi),%esi		/* use leal not to clobber carry */
	leal	-4(%edi),%edi
	decl	%ebx
	jnz	L(Loop2)

	jmp	L(L1)
L(Lend2):
	addl	%eax,%eax		/* restore carry from eax */
L(L1):	movl	%edx,(%edi)		/* store last limb */

	movl	$0,%eax
	rcrl	$1,%eax
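
/* Materialize the return value: movl does not touch flags, so CF still
   holds the bit shifted out of the least significant limb; rcrl $1 rotates
   it into bit 31, giving %eax = carry << 31, the carry limb the caller
   expects.  */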

	popl	%ebx
	popl	%ebp
	popl	%esi
	popl	%edi

	LEAVE
	ret
END (BP_SYM (__mpn_rshift))