mode1o.asm revision 1.1.1.1
1dnl  Intel Pentium mpn_modexact_1_odd -- exact division style remainder.
2
3dnl  Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
4dnl
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or
8dnl  modify it under the terms of the GNU Lesser General Public License as
9dnl  published by the Free Software Foundation; either version 3 of the
10dnl  License, or (at your option) any later version.
11dnl
12dnl  The GNU MP Library is distributed in the hope that it will be useful,
13dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
14dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15dnl  Lesser General Public License for more details.
16dnl
17dnl  You should have received a copy of the GNU Lesser General Public License
18dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
20include(`../config.m4')
21
22
23C P5: 23.0 cycles/limb
24
25
26C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size,
27C                               mp_limb_t divisor);
28C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
29C                                mp_limb_t divisor, mp_limb_t carry);
30C
31C There seems no way to pair up the two lone instructions in the main loop.
32C
33C The special case for size==1 saves about 20 cycles (non-PIC), making it
34C the same as mpn_mod_1, and in fact making modexact faster than mod_1 at
35C all sizes.
36C
37C Alternatives:
38C
39C Using mmx for the multiplies might be possible, with pmullw and pmulhw
40C having just 3 cycle latencies, but carry bit handling would probably be
41C complicated.
42
43defframe(PARAM_CARRY,  16)
44defframe(PARAM_DIVISOR,12)
45defframe(PARAM_SIZE,   8)
46defframe(PARAM_SRC,    4)
47
48dnl  re-using parameter space
49define(VAR_INVERSE,`PARAM_SIZE')
50
51	TEXT
52
53	ALIGN(16)
54PROLOGUE(mpn_modexact_1c_odd)
55deflit(`FRAME',0)
56
57	movl	PARAM_DIVISOR, %eax
58	movl	PARAM_CARRY, %edx
59
60	jmp	L(start_1c)
61
62EPILOGUE()
63
64	ALIGN(16)
65PROLOGUE(mpn_modexact_1_odd)
66deflit(`FRAME',0)
67
68	movl	PARAM_DIVISOR, %eax
69	xorl	%edx, %edx		C carry
70
71L(start_1c):
72
73ifdef(`PIC',`
74	call	L(here)		FRAME_pushl()
75L(here):
76
77	shrl	%eax			C d/2
78	movl	(%esp), %ecx		C eip
79
80	addl	$_GLOBAL_OFFSET_TABLE_+[.-L(here)], %ecx
81	movl	%ebx, (%esp)		C push ebx
82
83	andl	$127, %eax
84	movl	PARAM_SIZE, %ebx
85
86	movl	binvert_limb_table@GOT(%ecx), %ecx
87	subl	$2, %ebx
88
89	movb	(%eax,%ecx), %cl			C inv 8 bits
90	jc	L(one_limb)
91
92',`
93dnl non-PIC
94	shrl	%eax			C d/2
95	pushl	%ebx		FRAME_pushl()
96
97	movl	PARAM_SIZE, %ebx
98	andl	$127, %eax
99
100	subl	$2, %ebx
101	jc	L(one_limb)
102
103	movb	binvert_limb_table(%eax), %cl		C inv 8 bits
104')
105
106	movl	%ecx, %eax
107	addl	%ecx, %ecx		C 2*inv
108
109	imull	%eax, %eax		C inv*inv
110
111	imull	PARAM_DIVISOR, %eax	C inv*inv*d
112
113	subl	%eax, %ecx		C inv = 2*inv - inv*inv*d
114
115	movl	%ecx, %eax
116	addl	%ecx, %ecx		C 2*inv
117
118	imull	%eax, %eax		C inv*inv
119
120	imull	PARAM_DIVISOR, %eax	C inv*inv*d
121
122	subl	%eax, %ecx		C inv = 2*inv - inv*inv*d
123	pushl	%esi		FRAME_pushl()
124
125	ASSERT(e,`	C d*inv == 1 mod 2^GMP_LIMB_BITS
126	movl	%ecx, %eax
127	imull	PARAM_DIVISOR, %eax
128	cmpl	$1, %eax')
129
130	movl	PARAM_SRC, %esi
131	movl	%ecx, VAR_INVERSE
132
133	movl	(%esi), %eax		C src[0]
134	leal	4(%esi,%ebx,4), %esi	C &src[size-1]
135
136	xorl	$-1, %ebx		C -(size-1)
137	ASSERT(nz)
138	jmp	L(entry)
139
140
141C The use of VAR_INVERSE means only a store is needed for that value, rather
142C than a push and pop of say %edi.
143
144	ALIGN(16)
145L(top):
146	C eax	scratch, low product
147	C ebx	counter, limbs, negative
148	C ecx	carry bit
149	C edx	scratch, high product
150	C esi	&src[size-1]
151	C edi
152	C ebp
153
154	mull	PARAM_DIVISOR		C h:dummy = q*d
155
156	movl	(%esi,%ebx,4), %eax	C src[i]
157	subl	%ecx, %edx		C h -= -c
158
159L(entry):
160	subl	%edx, %eax		C s = src[i] - h
161
162	sbbl	%ecx, %ecx		C new -c (0 or -1)
163
164	imull	VAR_INVERSE, %eax	C q = s*i
165
166	incl	%ebx
167	jnz	L(top)
168
169
170	mull	PARAM_DIVISOR
171
172	movl	(%esi), %eax		C src high
173	subl	%ecx, %edx		C h -= -c
174
175	cmpl	PARAM_DIVISOR, %eax
176
177	jbe	L(skip_last)
178deflit(FRAME_LAST,FRAME)
179
180
181	subl	%edx, %eax		C s = src[i] - h
182	popl	%esi		FRAME_popl()
183
184	sbbl	%ecx, %ecx		C c (0 or -1)
185	popl	%ebx		FRAME_popl()
186
187	imull	VAR_INVERSE, %eax	C q = s*i
188
189	mull	PARAM_DIVISOR		C h:dummy = q*d
190
191	movl	%edx, %eax
192
193	subl	%ecx, %eax
194
195	ret
196
197
198C When high<divisor can skip last step.
199
200L(skip_last):
201deflit(`FRAME',FRAME_LAST)
202	C eax	src high
203	C ebx
204	C ecx
205	C edx	r
206	C esi
207
208	subl	%eax, %edx	C r-s
209	popl	%esi		FRAME_popl()
210
211	sbbl	%eax, %eax	C -1 if underflow
212	movl	PARAM_DIVISOR, %ebx
213
214	andl	%ebx, %eax	C divisor if underflow
215	popl	%ebx		FRAME_popl()
216
217	addl	%edx, %eax	C addback if underflow
218
219	ret
220
221
222C Special case for size==1 using a division for r = c-a mod d.
223C Could look for a-c<d and save a division sometimes, but that doesn't seem
224C worth bothering about.
225
226L(one_limb):
227deflit(`FRAME',4)
228	C eax
229	C ebx	size-2 (==-1)
230	C ecx
231	C edx	carry
232	C esi	src end
233	C edi
234	C ebp
235
236	movl	%edx, %eax
237	movl	PARAM_SRC, %edx
238
239	movl	PARAM_DIVISOR, %ecx
240	popl	%ebx		FRAME_popl()
241
242	subl	(%edx), %eax		C c-a
243
244	sbbl	%edx, %edx
245	decl	%ecx			C d-1
246
247	andl	%ecx, %edx		C b*d+c-a if c<a, or c-a if c>=a
248
249	divl	PARAM_DIVISOR
250
251	movl	%edx, %eax
252
253	ret
254
255EPILOGUE()
256