dive_1.asm revision 1.1.1.1
1dnl  Intel P6 mpn_modexact_1_odd -- exact division style remainder.
2
3dnl  Copyright 2001, 2002, 2007 Free Software Foundation, Inc.
4dnl
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or
8dnl  modify it under the terms of the GNU Lesser General Public License as
9dnl  published by the Free Software Foundation; either version 3 of the
10dnl  License, or (at your option) any later version.
11dnl
12dnl  The GNU MP Library is distributed in the hope that it will be useful,
13dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
14dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15dnl  Lesser General Public License for more details.
16dnl
17dnl  You should have received a copy of the GNU Lesser General Public License
18dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
20include(`../config.m4')
21
22
23C       odd  even  divisor
24C P6:  10.0  12.0  cycles/limb
25
26
27C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
28C                      mp_limb_t divisor);
29C
30C The odd case is basically the same as mpn_modexact_1_odd, just with an
31C extra store, and it runs at the same 10 cycles which is the dependent
32C chain.
33C
34C The shifts for the even case aren't on the dependent chain so in principle
35C it could run the same too, but nothing running at 10 has been found.
36C Perhaps there's too many uops (an extra 4 over the odd case).
37
38defframe(PARAM_DIVISOR,16)
39defframe(PARAM_SIZE,   12)
40defframe(PARAM_SRC,     8)
41defframe(PARAM_DST,     4)
42
43defframe(SAVE_EBX,     -4)
44defframe(SAVE_ESI,     -8)
45defframe(SAVE_EDI,    -12)
46defframe(SAVE_EBP,    -16)
47defframe(VAR_INVERSE, -20)
48deflit(STACK_SPACE, 20)
49
50	TEXT
51
52	ALIGN(16)
53PROLOGUE(mpn_divexact_1)
54deflit(`FRAME',0)
55
56	movl	PARAM_DIVISOR, %eax
57	subl	$STACK_SPACE, %esp	FRAME_subl_esp(STACK_SPACE)
58
59	movl	%esi, SAVE_ESI
60	movl	PARAM_SRC, %esi
61
62	movl	%ebx, SAVE_EBX
63	movl	PARAM_SIZE, %ebx
64
65	bsfl	%eax, %ecx		C trailing twos
66
67	movl	%ebp, SAVE_EBP
68
69	shrl	%cl, %eax		C d without twos
70
71	movl	%eax, %edx
72	shrl	%eax			C d/2 without twos
73
74	movl	%edx, PARAM_DIVISOR
75	andl	$127, %eax
76
77ifdef(`PIC',`
78	LEA(	binvert_limb_table, %ebp)
79	movzbl	(%eax,%ebp), %ebp		C inv 8 bits
80',`
81	movzbl	binvert_limb_table(%eax), %ebp	C inv 8 bits
82')
83
84	leal	(%ebp,%ebp), %eax	C 2*inv
85
86	imull	%ebp, %ebp		C inv*inv
87
88	movl	%edi, SAVE_EDI
89	movl	PARAM_DST, %edi
90
91	leal	(%esi,%ebx,4), %esi	C src end
92
93	imull	PARAM_DIVISOR, %ebp	C inv*inv*d
94
95	subl	%ebp, %eax		C inv = 2*inv - inv*inv*d
96	leal	(%eax,%eax), %ebp	C 2*inv
97
98	imull	%eax, %eax		C inv*inv
99
100	leal	(%edi,%ebx,4), %edi	C dst end
101	negl	%ebx			C -size
102
103	movl	%edi, PARAM_DST
104
105	imull	PARAM_DIVISOR, %eax	C inv*inv*d
106
107	subl	%eax, %ebp		C inv = 2*inv - inv*inv*d
108
109	ASSERT(e,`	C d*inv == 1 mod 2^GMP_LIMB_BITS
110	movl	PARAM_DIVISOR, %eax
111	imull	%ebp, %eax
112	cmpl	$1, %eax')
113
114	movl	%ebp, VAR_INVERSE
115	movl	(%esi,%ebx,4), %eax	C src[0]
116
117	orl	%ecx, %ecx
118	jnz	L(even)
119
120	C ecx initial carry is zero
121	jmp	L(odd_entry)
122
123
124C The dependent chain here is
125C
126C	subl	%edx, %eax       1
127C	imull	%ebp, %eax       4
128C	mull	PARAM_DIVISOR    5
129C			       ----
130C       total		        10
131C
132C and this is the measured speed.  No special scheduling is necessary, out
133C of order execution hides the load latency.
134
135L(odd_top):
136	C eax	scratch (src limb)
137	C ebx	counter, limbs, negative
138	C ecx	carry bit
139	C edx	carry limb, high of last product
140	C esi	&src[size]
141	C edi	&dst[size]
142	C ebp
143
144	mull	PARAM_DIVISOR
145
146	movl	(%esi,%ebx,4), %eax
147	subl	%ecx, %eax
148
149	sbbl	%ecx, %ecx
150	subl	%edx, %eax
151
152	sbbl	$0, %ecx
153
154L(odd_entry):
155	imull	VAR_INVERSE, %eax
156
157	movl	%eax, (%edi,%ebx,4)
158	negl	%ecx
159
160	incl	%ebx
161	jnz	L(odd_top)
162
163
164	movl	SAVE_ESI, %esi
165
166	movl	SAVE_EDI, %edi
167
168	movl	SAVE_EBP, %ebp
169
170	movl	SAVE_EBX, %ebx
171	addl	$STACK_SPACE, %esp
172
173	ret
174
175
176L(even):
177	C eax	src[0]
178	C ebx	counter, limbs, negative
179	C ecx	shift
180	C edx
181	C esi
182	C edi
183	C ebp
184
185	xorl	%ebp, %ebp		C initial carry bit
186	xorl	%edx, %edx		C initial carry limb (for size==1)
187
188	incl	%ebx
189	jz	L(even_one)
190
191	movl	(%esi,%ebx,4), %edi	C src[1]
192
193	shrdl(	%cl, %edi, %eax)
194
195	jmp	L(even_entry)
196
197
198L(even_top):
199	C eax	scratch
200	C ebx	counter, limbs, negative
201	C ecx	shift
202	C edx	scratch
203	C esi	&src[size]
204	C edi	&dst[size] and scratch
205	C ebp	carry bit
206
207	movl	(%esi,%ebx,4), %edi
208
209	mull	PARAM_DIVISOR
210
211	movl	-4(%esi,%ebx,4), %eax
212	shrdl(	%cl, %edi, %eax)
213
214	subl	%ebp, %eax
215
216	sbbl	%ebp, %ebp
217	subl	%edx, %eax
218
219	sbbl	$0, %ebp
220
221L(even_entry):
222	imull	VAR_INVERSE, %eax
223
224	movl	PARAM_DST, %edi
225	negl	%ebp
226
227	movl	%eax, -4(%edi,%ebx,4)
228	incl	%ebx
229	jnz	L(even_top)
230
231
232
233	mull	PARAM_DIVISOR
234
235	movl	-4(%esi), %eax
236
237L(even_one):
238	shrl	%cl, %eax
239	movl	SAVE_ESI, %esi
240
241	subl	%ebp, %eax
242	movl	SAVE_EBP, %ebp
243
244	subl	%edx, %eax
245	movl	SAVE_EBX, %ebx
246
247	imull	VAR_INVERSE, %eax
248
249	movl	%eax, -4(%edi)
250	movl	SAVE_EDI, %edi
251	addl	$STACK_SPACE, %esp
252
253	ret
254
255EPILOGUE()
256