bdiv_q_1.asm revision 1.1.1.1
1dnl  Intel P6 mpn_modexact_1_odd -- exact division style remainder.
2
3dnl  Copyright 2001, 2002, 2007, 2011 Free Software Foundation, Inc.
4dnl
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  Rearranged from mpn/x86/p6/dive_1.asm by Marco Bodrato.
8dnl
9dnl  The GNU MP Library is free software; you can redistribute it and/or
10dnl  modify it under the terms of the GNU Lesser General Public License as
11dnl  published by the Free Software Foundation; either version 3 of the
12dnl  License, or (at your option) any later version.
13dnl
14dnl  The GNU MP Library is distributed in the hope that it will be useful,
15dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
16dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17dnl  Lesser General Public License for more details.
18dnl
19dnl  You should have received a copy of the GNU Lesser General Public License
20dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
21
22include(`../config.m4')
23
24
25C       odd  even  divisor
26C P6:  10.0  12.0  cycles/limb
27
28C MULFUNC_PROLOGUE(mpn_bdiv_q_1 mpn_pi1_bdiv_q_1)
29
30C The odd case is basically the same as mpn_modexact_1_odd, just with an
31C extra store, and it runs at the same 10 cycles which is the dependent
32C chain.
33C
34C The shifts for the even case aren't on the dependent chain so in principle
35C it could run the same too, but nothing running at 10 has been found.
36C Perhaps there's too many uops (an extra 4 over the odd case).
37
38defframe(PARAM_SHIFT,  24)
39defframe(PARAM_INVERSE,20)
40defframe(PARAM_DIVISOR,16)
41defframe(PARAM_SIZE,   12)
42defframe(PARAM_SRC,     8)
43defframe(PARAM_DST,     4)
44
45defframe(SAVE_EBX,     -4)
46defframe(SAVE_ESI,     -8)
47defframe(SAVE_EDI,    -12)
48defframe(SAVE_EBP,    -16)
49deflit(STACK_SPACE, 16)
50
51dnl  re-use parameter space
52define(VAR_INVERSE,`PARAM_SRC')
53
54	TEXT
55
56C mp_limb_t
57C mpn_pi1_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor,
58C		    mp_limb_t inverse, int shift)
59
60	ALIGN(16)
61PROLOGUE(mpn_pi1_bdiv_q_1)
62deflit(`FRAME',0)
63
64	subl	$STACK_SPACE, %esp	FRAME_subl_esp(STACK_SPACE)
65
66	movl	%esi, SAVE_ESI
67	movl	PARAM_SRC, %esi
68
69	movl	%ebx, SAVE_EBX
70	movl	PARAM_SIZE, %ebx
71
72	movl	%ebp, SAVE_EBP
73	movl	PARAM_INVERSE, %ebp
74
75	movl	PARAM_SHIFT, %ecx	C trailing twos
76
77L(common):
78	movl	%edi, SAVE_EDI
79	movl	PARAM_DST, %edi
80
81	leal	(%esi,%ebx,4), %esi	C src end
82
83	leal	(%edi,%ebx,4), %edi	C dst end
84	negl	%ebx			C -size
85
86	movl	(%esi,%ebx,4), %eax	C src[0]
87
88	orl	%ecx, %ecx
89	jz	L(odd_entry)
90
91	movl	%edi, PARAM_DST
92	movl	%ebp, VAR_INVERSE
93
94L(even):
95	C eax	src[0]
96	C ebx	counter, limbs, negative
97	C ecx	shift
98	C edx
99	C esi
100	C edi
101	C ebp
102
103	xorl	%ebp, %ebp		C initial carry bit
104	xorl	%edx, %edx		C initial carry limb (for size==1)
105
106	incl	%ebx
107	jz	L(even_one)
108
109	movl	(%esi,%ebx,4), %edi	C src[1]
110
111	shrdl(	%cl, %edi, %eax)
112
113	jmp	L(even_entry)
114
115
116L(even_top):
117	C eax	scratch
118	C ebx	counter, limbs, negative
119	C ecx	shift
120	C edx	scratch
121	C esi	&src[size]
122	C edi	&dst[size] and scratch
123	C ebp	carry bit
124
125	movl	(%esi,%ebx,4), %edi
126
127	mull	PARAM_DIVISOR
128
129	movl	-4(%esi,%ebx,4), %eax
130	shrdl(	%cl, %edi, %eax)
131
132	subl	%ebp, %eax
133
134	sbbl	%ebp, %ebp
135	subl	%edx, %eax
136
137	sbbl	$0, %ebp
138
139L(even_entry):
140	imull	VAR_INVERSE, %eax
141
142	movl	PARAM_DST, %edi
143	negl	%ebp
144
145	movl	%eax, -4(%edi,%ebx,4)
146	incl	%ebx
147	jnz	L(even_top)
148
149	mull	PARAM_DIVISOR
150
151	movl	-4(%esi), %eax
152
153L(even_one):
154	shrl	%cl, %eax
155	movl	SAVE_ESI, %esi
156
157	subl	%ebp, %eax
158	movl	SAVE_EBP, %ebp
159
160	subl	%edx, %eax
161	movl	SAVE_EBX, %ebx
162
163	imull	VAR_INVERSE, %eax
164
165	movl	%eax, -4(%edi)
166	movl	SAVE_EDI, %edi
167	addl	$STACK_SPACE, %esp
168
169	ret
170
171C The dependent chain here is
172C
173C	subl	%edx, %eax       1
174C	imull	%ebp, %eax       4
175C	mull	PARAM_DIVISOR    5
176C			       ----
177C	total			10
178C
179C and this is the measured speed.  No special scheduling is necessary, out
180C of order execution hides the load latency.
181
182L(odd_top):
183	C eax	scratch (src limb)
184	C ebx	counter, limbs, negative
185	C ecx	carry bit
186	C edx	carry limb, high of last product
187	C esi	&src[size]
188	C edi	&dst[size]
189	C ebp	inverse
190
191	mull	PARAM_DIVISOR
192
193	movl	(%esi,%ebx,4), %eax
194	subl	%ecx, %eax
195
196	sbbl	%ecx, %ecx
197	subl	%edx, %eax
198
199	sbbl	$0, %ecx
200
201L(odd_entry):
202	imull	%ebp, %eax
203
204	movl	%eax, (%edi,%ebx,4)
205	negl	%ecx
206
207	incl	%ebx
208	jnz	L(odd_top)
209
210
211	movl	SAVE_ESI, %esi
212
213	movl	SAVE_EDI, %edi
214
215	movl	SAVE_EBP, %ebp
216
217	movl	SAVE_EBX, %ebx
218	addl	$STACK_SPACE, %esp
219
220	ret
221
222EPILOGUE()
223
224C mp_limb_t mpn_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
225C                           mp_limb_t divisor);
226C
227
228	ALIGN(16)
229PROLOGUE(mpn_bdiv_q_1)
230deflit(`FRAME',0)
231
232	movl	PARAM_DIVISOR, %eax
233	subl	$STACK_SPACE, %esp	FRAME_subl_esp(STACK_SPACE)
234
235	movl	%esi, SAVE_ESI
236	movl	PARAM_SRC, %esi
237
238	movl	%ebx, SAVE_EBX
239	movl	PARAM_SIZE, %ebx
240
241	bsfl	%eax, %ecx		C trailing twos
242
243	movl	%ebp, SAVE_EBP
244
245	shrl	%cl, %eax		C d without twos
246
247	movl	%eax, %edx
248	shrl	%eax			C d/2 without twos
249
250	movl	%edx, PARAM_DIVISOR
251	andl	$127, %eax
252
253ifdef(`PIC',`
254	LEA(	binvert_limb_table, %ebp)
255	movzbl	(%eax,%ebp), %ebp		C inv 8 bits
256',`
257	movzbl	binvert_limb_table(%eax), %ebp	C inv 8 bits
258')
259
260	leal	(%ebp,%ebp), %eax	C 2*inv
261
262	imull	%ebp, %ebp		C inv*inv
263	imull	%edx, %ebp	C inv*inv*d
264
265	subl	%ebp, %eax		C inv = 2*inv - inv*inv*d
266	leal	(%eax,%eax), %ebp	C 2*inv
267
268	imull	%eax, %eax		C inv*inv
269	imull	%edx, %eax	C inv*inv*d
270
271	subl	%eax, %ebp		C inv = 2*inv - inv*inv*d
272
273	jmp	L(common)
274
275EPILOGUE()
276