1dnl  x86 mpn_divrem_1 -- mpn by limb division extending to fractional quotient.
2
3dnl  Copyright 1999, 2000, 2001, 2002, 2003, 2007 Free Software Foundation,
4dnl  Inc.
5dnl
6dnl  This file is part of the GNU MP Library.
7dnl
8dnl  The GNU MP Library is free software; you can redistribute it and/or
9dnl  modify it under the terms of the GNU Lesser General Public License as
10dnl  published by the Free Software Foundation; either version 3 of the
11dnl  License, or (at your option) any later version.
12dnl
13dnl  The GNU MP Library is distributed in the hope that it will be useful,
14dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
15dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16dnl  Lesser General Public License for more details.
17dnl
18dnl  You should have received a copy of the GNU Lesser General Public License
19dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
20
21include(`../config.m4')
22
23
24C       cycles/limb
25C 486   approx 43 maybe
26C P5        44
27C P6        39
28C P6MMX     39
29C K6        22
30C K7        42
31C P4        58
32
33
34C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
35C                         mp_srcptr src, mp_size_t size, mp_limb_t divisor);
36C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
37C                          mp_srcptr src, mp_size_t size, mp_limb_t divisor,
38C                          mp_limb_t carry);
39C
40C Divide src,size by divisor and store the quotient in dst+xsize,size.
41C Extend the division to fractional quotient limbs in dst,xsize.  Return the
42C remainder.  Either or both xsize and size can be 0.
43C
44C mpn_divrem_1c takes a carry parameter which is an initial high limb,
45C effectively one extra limb at the top of src,size.  Must have
46C carry<divisor.
47C
48C
49C Essentially the code is the same as the division based part of
50C mpn/generic/divrem_1.c, but has the advantage that we get the desired divl
51C instruction even when gcc is not being used (when longlong.h only has the
52C rather slow generic C udiv_qrnnd().
53C
54C A test is done to see if the high limb is less than the divisor, and if so
55C one less div is done.  A div is between 20 and 40 cycles on the various
56C x86s, so assuming high<divisor about half the time, then this test saves
57C half that amount.  The branch misprediction penalty on each chip is less
58C than half a div.
59C
60C
61C Notes for P5:
62C
63C It might be thought that moving the load down to pair with the store would
64C save 1 cycle, but that doesn't seem to happen in practice, and in any case
65C would be a mere 2.2% saving, so it's hardly worth bothering about.
66C
67C A mul-by-inverse might be a possibility for P5, as done in
68C mpn/x86/pentium/mod_1.asm.  The number of auxiliary instructions required
69C is a hinderance, but there could be a 10-15% speedup available.
70C
71C
72C Notes for K6:
73C
74C K6 has its own version of this code, using loop and paying attention to
75C cache line boundary crossings.  The target 20 c/l can be had with the
76C decl+jnz of the present code by pairing up the load and store in the
77C loops.  But it's considered easier not to introduce complexity just for
78C that, but instead let k6 have its own code.
79C
80
81defframe(PARAM_CARRY,  24)
82defframe(PARAM_DIVISOR,20)
83defframe(PARAM_SIZE,   16)
84defframe(PARAM_SRC,    12)
85defframe(PARAM_XSIZE,  8)
86defframe(PARAM_DST,    4)
87
88	TEXT
89	ALIGN(16)
90
91PROLOGUE(mpn_divrem_1c)
92deflit(`FRAME',0)
93
94	movl	PARAM_SIZE, %ecx
95	pushl	%edi		FRAME_pushl()
96
97	movl	PARAM_SRC, %edi
98	pushl	%esi		FRAME_pushl()
99
100	movl	PARAM_DIVISOR, %esi
101	pushl	%ebx		FRAME_pushl()
102
103	movl	PARAM_DST, %ebx
104	pushl	%ebp		FRAME_pushl()
105
106	movl	PARAM_XSIZE, %ebp
107	orl	%ecx, %ecx
108
109	movl	PARAM_CARRY, %edx
110	jz	L(fraction)
111
112	leal	-4(%ebx,%ebp,4), %ebx	C dst one limb below integer part
113	jmp	L(integer_top)
114
115EPILOGUE()
116
117
118PROLOGUE(mpn_divrem_1)
119deflit(`FRAME',0)
120
121	movl	PARAM_SIZE, %ecx
122	pushl	%edi		FRAME_pushl()
123
124	movl	PARAM_SRC, %edi
125	pushl	%esi		FRAME_pushl()
126
127	movl	PARAM_DIVISOR, %esi
128	orl	%ecx,%ecx
129
130	jz	L(size_zero)
131	pushl	%ebx		FRAME_pushl()
132
133	movl	-4(%edi,%ecx,4), %eax	C src high limb
134	xorl	%edx, %edx
135
136	movl	PARAM_DST, %ebx
137	pushl	%ebp		FRAME_pushl()
138
139	movl	PARAM_XSIZE, %ebp
140	cmpl	%esi, %eax
141
142	leal	-4(%ebx,%ebp,4), %ebx	C dst one limb below integer part
143	jae	L(integer_entry)
144
145
146	C high<divisor, so high of dst is zero, and avoid one div
147
148	movl	%edx, (%ebx,%ecx,4)
149	decl	%ecx
150
151	movl	%eax, %edx
152	jz	L(fraction)
153
154
155L(integer_top):
156	C eax	scratch (quotient)
157	C ebx	dst+4*xsize-4
158	C ecx	counter
159	C edx	scratch (remainder)
160	C esi	divisor
161	C edi	src
162	C ebp	xsize
163
164	movl	-4(%edi,%ecx,4), %eax
165L(integer_entry):
166
167	divl	%esi
168
169	movl	%eax, (%ebx,%ecx,4)
170	decl	%ecx
171	jnz	L(integer_top)
172
173
174L(fraction):
175	orl	%ebp, %ecx
176	jz	L(done)
177
178	movl	PARAM_DST, %ebx
179
180
181L(fraction_top):
182	C eax	scratch (quotient)
183	C ebx	dst
184	C ecx	counter
185	C edx	scratch (remainder)
186	C esi	divisor
187	C edi
188	C ebp
189
190	xorl	%eax, %eax
191
192	divl	%esi
193
194	movl	%eax, -4(%ebx,%ecx,4)
195	decl	%ecx
196	jnz	L(fraction_top)
197
198
199L(done):
200	popl	%ebp
201	movl	%edx, %eax
202	popl	%ebx
203	popl	%esi
204	popl	%edi
205	ret
206
207
208L(size_zero):
209deflit(`FRAME',8)
210	movl	PARAM_XSIZE, %ecx
211	xorl	%eax, %eax
212
213	movl	PARAM_DST, %edi
214
215	cld	C better safe than sorry, see mpn/x86/README
216
217	rep
218	stosl
219
220	popl	%esi
221	popl	%edi
222	ret
223EPILOGUE()
224