1dnl  x86 mpn_divrem_1 -- mpn by limb division extending to fractional quotient.
2
3dnl  Copyright 1999-2003, 2007 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33
34C       cycles/limb
35C 486   approx 43 maybe
36C P5        44
37C P6        39
38C P6MMX     39
39C K6        22
40C K7        42
41C P4        58
42
43
44C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
45C                         mp_srcptr src, mp_size_t size, mp_limb_t divisor);
46C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
47C                          mp_srcptr src, mp_size_t size, mp_limb_t divisor,
48C                          mp_limb_t carry);
49C
50C Divide src,size by divisor and store the quotient in dst+xsize,size.
51C Extend the division to fractional quotient limbs in dst,xsize.  Return the
52C remainder.  Either or both xsize and size can be 0.
53C
54C mpn_divrem_1c takes a carry parameter which is an initial high limb,
55C effectively one extra limb at the top of src,size.  Must have
56C carry<divisor.
57C
58C
59C Essentially the code is the same as the division based part of
60C mpn/generic/divrem_1.c, but has the advantage that we get the desired divl
61C instruction even when gcc is not being used (when longlong.h only has the
62C rather slow generic C udiv_qrnnd().
63C
64C A test is done to see if the high limb is less than the divisor, and if so
65C one less div is done.  A div is between 20 and 40 cycles on the various
66C x86s, so assuming high<divisor about half the time, then this test saves
67C half that amount.  The branch misprediction penalty on each chip is less
68C than half a div.
69C
70C
71C Notes for P5:
72C
73C It might be thought that moving the load down to pair with the store would
74C save 1 cycle, but that doesn't seem to happen in practice, and in any case
75C would be a mere 2.2% saving, so it's hardly worth bothering about.
76C
77C A mul-by-inverse might be a possibility for P5, as done in
78C mpn/x86/pentium/mod_1.asm.  The number of auxiliary instructions required
79C is a hinderance, but there could be a 10-15% speedup available.
80C
81C
82C Notes for K6:
83C
84C K6 has its own version of this code, using loop and paying attention to
85C cache line boundary crossings.  The target 20 c/l can be had with the
86C decl+jnz of the present code by pairing up the load and store in the
87C loops.  But it's considered easier not to introduce complexity just for
88C that, but instead let k6 have its own code.
89C
90
91defframe(PARAM_CARRY,  24)
92defframe(PARAM_DIVISOR,20)
93defframe(PARAM_SIZE,   16)
94defframe(PARAM_SRC,    12)
95defframe(PARAM_XSIZE,  8)
96defframe(PARAM_DST,    4)
97
98	TEXT
99	ALIGN(16)
100
101PROLOGUE(mpn_divrem_1c)
102deflit(`FRAME',0)
103
104	movl	PARAM_SIZE, %ecx
105	pushl	%edi		FRAME_pushl()
106
107	movl	PARAM_SRC, %edi
108	pushl	%esi		FRAME_pushl()
109
110	movl	PARAM_DIVISOR, %esi
111	pushl	%ebx		FRAME_pushl()
112
113	movl	PARAM_DST, %ebx
114	pushl	%ebp		FRAME_pushl()
115
116	movl	PARAM_XSIZE, %ebp
117	orl	%ecx, %ecx
118
119	movl	PARAM_CARRY, %edx
120	jz	L(fraction)
121
122	leal	-4(%ebx,%ebp,4), %ebx	C dst one limb below integer part
123	jmp	L(integer_top)
124
125EPILOGUE()
126
127
128PROLOGUE(mpn_divrem_1)
129deflit(`FRAME',0)
130
131	movl	PARAM_SIZE, %ecx
132	pushl	%edi		FRAME_pushl()
133
134	movl	PARAM_SRC, %edi
135	pushl	%esi		FRAME_pushl()
136
137	movl	PARAM_DIVISOR, %esi
138	orl	%ecx,%ecx
139
140	jz	L(size_zero)
141	pushl	%ebx		FRAME_pushl()
142
143	movl	-4(%edi,%ecx,4), %eax	C src high limb
144	xorl	%edx, %edx
145
146	movl	PARAM_DST, %ebx
147	pushl	%ebp		FRAME_pushl()
148
149	movl	PARAM_XSIZE, %ebp
150	cmpl	%esi, %eax
151
152	leal	-4(%ebx,%ebp,4), %ebx	C dst one limb below integer part
153	jae	L(integer_entry)
154
155
156	C high<divisor, so high of dst is zero, and avoid one div
157
158	movl	%edx, (%ebx,%ecx,4)
159	decl	%ecx
160
161	movl	%eax, %edx
162	jz	L(fraction)
163
164
165L(integer_top):
166	C eax	scratch (quotient)
167	C ebx	dst+4*xsize-4
168	C ecx	counter
169	C edx	scratch (remainder)
170	C esi	divisor
171	C edi	src
172	C ebp	xsize
173
174	movl	-4(%edi,%ecx,4), %eax
175L(integer_entry):
176
177	divl	%esi
178
179	movl	%eax, (%ebx,%ecx,4)
180	decl	%ecx
181	jnz	L(integer_top)
182
183
184L(fraction):
185	orl	%ebp, %ecx
186	jz	L(done)
187
188	movl	PARAM_DST, %ebx
189
190
191L(fraction_top):
192	C eax	scratch (quotient)
193	C ebx	dst
194	C ecx	counter
195	C edx	scratch (remainder)
196	C esi	divisor
197	C edi
198	C ebp
199
200	xorl	%eax, %eax
201
202	divl	%esi
203
204	movl	%eax, -4(%ebx,%ecx,4)
205	decl	%ecx
206	jnz	L(fraction_top)
207
208
209L(done):
210	popl	%ebp
211	movl	%edx, %eax
212	popl	%ebx
213	popl	%esi
214	popl	%edi
215	ret
216
217
218L(size_zero):
219deflit(`FRAME',8)
220	movl	PARAM_XSIZE, %ecx
221	xorl	%eax, %eax
222
223	movl	PARAM_DST, %edi
224
225	cld	C better safe than sorry, see mpn/x86/README
226
227	rep
228	stosl
229
230	popl	%esi
231	popl	%edi
232	ret
233EPILOGUE()
234