dnl  divrem_1.asm revision 1.1.1.1
dnl  Intel Pentium-4 mpn_divrem_1 -- mpn by limb division.

dnl  Copyright 1999, 2000, 2001, 2002, 2003, 2004 Free Software Foundation,
dnl  Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
21include(`../config.m4')
22
23
24C P4: 32 cycles/limb integer part, 30 cycles/limb fraction part.
25
26
27C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
28C                         mp_srcptr src, mp_size_t size,
29C                         mp_limb_t divisor);
30C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
31C                          mp_srcptr src, mp_size_t size,
32C                          mp_limb_t divisor, mp_limb_t carry);
33C mp_limb_t mpn_preinv_divrem_1 (mp_ptr dst, mp_size_t xsize,
34C                                mp_srcptr src, mp_size_t size,
35C                                mp_limb_t divisor, mp_limb_t inverse,
36C                                unsigned shift);
37C
38C Algorithm:
39C
40C The method and nomenclature follow part 8 of "Division by Invariant
41C Integers using Multiplication" by Granlund and Montgomery, reference in
42C gmp.texi.
43C
44C "m" is written for what is m' in the paper, and "d" for d_norm, which
45C won't cause any confusion since it's only the normalized divisor that's of
46C any use in the code.  "b" is written for 2^N, the size of a limb, N being
47C 32 here.
48C
49C The step "sdword dr = n - 2^N*d + (2^N-1-q1) * d" is instead done as
50C "n-d - q1*d".  This rearrangement gives the same two-limb answer but lets
51C us have just a psubq on the dependent chain.
52C
53C For reference, the way the k7 code uses "n-(q1+1)*d" would not suit here,
54C detecting an overflow of q1+1 when q1=0xFFFFFFFF would cost too much.
55C
56C Notes:
57C
58C mpn_divrem_1 and mpn_preinv_divrem_1 avoid one division if the src high
59C limb is less than the divisor.  mpn_divrem_1c doesn't check for a zero
60C carry, since in normal circumstances that will be a very rare event.
61C
62C The test for skipping a division is branch free (once size>=1 is tested).
63C The store to the destination high limb is 0 when a divide is skipped, or
64C if it's not skipped then a copy of the src high limb is stored.  The
65C latter is in case src==dst.
66C
67C There's a small bias towards expecting xsize==0, by having code for
68C xsize==0 in a straight line and xsize!=0 under forward jumps.
69C
70C Enhancements:
71C
72C The loop measures 32 cycles, but the dependent chain would suggest it
73C could be done with 30.  Not sure where to start looking for the extras.
74C
75C Alternatives:
76C
77C If the divisor is normalized (high bit set) then a division step can
78C always be skipped, since the high destination limb is always 0 or 1 in
79C that case.  It doesn't seem worth checking for this though, since it
80C probably occurs infrequently.
81
82
83dnl  MUL_THRESHOLD is the value of xsize+size at which the multiply by
84dnl  inverse method is used, rather than plain "divl"s.  Minimum value 1.
85dnl
86dnl  The inverse takes about 80-90 cycles to calculate, but after that the
87dnl  multiply is 32 c/l versus division at about 58 c/l.
88dnl
89dnl  At 4 limbs the div is a touch faster than the mul (and of course
90dnl  simpler), so start the mul from 5 limbs.
91
92deflit(MUL_THRESHOLD, 5)
93
94
95defframe(PARAM_PREINV_SHIFT,   28)  dnl mpn_preinv_divrem_1
96defframe(PARAM_PREINV_INVERSE, 24)  dnl mpn_preinv_divrem_1
97defframe(PARAM_CARRY,  24)          dnl mpn_divrem_1c
98defframe(PARAM_DIVISOR,20)
99defframe(PARAM_SIZE,   16)
100defframe(PARAM_SRC,    12)
101defframe(PARAM_XSIZE,  8)
102defframe(PARAM_DST,    4)
103
104dnl  re-use parameter space
105define(SAVE_ESI,`PARAM_SIZE')
106define(SAVE_EBP,`PARAM_SRC')
107define(SAVE_EDI,`PARAM_DIVISOR')
108define(SAVE_EBX,`PARAM_DST')
109
110	TEXT
111
112	ALIGN(16)
113PROLOGUE(mpn_preinv_divrem_1)
114deflit(`FRAME',0)
115
116	movl	PARAM_SIZE, %ecx
117	xorl	%edx, %edx		C carry if can't skip a div
118
119	movl	%esi, SAVE_ESI
120	movl	PARAM_SRC, %esi
121
122	movl	%ebp, SAVE_EBP
123	movl	PARAM_DIVISOR, %ebp
124
125	movl	%edi, SAVE_EDI
126	movl	PARAM_DST, %edi
127
128	movl	-4(%esi,%ecx,4), %eax	C src high limb
129
130	movl	%ebx, SAVE_EBX
131	movl	PARAM_XSIZE, %ebx
132
133	movd	PARAM_PREINV_INVERSE, %mm4
134
135	movd	PARAM_PREINV_SHIFT, %mm7  C l
136	cmpl	%ebp, %eax		C high cmp divisor
137
138	cmovc(	%eax, %edx)		C high is carry if high<divisor
139	movd	%edx, %mm0		C carry
140
141	movd	%edx, %mm1		C carry
142	movl	$0, %edx
143
144	movd	%ebp, %mm5		C d
145	cmovnc(	%eax, %edx)		C 0 if skip div, src high if not
146					C (the latter in case src==dst)
147	leal	-4(%edi,%ebx,4), %edi	C &dst[xsize-1]
148
149	movl	%edx, (%edi,%ecx,4)	C dst high limb
150	sbbl	$0, %ecx		C skip one division if high<divisor
151	movl	$32, %eax
152
153	subl	PARAM_PREINV_SHIFT, %eax
154	psllq	%mm7, %mm5		C d normalized
155	leal	(%edi,%ecx,4), %edi	C &dst[xsize+size-1]
156	leal	-4(%esi,%ecx,4), %esi	C &src[size-1]
157
158	movd	%eax, %mm6		C 32-l
159	jmp	L(start_preinv)
160
161EPILOGUE()
162
163
164	ALIGN(16)
165PROLOGUE(mpn_divrem_1c)
166deflit(`FRAME',0)
167
168	movl	PARAM_CARRY, %edx
169
170	movl	PARAM_SIZE, %ecx
171
172	movl	%esi, SAVE_ESI
173	movl	PARAM_SRC, %esi
174
175	movl	%ebp, SAVE_EBP
176	movl	PARAM_DIVISOR, %ebp
177
178	movl	%edi, SAVE_EDI
179	movl	PARAM_DST, %edi
180
181	movl	%ebx, SAVE_EBX
182	movl	PARAM_XSIZE, %ebx
183
184	leal	-4(%edi,%ebx,4), %edi	C &dst[xsize-1]
185	jmp	L(start_1c)
186
187EPILOGUE()
188
189
190	ALIGN(16)
191PROLOGUE(mpn_divrem_1)
192deflit(`FRAME',0)
193
194	movl	PARAM_SIZE, %ecx
195	xorl	%edx, %edx		C initial carry (if can't skip a div)
196
197	movl	%esi, SAVE_ESI
198	movl	PARAM_SRC, %esi
199
200	movl	%ebp, SAVE_EBP
201	movl	PARAM_DIVISOR, %ebp
202
203	movl	%edi, SAVE_EDI
204	movl	PARAM_DST, %edi
205
206	movl	%ebx, SAVE_EBX
207	movl	PARAM_XSIZE, %ebx
208	leal	-4(%edi,%ebx,4), %edi	C &dst[xsize-1]
209
210	orl	%ecx, %ecx		C size
211	jz	L(no_skip_div)		C if size==0
212	movl	-4(%esi,%ecx,4), %eax	C src high limb
213
214	cmpl	%ebp, %eax		C high cmp divisor
215
216	cmovnc(	%eax, %edx)		C 0 if skip div, src high if not
217	movl	%edx, (%edi,%ecx,4)	C dst high limb
218
219	movl	$0, %edx
220	cmovc(	%eax, %edx)		C high is carry if high<divisor
221
222	sbbl	$0, %ecx		C size-1 if high<divisor
223L(no_skip_div):
224
225
226L(start_1c):
227	C eax
228	C ebx	xsize
229	C ecx	size
230	C edx	carry
231	C esi	src
232	C edi	&dst[xsize-1]
233	C ebp	divisor
234
235	leal	(%ebx,%ecx), %eax	C size+xsize
236	leal	-4(%esi,%ecx,4), %esi	C &src[size-1]
237	leal	(%edi,%ecx,4), %edi	C &dst[size+xsize-1]
238
239	cmpl	$MUL_THRESHOLD, %eax
240	jae	L(mul_by_inverse)
241
242
243	orl	%ecx, %ecx
244	jz	L(divide_no_integer)	C if size==0
245
246L(divide_integer):
247	C eax	scratch (quotient)
248	C ebx	xsize
249	C ecx	counter
250	C edx	carry
251	C esi	src, decrementing
252	C edi	dst, decrementing
253	C ebp	divisor
254
255	movl	(%esi), %eax
256	subl	$4, %esi
257
258	divl	%ebp
259
260	movl	%eax, (%edi)
261	subl	$4, %edi
262
263	subl	$1, %ecx
264	jnz	L(divide_integer)
265
266
267L(divide_no_integer):
268	orl	%ebx, %ebx
269	jnz	L(divide_fraction)	C if xsize!=0
270
271L(divide_done):
272	movl	SAVE_ESI, %esi
273	movl	SAVE_EDI, %edi
274	movl	SAVE_EBX, %ebx
275	movl	SAVE_EBP, %ebp
276	movl	%edx, %eax
277	ret
278
279
280L(divide_fraction):
281	C eax	scratch (quotient)
282	C ebx	counter
283	C ecx
284	C edx	carry
285	C esi
286	C edi	dst, decrementing
287	C ebp	divisor
288
289	movl	$0, %eax
290
291	divl	%ebp
292
293	movl	%eax, (%edi)
294	subl	$4, %edi
295
296	subl	$1, %ebx
297	jnz	L(divide_fraction)
298
299	jmp	L(divide_done)
300
301
302
303C -----------------------------------------------------------------------------
304
305L(mul_by_inverse):
306	C eax
307	C ebx	xsize
308	C ecx	size
309	C edx	carry
310	C esi	&src[size-1]
311	C edi	&dst[size+xsize-1]
312	C ebp	divisor
313
314	bsrl	%ebp, %eax		C 31-l
315	movd	%edx, %mm0		C carry
316	movd	%edx, %mm1		C carry
317	movl	%ecx, %edx		C size
318	movl	$31, %ecx
319
320	C
321
322	xorl	%eax, %ecx		C l = leading zeros on d
323	addl	$1, %eax
324
325	shll	%cl, %ebp		C d normalized
326	movd	%ecx, %mm7		C l
327	movl	%edx, %ecx		C size
328
329	movd	%eax, %mm6		C 32-l
330	movl	$-1, %edx
331	movl	$-1, %eax
332
333	C
334
335	subl	%ebp, %edx		C (b-d)-1 so  edx:eax = b*(b-d)-1
336
337	divl	%ebp			C floor (b*(b-d)-1 / d)
338	movd	%ebp, %mm5		C d
339
340	C
341
342	movd	%eax, %mm4		C m
343
344
345L(start_preinv):
346	C eax	inverse
347	C ebx	xsize
348	C ecx	size
349	C edx
350	C esi	&src[size-1]
351	C edi	&dst[size+xsize-1]
352	C ebp
353	C
354	C mm0	carry
355	C mm1	carry
356	C mm2
357	C mm4	m
358	C mm5	d
359	C mm6	31-l
360	C mm7	l
361
362	psllq	%mm7, %mm0		C n2 = carry << l, for size==0
363
364	subl	$1, %ecx
365	jb	L(integer_none)
366
367	movd	(%esi), %mm0		C src high limb
368	punpckldq %mm1, %mm0
369	psrlq	%mm6, %mm0		C n2 = high (carry:srchigh << l)
370	jz	L(integer_last)
371
372
373C The dependent chain here consists of
374C
375C	2   paddd    n1+n2
376C	8   pmuludq  m*(n1+n2)
377C	2   paddq    n2:nadj + m*(n1+n2)
378C	2   psrlq    q1
379C	8   pmuludq  d*q1
380C	2   psubq    (n-d)-q1*d
381C	2   psrlq    high n-(q1+1)*d mask
382C	2   pand     d masked
383C	2   paddd    n2+d addback
384C	--
385C	30
386C
387C But it seems to run at 32 cycles, so presumably there's something else
388C going on.
389
390	ALIGN(16)
391L(integer_top):
392	C eax
393	C ebx
394	C ecx	counter, size-1 to 0
395	C edx
396	C esi	src, decrementing
397	C edi	dst, decrementing
398	C
399	C mm0	n2
400	C mm4	m
401	C mm5	d
402	C mm6	32-l
403	C mm7	l
404
405	ASSERT(b,`C n2<d
406	 movd	%mm0, %eax
407	 movd	%mm5, %edx
408	 cmpl	%edx, %eax')
409
410	movd	-4(%esi), %mm1		C next src limbs
411	movd	(%esi), %mm2
412	leal	-4(%esi), %esi
413
414	punpckldq %mm2, %mm1
415	psrlq	%mm6, %mm1		C n10
416
417	movq	%mm1, %mm2		C n10
418	movq	%mm1, %mm3		C n10
419	psrad	$31, %mm1		C -n1
420	pand	%mm5, %mm1		C -n1 & d
421	paddd	%mm2, %mm1		C nadj = n10+(-n1&d), ignore overflow
422
423	psrld	$31, %mm2		C n1
424	paddd	%mm0, %mm2		C n2+n1
425	punpckldq %mm0, %mm1		C n2:nadj
426
427	pmuludq	%mm4, %mm2		C m*(n2+n1)
428
429	C
430
431	paddq	%mm2, %mm1		C n2:nadj + m*(n2+n1)
432	pxor	%mm2, %mm2		C break dependency, saves 4 cycles
433	pcmpeqd	%mm2, %mm2		C FF...FF
434	psrlq	$63, %mm2		C 1
435
436	psrlq	$32, %mm1		C q1 = high(n2:nadj + m*(n2+n1))
437
438	paddd	%mm1, %mm2		C q1+1
439	pmuludq	%mm5, %mm1		C q1*d
440
441	punpckldq %mm0, %mm3		C n = n2:n10
442	pxor	%mm0, %mm0
443
444	psubq	%mm5, %mm3		C n - d
445
446	C
447
448	psubq	%mm1, %mm3		C n - (q1+1)*d
449
450	por	%mm3, %mm0		C copy remainder -> new n2
451	psrlq	$32, %mm3		C high n - (q1+1)*d, 0 or -1
452
453	ASSERT(be,`C 0 or -1
454	 movd	%mm3, %eax
455	 addl	$1, %eax
456	 cmpl	$1, %eax')
457
458	paddd	%mm3, %mm2		C q
459	pand	%mm5, %mm3		C mask & d
460
461	paddd	%mm3, %mm0		C addback if necessary
462	movd	%mm2, (%edi)
463	leal	-4(%edi), %edi
464
465	subl	$1, %ecx
466	ja	L(integer_top)
467
468
469L(integer_last):
470	C eax
471	C ebx	xsize
472	C ecx
473	C edx
474	C esi	&src[0]
475	C edi	&dst[xsize]
476	C
477	C mm0	n2
478	C mm4	m
479	C mm5	d
480	C mm6
481	C mm7	l
482
483	ASSERT(b,`C n2<d
484	 movd	%mm0, %eax
485	 movd	%mm5, %edx
486	 cmpl	%edx, %eax')
487
488	movd	(%esi), %mm1		C src[0]
489	psllq	%mm7, %mm1		C n10
490
491	movq	%mm1, %mm2		C n10
492	movq	%mm1, %mm3		C n10
493	psrad	$31, %mm1		C -n1
494	pand	%mm5, %mm1		C -n1 & d
495	paddd	%mm2, %mm1		C nadj = n10+(-n1&d), ignore overflow
496
497	psrld	$31, %mm2		C n1
498	paddd	%mm0, %mm2		C n2+n1
499	punpckldq %mm0, %mm1		C n2:nadj
500
501	pmuludq	%mm4, %mm2		C m*(n2+n1)
502
503	C
504
505	paddq	%mm2, %mm1		C n2:nadj + m*(n2+n1)
506	pcmpeqd	%mm2, %mm2		C FF...FF
507	psrlq	$63, %mm2		C 1
508
509	psrlq	$32, %mm1		C q1 = high(n2:nadj + m*(n2+n1))
510	paddd	%mm1, %mm2		C q1
511
512	pmuludq	%mm5, %mm1		C q1*d
513	punpckldq %mm0, %mm3		C n
514	psubq	%mm5, %mm3		C n - d
515	pxor	%mm0, %mm0
516
517	C
518
519	psubq	%mm1, %mm3		C n - (q1+1)*d
520
521	por	%mm3, %mm0		C remainder -> n2
522	psrlq	$32, %mm3		C high n - (q1+1)*d, 0 or -1
523
524	ASSERT(be,`C 0 or -1
525	 movd	%mm3, %eax
526	 addl	$1, %eax
527	 cmpl	$1, %eax')
528
529	paddd	%mm3, %mm2		C q
530	pand	%mm5, %mm3		C mask & d
531
532	paddd	%mm3, %mm0		C addback if necessary
533	movd	%mm2, (%edi)
534	leal	-4(%edi), %edi
535
536
537L(integer_none):
538	C eax
539	C ebx	xsize
540
541	orl	%ebx, %ebx
542	jnz	L(fraction_some)	C if xsize!=0
543
544
545L(fraction_done):
546	movl	SAVE_EBP, %ebp
547	psrld	%mm7, %mm0		C remainder
548
549	movl	SAVE_EDI, %edi
550	movd	%mm0, %eax
551
552	movl	SAVE_ESI, %esi
553	movl	SAVE_EBX, %ebx
554	emms
555	ret
556
557
558
559C -----------------------------------------------------------------------------
560C
561
562L(fraction_some):
563	C eax
564	C ebx	xsize
565	C ecx
566	C edx
567	C esi
568	C edi	&dst[xsize-1]
569	C ebp
570
571
572L(fraction_top):
573	C eax
574	C ebx	counter, xsize iterations
575	C ecx
576	C edx
577	C esi	src, decrementing
578	C edi	dst, decrementing
579	C
580	C mm0	n2
581	C mm4	m
582	C mm5	d
583	C mm6	32-l
584	C mm7	l
585
586	ASSERT(b,`C n2<d
587	 movd	%mm0, %eax
588	 movd	%mm5, %edx
589	 cmpl	%edx, %eax')
590
591	movq	%mm0, %mm1		C n2
592	pmuludq	%mm4, %mm0		C m*n2
593
594	pcmpeqd	%mm2, %mm2
595	psrlq	$63, %mm2
596
597	C
598
599	psrlq	$32, %mm0		C high(m*n2)
600
601	paddd	%mm1, %mm0		C q1 = high(n2:0 + m*n2)
602
603	paddd	%mm0, %mm2		C q1+1
604	pmuludq	%mm5, %mm0		C q1*d
605
606	psllq	$32, %mm1		C n = n2:0
607	psubq	%mm5, %mm1		C n - d
608
609	C
610
611	psubq	%mm0, %mm1		C r = n - (q1+1)*d
612	pxor	%mm0, %mm0
613
614	por	%mm1, %mm0		C r -> n2
615	psrlq	$32, %mm1		C high n - (q1+1)*d, 0 or -1
616
617	ASSERT(be,`C 0 or -1
618	 movd	%mm1, %eax
619	 addl	$1, %eax
620	 cmpl	$1, %eax')
621
622	paddd	%mm1, %mm2		C q
623	pand	%mm5, %mm1		C mask & d
624
625	paddd	%mm1, %mm0		C addback if necessary
626	movd	%mm2, (%edi)
627	leal	-4(%edi), %edi
628
629	subl	$1, %ebx
630	jne	L(fraction_top)
631
632
633	jmp	L(fraction_done)
634
635EPILOGUE()
636