dive_1.asm revision 1.1.1.1
1dnl  x86 mpn_divexact_1 -- mpn by limb exact division.
2
3dnl  Copyright 2001, 2002, 2007 Free Software Foundation, Inc.
4dnl
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or
8dnl  modify it under the terms of the GNU Lesser General Public License as
9dnl  published by the Free Software Foundation; either version 3 of the
10dnl  License, or (at your option) any later version.
11dnl
12dnl  The GNU MP Library is distributed in the hope that it will be useful,
13dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
14dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15dnl  Lesser General Public License for more details.
16dnl
17dnl  You should have received a copy of the GNU Lesser General Public License
18dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
20include(`../config.m4')
21
22
23C     cycles/limb
24C P54    30.0
25C P55    29.0
26C P6     13.0 odd divisor, 12.0 even (strangely)
27C K6     14.0
28C K7     12.0
29C P4     42.0
30
31
32C mp_limb_t mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
33C                           mp_limb_t divisor);
34C
35
C Stack offsets of the four arguments, in the same order as the prototype:
C dst at 4, src at 8, size at 12, divisor at 16.
C NOTE(review): defframe is defined outside this file; presumably these are
C byte offsets from the stack pointer at entry, adjusted through FRAME as
C registers are pushed -- confirm in the x86 m4 definitions.
36defframe(PARAM_DIVISOR,16)
37defframe(PARAM_SIZE,   12)
38defframe(PARAM_SRC,    8)
39defframe(PARAM_DST,    4)
40
41dnl  re-use parameter space
C The inverse is kept in the PARAM_SRC slot.  The routine loads PARAM_SRC
C into esi before its first write to VAR_INVERSE, so the pointer is not
C lost.
42define(VAR_INVERSE,`PARAM_SRC')
43
C mpn_divexact_1 -- divide the size-limb number at src by divisor and store
C size quotient limbs at dst.  The file header calls this exact division,
C so the remainder is presumably required to be zero; nothing in this
C routine checks that.
C
C Method, as implemented below:
C   1. Trailing zero bits are shifted out of the divisor and counted in
C      ecx.  A zero divisor never leaves that loop, so divisor must be
C      nonzero.
C   2. An inverse of the odd divisor d is built: 8 bits come from
C      binvert_limb_table, then two rounds of inv = 2*inv - inv*inv*d are
C      applied.  The ASSERT expects d*inv == 1 mod 2^GMP_LIMB_BITS after
C      the second round.
C   3. Each source limb is shifted right by ecx bits, taking in low bits
C      of the next higher limb, has the pending carry subtracted, and is
C      multiplied by the inverse to give one quotient limb.  The high half
C      of quotient*d becomes the carry limb for the next step.
C
C src[0] is read unconditionally, so size >= 1 is assumed.
C
C Stack slots: the PARAM_DIVISOR slot is overwritten with the odd divisor,
C and VAR_INVERSE shares the PARAM_SRC slot, which is written only after
C PARAM_SRC has been loaded into esi.
C
C Registers: ebp, edi, ebx and esi are pushed on entry and popped again
C before ret.  eax, ecx and edx are overwritten.  At ret eax still holds
C the last quotient limb that was stored.
C NOTE(review): the prototype comment earlier in this file declares an
C mp_limb_t return, but no return value is deliberately set up here --
C confirm the intended return type against the declaration in gmp-impl.h.
44	TEXT
45
46	ALIGN(16)
47PROLOGUE(mpn_divexact_1)
48deflit(`FRAME',0)
49
	C NOTE(review): FRAME_pushl and FRAME_popl are defined outside this
	C file; presumably they adjust FRAME so the PARAM_ and VAR_ names
	C keep addressing the same stack slots after each push or pop.
50	movl	PARAM_DIVISOR, %eax
51	pushl	%ebp	FRAME_pushl()
52
53	movl	PARAM_SIZE, %ebp
54	pushl	%edi	FRAME_pushl()
55
56	pushl	%ebx	FRAME_pushl()
57	movl	$-1, %ecx		C shift count
58
59	pushl	%esi	FRAME_pushl()
60
	C Count the trailing zero bits of the divisor.  ecx starts at -1 and
	C is incremented once per pass; the loop ends when shrl shifts a 1
	C bit out into the carry flag.
61L(strip_twos):
62	incl	%ecx
63
64	shrl	%eax
65	jnc	L(strip_twos)
66
	C eax is now the odd divisor with its low 1 bit shifted out, so
	C 2*eax+1 rebuilds the odd divisor, and the low 7 bits of eax index
	C the inverse table.
67	leal	1(%eax,%eax), %ebx	C d without twos
68	andl	$127, %eax		C d/2, 7 bits
69
70ifdef(`PIC',`
71	LEA(	binvert_limb_table, %edx)
72	movzbl	(%eax,%edx), %eax		C inv 8 bits
73',`
74	movzbl	binvert_limb_table(%eax), %eax	C inv 8 bits
75')
76
	C First refinement round, interleaved with loading the pointers.
	C From here on the PARAM_DIVISOR slot holds the odd divisor.
77	leal	(%eax,%eax), %edx	C 2*inv
78	movl	%ebx, PARAM_DIVISOR	C d without twos
79
80	imull	%eax, %eax		C inv*inv
81
82	movl	PARAM_SRC, %esi
83	movl	PARAM_DST, %edi
84
85	imull	%ebx, %eax		C inv*inv*d
86
87	subl	%eax, %edx		C inv = 2*inv - inv*inv*d
	C Second refinement round, interleaved with the pointer setup.
88	leal	(%edx,%edx), %eax	C 2*inv
89
90	imull	%edx, %edx		C inv*inv
91
	C esi and edi are moved to just past the last limb and ebp becomes
	C -size, so that (%esi,%ebp,4) addresses src[0] and the negative
	C index counts up towards zero.
92	leal	(%esi,%ebp,4), %esi	C src end
93	leal	(%edi,%ebp,4), %edi	C dst end
94	negl	%ebp			C -size
95
96	imull	%ebx, %edx		C inv*inv*d
97
98	subl	%edx, %eax		C inv = 2*inv - inv*inv*d
99
100	ASSERT(e,`	C expect d*inv == 1 mod 2^GMP_LIMB_BITS
101	pushl	%eax	FRAME_pushl()
102	imull	PARAM_DIVISOR, %eax
103	cmpl	$1, %eax
104	popl	%eax	FRAME_popl()')
105
	C Save the inverse in the slot that used to hold src.
106	movl	%eax, VAR_INVERSE
107	movl	(%esi,%ebp,4), %eax	C src[0]
108
	C ebx is the carry bit and edx the carry limb; both start at zero,
	C which is also what the single limb path at L(one) relies on.
109	xorl	%ebx, %ebx
110	xorl	%edx, %edx
111
	C ebp becomes -(size-1).  Zero here means there is only one limb.
112	incl	%ebp
113	jz	L(one)
114
	C First limb of a multi-limb operand: shift in the low bits of
	C src[1].  There is no carry yet, so the subtract steps at L(top) are
	C skipped and the loop is entered at the multiply.
115	movl	(%esi,%ebp,4), %edx	C src[1]
116
117	shrdl(	%cl, %edx, %eax)
118
119	movl	VAR_INVERSE, %edx
120	jmp	L(entry)
121
122
123	ALIGN(8)
124	nop	C k6 code alignment
125	nop
126L(top):
127	C eax	q
128	C ebx	carry bit, 0 or -1
129	C ecx	shift
130	C edx	carry limb
131	C esi	src end
132	C edi	dst end
133	C ebp	counter, limbs, negative
134
	C -4(%esi,%ebp,4) is the limb being divided in this pass and
	C (%esi,%ebp,4) is the next higher limb, which supplies the bits
	C shifted in at the top.  Since ebx is 0 or -1, subtracting it adds
	C the previous borrow into the carry limb.
135	movl	-4(%esi,%ebp,4), %eax
136	subl	%ebx, %edx		C accumulate carry bit
137
138	movl	(%esi,%ebp,4), %ebx
139
140	shrdl(	%cl, %ebx, %eax)
141
142	subl	%edx, %eax		C apply carry limb
143	movl	VAR_INVERSE, %edx
144
	C Turn the borrow from the subl above into 0 or -1 in ebx.  The movl
	C in between leaves the flags alone.
145	sbbl	%ebx, %ebx
146
147L(entry):
	C eax times the inverse is the quotient limb.  It is stored, then
	C multiplied by the odd divisor; the high half of that product, left
	C in edx by mull, is the carry limb for the next pass.
148	imull	%edx, %eax
149
150	movl	%eax, -4(%edi,%ebp,4)
151	movl	PARAM_DIVISOR, %edx
152
153	mull	%edx
154
155	incl	%ebp
156	jnz	L(top)
157
158
	C Final limb.  There is no higher limb to shift in, so a plain shrl
	C is used instead of shrdl.
159	movl	-4(%esi), %eax		C src high limb
160L(one):
161	shrl	%cl, %eax
162	popl	%esi	FRAME_popl()
163
	C ebx is 0 or -1, so this addl subtracts the pending borrow.
164	addl	%ebx, %eax		C apply carry bit
165	popl	%ebx	FRAME_popl()
166
167	subl	%edx, %eax		C apply carry limb
168
169	imull	VAR_INVERSE, %eax
170
	C Store the most significant quotient limb, dst[size-1].
171	movl	%eax, -4(%edi)
172
173	popl	%edi
174	popl	%ebp
175
176	ret
177
178EPILOGUE()
179