dnl  AMD K7 mpn_divexact_1 -- mpn by limb exact division.

dnl  Copyright 2001, 2002, 2004, 2007 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C          cycles/limb
C Athlon:     11.0
C Hammer:      9.0


C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                      mp_limb_t divisor);
C
C The dependent chain is mul+imul+sub for 11 cycles, and that speed is
C achieved with no special effort.  The load and shrdl latencies are hidden
C by out-of-order execution.
C
C On size==1 it's a touch faster to use the mul-by-inverse than divl.
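C
C In outline this is the usual exact division by a multiplicative inverse:
C each quotient limb is the current limb (minus the borrow so far) times the
C inverse of the odd part of the divisor mod 2^32, and the borrow passed to
C the next limb is the high half of q*d plus a borrow bit.  A rough C sketch
C for illustration only -- the names here are not from this file or from GMP:
C
C	#include <stdint.h>
C	/* assumes d odd, inverse*d == 1 mod 2^32, size >= 1 */
C	void divexact_odd (uint32_t *dst, const uint32_t *src, long size,
C	                   uint32_t d, uint32_t inverse)
C	{
C	  uint32_t c = 0;                        /* borrow to propagate     */
C	  for (long i = 0; i < size; i++)
C	    {
C	      uint32_t s = src[i];               /* (shifted if d had twos) */
C	      uint32_t l = s - c;                /* apply borrow            */
C	      uint32_t b = s < c;                /* borrow bit              */
C	      uint32_t q = l * inverse;          /* quotient limb, mod 2^32 */
C	      dst[i] = q;
C	      uint32_t h = (uint32_t) (((uint64_t) q * d) >> 32); /* mull high */
C	      c = h + b;                         /* h <= d-1, so no overflow */
C	    }
C	}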

defframe(PARAM_DIVISOR,16)
defframe(PARAM_SIZE,   12)
defframe(PARAM_SRC,    8)
defframe(PARAM_DST,    4)

defframe(SAVE_EBX,     -4)
defframe(SAVE_ESI,     -8)
defframe(SAVE_EDI,    -12)
defframe(SAVE_EBP,    -16)
defframe(VAR_INVERSE, -20)
defframe(VAR_DST_END, -24)

deflit(STACK_SPACE, 24)

	TEXT

	ALIGN(16)
PROLOGUE(mpn_divexact_1)
deflit(`FRAME',0)

	movl	PARAM_DIVISOR, %eax
	subl	$STACK_SPACE, %esp	deflit(`FRAME',STACK_SPACE)
	movl	$-1, %ecx		C shift count

	movl	%ebp, SAVE_EBP
	movl	PARAM_SIZE, %ebp

	movl	%esi, SAVE_ESI
	movl	%edi, SAVE_EDI

	C If there's usually only one or two trailing zero bits then this
	C should be faster than bsfl.
L(strip_twos):
	incl	%ecx
	shrl	%eax
	jnc	L(strip_twos)
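
	C Here ecx = number of trailing zero bits in the divisor and
	C eax = d >> (ecx+1), the final shrl having dropped the low 1 bit;
	C the leal below rebuilds the odd part as 2*eax+1.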

	movl	%ebx, SAVE_EBX
	leal	1(%eax,%eax), %ebx	C d without twos
	andl	$127, %eax		C d/2, 7 bits

ifdef(`PIC',`
	LEA(	binvert_limb_table, %edx)
	movzbl	(%eax,%edx), %eax		C inv 8 bits
',`
	movzbl	binvert_limb_table(%eax), %eax	C inv 8 bits
')
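
	C binvert_limb_table[] holds the inverses mod 2^8 of the odd numbers
	C 1,3,5,...,255, indexed by (odd_d >> 1) & 0x7F, so at this point
	C eax*d == 1 mod 2^8.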

	leal	(%eax,%eax), %edx	C 2*inv
	movl	%ebx, PARAM_DIVISOR	C d without twos

	imull	%eax, %eax		C inv*inv

	movl	PARAM_SRC, %esi
	movl	PARAM_DST, %edi

	imull	%ebx, %eax		C inv*inv*d

	subl	%eax, %edx		C inv = 2*inv - inv*inv*d
	leal	(%edx,%edx), %eax	C 2*inv

	imull	%edx, %edx		C inv*inv

	leal	(%esi,%ebp,4), %esi	C src end
	leal	(%edi,%ebp,4), %edi	C dst end
	negl	%ebp			C -size

	imull	%ebx, %edx		C inv*inv*d

	subl	%edx, %eax		C inv = 2*inv - inv*inv*d
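
	C The two lea/imul/sub rounds above are Newton iterations for the
	C inverse mod 2^32: if d*inv == 1 + k*2^n then
	C     d*(2*inv - inv*inv*d) = 2*(1 + k*2^n) - (1 + k*2^n)^2
	C                           = 1 - k^2*2^(2n)
	C so each round doubles the number of correct low bits, taking the
	C 8 bits from the table to 16 and then to the full 32.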

	ASSERT(e,`	C expect d*inv == 1 mod 2^GMP_LIMB_BITS
	pushl	%eax	FRAME_pushl()
	imull	PARAM_DIVISOR, %eax
	cmpl	$1, %eax
	popl	%eax	FRAME_popl()')

	movl	%eax, VAR_INVERSE
	movl	(%esi,%ebp,4), %eax	C src[0]

	incl	%ebp
	jz	L(one)

	movl	(%esi,%ebp,4), %edx	C src[1]

	shrdl(	%cl, %edx, %eax)

	movl	%edi, VAR_DST_END
	xorl	%ebx, %ebx
	jmp	L(entry)
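
	C The loop below divides by the odd part of the divisor and at the
	C same time shifts the source right by the stripped twos: the shrdl
	C forms each limb of src>>cl on the fly from a pair of source limbs.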

	ALIGN(8)
L(top):
	C eax	q
	C ebx	carry bit, 0 or 1
	C ecx	shift
	C edx
	C esi	src end
	C edi	dst end
	C ebp	counter, limbs, negative

	mull	PARAM_DIVISOR		C carry limb in edx

	movl	-4(%esi,%ebp,4), %eax
	movl	(%esi,%ebp,4), %edi

	shrdl(	%cl, %edi, %eax)

	subl	%ebx, %eax		C apply carry bit
	setc	%bl
	movl	VAR_DST_END, %edi

	subl	%edx, %eax		C apply carry limb
	adcl	$0, %ebx

L(entry):
	imull	VAR_INVERSE, %eax

	movl	%eax, -4(%edi,%ebp,4)
	incl	%ebp
	jnz	L(top)


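	C The high limb is done outside the loop: there's no further source
	C limb to shift in, so a plain shrl by the twos count is enough.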
	mull	PARAM_DIVISOR		C carry limb in edx

	movl	-4(%esi), %eax		C src high limb
	shrl	%cl, %eax
	movl	SAVE_ESI, %esi

	subl	%ebx, %eax		C apply carry bit
	movl	SAVE_EBX, %ebx
	movl	SAVE_EBP, %ebp

	subl	%edx, %eax		C apply carry limb

	imull	VAR_INVERSE, %eax

	movl	%eax, -4(%edi)
	movl	SAVE_EDI, %edi
	addl	$STACK_SPACE, %esp

	ret


L(one):
	shrl	%cl, %eax
	movl	SAVE_ESI, %esi
	movl	SAVE_EBX, %ebx

	imull	VAR_INVERSE, %eax

	movl	SAVE_EBP, %ebp
	movl	%eax, -4(%edi)

	movl	SAVE_EDI, %edi
	addl	$STACK_SPACE, %esp

	ret

EPILOGUE()