dnl  x86 mpn_bdiv_q_1 -- mpn by limb exact division.

dnl  Rearranged from mpn/x86/dive_1.asm by Marco Bodrato.

dnl  Copyright 2001, 2002, 2007, 2011 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C     cycles/limb
C P54    30.0
C P55    29.0
C P6     13.0 odd divisor, 12.0 even (strangely)
C K6     14.0
C K7     12.0
C P4     42.0

dnl  This one file supplies both entry points named here.  The macro itself
dnl  is defined outside this file.
MULFUNC_PROLOGUE(mpn_bdiv_q_1 mpn_pi1_bdiv_q_1)

dnl  Byte offsets of the stack arguments, 4 bytes apart in argument order
dnl  with dst at offset 4.  mpn_bdiv_q_1 takes only the first four and
dnl  derives inverse and shift itself.
dnl  NOTE(review): defframe comes from the m4 support files; presumably it
dnl  folds the current FRAME value into each operand, which is why every
dnl  pushl and popl that precedes a PARAM access carries a FRAME update.
defframe(PARAM_SHIFT,  24)
defframe(PARAM_INVERSE,20)
defframe(PARAM_DIVISOR,16)
defframe(PARAM_SIZE,   12)
defframe(PARAM_SRC,    8)
defframe(PARAM_DST,    4)

dnl  re-use parameter space
dnl  The src pointer is copied to a register before inverse is stored, so
dnl  the src slot is free to hold inverse for the rest of the function.
define(VAR_INVERSE,`PARAM_SRC')

	TEXT

C mp_limb_t
C mpn_pi1_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor,
C		    mp_limb_t inverse, int shift)
C
C Writes size quotient limbs to dst.  The source is taken shifted right by
C shift bits: each limb is combined with the low bits of the next higher
C limb, and the top limb is shifted alone.  Quotient limb i is
C
C	q[i] = (s[i] - hi(q[i-1]*divisor) - b[i-1]) * inverse   mod 2^32
C
C with s[i] the shifted source limb, hi() the high 32 bits of the 64 bit
C product, and b[i] the borrow out of the subtraction for limb i.  Limb 0
C uses zero for both carry terms.
C
C Requirements visible in the code:
C   - size >= 1, since src[0] is loaded before the counter is first tested.
C   - The stack slot of the src argument is overwritten with inverse once
C     src has been copied to esi, so the argument area is modified.
C
C mpn_bdiv_q_1 further down enters at the common label with an odd divisor
C and an inverse for which divisor*inverse == 1 mod 2^32.
C
C ebp, ebx, edi and esi are saved and restored.  On return eax holds the
C last quotient limb stored.
C NOTE(review): the prototype above declares mp_limb_t - confirm against the
C C declaration whether callers use that value.

	ALIGN(16)
PROLOGUE(mpn_pi1_bdiv_q_1)
deflit(`FRAME',0)

	movl	PARAM_SHIFT, %ecx
	pushl	%ebp	FRAME_pushl()

	movl	PARAM_INVERSE, %eax
	movl	PARAM_SIZE, %ebp
	pushl	%ebx	FRAME_pushl()

	C mpn_bdiv_q_1 jumps in here with
	C eax	inverse
	C ecx	shift
	C ebp	size
	C and with ebp and ebx already pushed
L(common):
	pushl	%edi	FRAME_pushl()
	pushl	%esi	FRAME_pushl()

	movl	PARAM_SRC, %esi
	movl	PARAM_DST, %edi

	C point esi and edi just past the last limb and let ebp count up
	C from -size to zero so that one register indexes both arrays
	leal	(%esi,%ebp,4), %esi	C src end
	leal	(%edi,%ebp,4), %edi	C dst end
	negl	%ebp			C -size

	C the src pointer now lives in esi so its stack slot can hold inverse
	movl	%eax, VAR_INVERSE
	movl	(%esi,%ebp,4), %eax	C src[0]

	xorl	%ebx, %ebx		C no carry bit yet
	xorl	%edx, %edx		C no carry limb yet

	incl	%ebp
	jz	L(one)			C size == 1

	movl	(%esi,%ebp,4), %edx	C src[1]

	C eax = low limb of the shifted source
	shrdl(	%cl, %edx, %eax)

	C the entry label expects inverse in edx
	movl	VAR_INVERSE, %edx
	jmp	L(entry)


	ALIGN(8)
	nop	C k6 code alignment
	nop
L(top):
	C eax	q
	C ebx	carry bit, 0 or -1
	C ecx	shift
	C edx	carry limb
	C esi	src end
	C edi	dst end
	C ebp	counter, limbs, negative

	movl	-4(%esi,%ebp,4), %eax	C current src limb
	subl	%ebx, %edx		C accumulate carry bit

	movl	(%esi,%ebp,4), %ebx	C next higher src limb

	C eax = current limb of the shifted source
	shrdl(	%cl, %ebx, %eax)

	subl	%edx, %eax		C apply carry limb
	movl	VAR_INVERSE, %edx	C movl leaves the borrow in CF alone

	sbbl	%ebx, %ebx		C new carry bit from that borrow

L(entry):
	C eax	shifted src limb less the carries
	C edx	inverse
	imull	%edx, %eax		C quotient limb

	movl	%eax, -4(%edi,%ebp,4)	C store quotient limb
	movl	PARAM_DIVISOR, %edx

	mull	%edx			C edx = high half of q*divisor

	incl	%ebp
	jnz	L(top)


	movl	-4(%esi), %eax		C src high limb
L(one):
	C the top limb has no higher neighbour so a plain shift is used
	shrl	%cl, %eax
	popl	%esi	FRAME_popl()

	addl	%ebx, %eax		C apply carry bit

	subl	%edx, %eax		C apply carry limb

	imull	VAR_INVERSE, %eax

	movl	%eax, -4(%edi)

	C no frame relative operand follows so FRAME is not adjusted further
	popl	%edi
	popl	%ebx
	popl	%ebp

	ret

EPILOGUE()

C mp_limb_t mpn_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                           mp_limb_t divisor);
C
C Prepares the operands for the loop in mpn_pi1_bdiv_q_1 and jumps to its
C common label:
C   - shift is the number of low zero bits stripped from divisor
C   - the stripped odd divisor replaces the divisor argument on the stack
C   - inverse starts as an 8 bit value from binvert_limb_table and is
C     refined by two steps of inv = 2*inv - inv*inv*d
C
C divisor must be nonzero: the strip loop only ends when a one bit is
C shifted out.  The size requirement and the registers saved are those of
C mpn_pi1_bdiv_q_1, whose epilogue performs the return.

	ALIGN(16)
PROLOGUE(mpn_bdiv_q_1)
deflit(`FRAME',0)

	movl	PARAM_DIVISOR, %eax
	pushl	%ebp	FRAME_pushl()

	movl	$-1, %ecx		C shift count
	movl	PARAM_SIZE, %ebp

	pushl	%ebx	FRAME_pushl()

	C count in ecx how many times divisor can be halved before a one
	C bit drops out into the carry flag
L(strip_twos):
	incl	%ecx

	shrl	%eax
	jnc	L(strip_twos)

	C eax is now the odd part of divisor shifted right once more
	leal	1(%eax,%eax), %ebx	C d without twos
	andl	$127, %eax		C d/2, 7 bits

	C fetch the starting inverse from binvert_limb_table; the PIC variant
	C first loads the table address into edx
ifdef(`PIC',`
	LEA(	binvert_limb_table, %edx)
	movzbl	(%eax,%edx), %eax		C inv 8 bits
',`
	movzbl	binvert_limb_table(%eax), %eax	C inv 8 bits
')

	C first refinement step, result in edx.  The odd divisor is also
	C written back to the argument slot that the common loop reads.
	leal	(%eax,%eax), %edx	C 2*inv
	movl	%ebx, PARAM_DIVISOR	C d without twos
	imull	%eax, %eax		C inv*inv
	imull	%ebx, %eax		C inv*inv*d
	subl	%eax, %edx		C inv = 2*inv - inv*inv*d

	C second refinement step, result back in eax
	leal	(%edx,%edx), %eax	C 2*inv
	imull	%edx, %edx		C inv*inv
	imull	%ebx, %edx		C inv*inv*d
	subl	%edx, %eax		C inv = 2*inv - inv*inv*d

	C NOTE(review): ASSERT is defined outside this file; presumably it
	C only emits code in assertion enabled builds - confirm there
	ASSERT(e,`	C expect d*inv == 1 mod 2^GMP_LIMB_BITS
	pushl	%eax	FRAME_pushl()
	imull	PARAM_DIVISOR, %eax
	cmpl	$1, %eax
	popl	%eax	FRAME_popl()')

	C eax = inverse, ecx = shift, ebp = size, ebp and ebx are pushed
	jmp	L(common)
EPILOGUE()
ASM_END()
209