dnl  AMD64 mpn_mod_1_1p

dnl  Contributed to the GNU project by Torbjörn Granlund and Niels Möller.

dnl  Copyright 2009-2012, 2014 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C AMD K8,K9	 6
C AMD K10	 6
C Intel P4	26
C Intel core2	12.5
C Intel NHM	11.3
C Intel SBR	 8.4	(slowdown, old code took 8.0)
C Intel atom	26
C VIA nano	13

define(`B2mb',   `%r10')
define(`B2modb', `%r11')
define(`ap',     `%rdi')
define(`n',      `%rsi')
define(`pre',    `%r8')
define(`b',      `%rbx')

define(`r0',     `%rbp') C r1 kept in %rax
define(`r2',	 `%rcx')  C kept negated. Also used as shift count
define(`t0',     `%r9')

C mp_limb_t
C mpn_mod_1_1p (mp_srcptr ap, mp_size_t n, mp_limb_t b, mp_limb_t bmodb[4])
C                       %rdi         %rsi         %rdx                %rcx
C The pre array contains bi, cnt, B1modb, B2modb
C Note: This implementation needs B1modb only when cnt > 0
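C
C Usage sketch in C (both functions are GMP-internal; the _cps function
C precomputes the pre array for a given b, which mpn_mod_1_1p then reuses):
C
C   mp_limb_t pre[4], r;
C   mpn_mod_1_1p_cps (pre, b);		/* fills bi, cnt, B1modb, B2modb */
C   r = mpn_mod_1_1p (ap, n, b, pre);	/* r = {ap,n} mod b */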

C The iteration is almost as follows,
C
C   r_2 B^3 + r_1 B^2 + r_0 B + u = r_1 B2modb + (r_0 + r_2 B2modb) B + u
C
C where r_2 is a single bit represented as a mask. But to make sure that the
C result fits in two limbs and a bit, carry from the addition
C
C   r_0 + r_2 B2modb
C
C is handled specially. On carry, we subtract b to cancel the carry,
C and instead use the value
C
C   r_0 + B2mb (mod B)
C
C This addition can be issued early since it doesn't depend on r_2, and it is
C the source of the cmov in the loop.
C
C We have the invariant that r_2 B^2 + r_1 B + r_0 < B^2 + B b
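C
C A rough C sketch of one loop iteration (illustrative only; the asm keeps
C r_2 negated as a mask and splits the wide sum into mul/add/adc/cmovc, but
C the arithmetic is the same):
C
C   t = r0 + (r2 ? B2modb : 0);	/* issued early (the lea/cmovc pair)   */
C   if (r2 && t < r0)		/* carry: cancel it by subtracting b   */
C     t = r0 + B2mb;		/* B2mb = B2modb - b (mod B)           */
C   s = (unsigned __int128) r1 * B2modb + u;	/* u = next source limb */
C   r0 = (mp_limb_t) s;
C   r1 = t + (mp_limb_t) (s >> 64);
C   r2 = r1 < t;		/* carry out becomes the new r_2 bit   */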

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_mod_1_1p)
	FUNC_ENTRY(4)
	push	%rbp
	push	%rbx
	mov	%rdx, b
	mov	%rcx, pre

	mov	-8(ap, n, 8), %rax
	cmp	$3, n
	jnc	L(first)
	mov	-16(ap, n, 8), r0
	jmp	L(reduce_two)

L(first):
	C First iteration, no r2
	mov	24(pre), B2modb
	mul	B2modb
	mov	-24(ap, n, 8), r0
	add	%rax, r0
	mov	-16(ap, n, 8), %rax
	adc	%rdx, %rax
	sbb	r2, r2
	sub	$4, n
	jc	L(reduce_three)

	mov	B2modb, B2mb
	sub	b, B2mb

	ALIGN(16)
L(top):	and	B2modb, r2
	lea	(B2mb, r0), t0
	mul	B2modb
	add	r0, r2
	mov	(ap, n, 8), r0
	cmovc	t0, r2
	add	%rax, r0
	mov	r2, %rax
	adc	%rdx, %rax
	sbb	r2, r2
	sub	$1, n
	jnc	L(top)

L(reduce_three):
	C Eliminate r2
	and	b, r2
	sub	r2, %rax

L(reduce_two):
	mov	8(pre), R32(%rcx)
	test	R32(%rcx), R32(%rcx)
	jz	L(normalized)

	C Unnormalized, use B1modb to reduce to size < B (b+1)
	mulq	16(pre)
	xor	t0, t0
	add	%rax, r0
	adc	%rdx, t0
	mov	t0, %rax

	C Left-shift to normalize
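	C (on SHLD_SLOW cpus, the shld below is emulated with shl, shr and or)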
ifdef(`SHLD_SLOW',`
	shl	R8(%rcx), %rax
	mov	r0, t0
	neg	R32(%rcx)
	shr	R8(%rcx), t0
	or	t0, %rax
	neg	R32(%rcx)
',`
	shld	R8(%rcx), r0, %rax
')
	shl	R8(%rcx), r0
	jmp	L(udiv)

L(normalized):
	mov	%rax, t0
	sub	b, t0
	cmovnc	t0, %rax

L(udiv):
	lea	1(%rax), t0
	mulq	(pre)
	add	r0, %rax
	adc	t0, %rdx
	imul	b, %rdx
	sub	%rdx, r0
	cmp	r0, %rax
	lea	(b, r0), %rax
	cmovnc	r0, %rax
	cmp	b, %rax
	jnc	L(fix)
L(ok):	shr	R8(%rcx), %rax

	pop	%rbx
	pop	%rbp
	FUNC_EXIT()
	ret
L(fix):	sub	b, %rax
	jmp	L(ok)
EPILOGUE()

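C mpn_mod_1_1p_cps (mp_limb_t pre[4], mp_limb_t b)
C                              %rdi          %rsi
C Fills the pre array described above: cnt = count_leading_zeros(b),
C bi = mpn_invert_limb(b << cnt), and pre[] = {bi, cnt, B1modb, B2modb}.
C B1modb is computed and stored only when cnt > 0.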
	ALIGN(16)
PROLOGUE(mpn_mod_1_1p_cps)
	FUNC_ENTRY(2)
	push	%rbp
	bsr	%rsi, %rcx
	push	%rbx
	mov	%rdi, %rbx
	push	%r12
	xor	$63, R32(%rcx)
	mov	%rsi, %r12
	mov	R32(%rcx), R32(%rbp)
	sal	R8(%rcx), %r12
IFSTD(`	mov	%r12, %rdi	')	C pass parameter
IFDOS(`	mov	%r12, %rcx	')	C pass parameter
IFDOS(`	sub	$32, %rsp	')
	ASSERT(nz, `test $15, %rsp')
	CALL(	mpn_invert_limb)
IFDOS(`	add	$32, %rsp	')
	neg	%r12
	mov	%r12, %r8
	mov	%rax, (%rbx)		C store bi
	mov	%rbp, 8(%rbx)		C store cnt
	imul	%rax, %r12
	mov	%r12, 24(%rbx)		C store B2modb
	mov	R32(%rbp), R32(%rcx)
	test	R32(%rcx), R32(%rcx)
	jz	L(z)

	mov	$1, R32(%rdx)
ifdef(`SHLD_SLOW',`
	C Destroys %rax, unlike shld. Otherwise, we could do B1modb
	C before B2modb, and get rid of the move %r12, %r8 above.

	shl	R8(%rcx), %rdx
	neg	R32(%rcx)
	shr	R8(%rcx), %rax
	or	%rax, %rdx
	neg	R32(%rcx)
',`
	shld	R8(%rcx), %rax, %rdx
')
	imul	%rdx, %r8
	shr	R8(%rcx), %r8
	mov	%r8, 16(%rbx)		C store B1modb
L(z):
	pop	%r12
	pop	%rbx
	pop	%rbp
	FUNC_EXIT()
	ret
EPILOGUE()
ASM_END()