dnl  x86-32 mpn_mod_1s_4p for Pentium 4 and P6 models with SSE2 (i.e. 9,D,E,F).

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2009, 2010 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C TODO:
C  * Optimize.  The present code was written quite straightforwardly.
C  * Optimize post-loop reduction code.
C  * Write a cps function that uses sse2 insns.

C			    cycles/limb
C P6 model 0-8,10-12		-
C P6 model 9   (Banias)		?
C P6 model 13  (Dothan)		3.4
C P4 model 0-1 (Willamette)	?
C P4 model 2   (Northwood)	4
C P4 model 3-4 (Prescott)	4.5

C INPUT PARAMETERS
C ap		sp + 4
C n		sp + 8
C b		sp + 12
C cps		sp + 16
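
C This computes ap[0] + ap[1]*B + ... + ap[n-1]*B^(n-1) mod b, with
C B = 2^32, folding four limbs per loop iteration.  The cps table is
C assumed laid out as in mpn/generic/mod_1_4.c:
C   cps[0] = bi  = floor((B^2-1)/(b<<cnt)) - B   (reciprocal of b<<cnt)
C   cps[1] = cnt = count_leading_zeros(b)
C   cps[2..6]    = B^k mod b, k = 1..5   (not necessarily fully reduced)
C A rough C sketch of one main-loop step, with rl/rh the low and high
C halves of the 64-bit residue r:
C
C	r = ap[i] + ap[i+1]*(uint64_t) B1modb + ap[i+2]*(uint64_t) B2modb
C	    + ap[i+3]*(uint64_t) B3modb
C	    + rl*(uint64_t) B4modb + rh*(uint64_t) B5modb;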

define(`B1modb', `%mm1')
define(`B2modb', `%mm2')
define(`B3modb', `%mm3')
define(`B4modb', `%mm4')
define(`B5modb', `%mm5')
define(`ap',     `%edx')
define(`n',      `%eax')

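C The 64-bit residue r = rh:rl accumulates in %mm7 throughout.
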
ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_mod_1s_4p)
	push	%ebx
	mov	8(%esp), ap
	mov	12(%esp), n
	mov	20(%esp), %ecx

	movd	8(%ecx), B1modb
	movd	12(%ecx), B2modb
	movd	16(%ecx), B3modb
	movd	20(%ecx), B4modb
	movd	24(%ecx), B5modb

	mov	n, %ebx
	lea	-4(ap,n,4), ap
	and	$3, %ebx
	je	L(b0)
	cmp	$2, %ebx
	jc	L(b1)
	je	L(b2)

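C Entry code: fold the n mod 4 top limbs first, so that the main loop
C can consume exactly four limbs per iteration.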
L(b3):	movd	-4(ap), %mm7
	pmuludq	B1modb, %mm7
	movd	-8(ap), %mm6
	paddq	%mm6, %mm7
	movd	(ap), %mm6
	pmuludq	B2modb, %mm6
	paddq	%mm6, %mm7
	lea	-24(ap), ap
	add	$-3, n
	jz	L(end)
	jmp	L(top)

L(b0):	movd	-8(ap), %mm7
	pmuludq	B1modb, %mm7
	movd	-12(ap), %mm6
	paddq	%mm6, %mm7
	movd	-4(ap), %mm6
	pmuludq	B2modb, %mm6
	paddq	%mm6, %mm7
	movd	(ap), %mm6
	pmuludq	B3modb, %mm6
	paddq	%mm6, %mm7
	lea	-28(ap), ap
	add	$-4, n
	jz	L(end)
	jmp	L(top)

L(b1):	movd	(ap), %mm7
	lea	-16(ap), ap
	dec	n
	jz	L(x)
	jmp	L(top)

L(b2):	movd	-4(ap), %mm7		C rl
	punpckldq (ap), %mm7		C rh
	lea	-20(ap), ap
	add	$-2, n
	jz	L(end)

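C Main loop: fold the four limbs at 0..12(ap) and the previous residue
C rh:rl (in %mm7) into a new 64-bit residue; the mod_1s_4p scheme
C guarantees the six-term sum fits in 64 bits.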
	ALIGN(8)
L(top):	movd	4(ap), %mm0
	pmuludq	B1modb, %mm0
	movd	0(ap), %mm6
	paddq	%mm6, %mm0

	movd	8(ap), %mm6
	pmuludq	B2modb, %mm6
	paddq	%mm6, %mm0

	movd	12(ap), %mm6
	pmuludq	B3modb, %mm6
	paddq	%mm6, %mm0

	movq	%mm7, %mm6
	psrlq	$32, %mm7		C rh
	pmuludq	B5modb, %mm7
	pmuludq	B4modb, %mm6

	paddq	%mm0, %mm7
	paddq	%mm6, %mm7

	add	$-16, ap
	add	$-4, n
	jnz	L(top)

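C Fold the two-limb residue to rl + rh*B1modb, normalize it (shift
C left by cnt), then reduce mod b with the reciprocal bi,
C udiv_qrnnd_preinv style.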
L(end):	pcmpeqd	%mm4, %mm4
	psrlq	$32, %mm4		C 0x00000000FFFFFFFF
	pand	%mm7, %mm4		C rl
	psrlq	$32, %mm7		C rh
	pmuludq	B1modb, %mm7		C rh,cl
	paddq	%mm4, %mm7		C rh,rl
L(x):	movd	4(%ecx), %mm4		C cnt
	psllq	%mm4, %mm7		C rh,rl normalized
	movq	%mm7, %mm2		C rl in low half
	psrlq	$32, %mm7		C rh
	movd	(%ecx), %mm1		C bi
	pmuludq	%mm7, %mm1		C qh,ql
	paddq	%mm2, %mm1		C qh-1,ql
	movd	%mm1, %ecx		C ql
	psrlq	$32, %mm1		C qh-1
	movd	16(%esp), %mm3		C b
	pmuludq	%mm1, %mm3		C (qh-1) * b
	psubq	%mm3, %mm2		C r in low half (could use psubd)
	movd	%mm2, %eax		C r
	mov	16(%esp), %ebx
	sub	%ebx, %eax		C r
	cmp	%eax, %ecx
	lea	(%eax,%ebx), %edx
	cmovc(	%edx, %eax)
	movd	%mm4, %ecx		C cnt
	cmp	%ebx, %eax
	jae	L(fix)
	emms
	pop	%ebx
	shr	%cl, %eax
	ret

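C The remainder was still >= b: subtract b once more, then shift out
C the normalization as in the main exit path.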
L(fix):	sub	%ebx, %eax
	emms
	pop	%ebx
	shr	%cl, %eax
	ret
EPILOGUE()

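C mpn_mod_1s_4p_cps precomputes the 7-entry table used above; roughly
C (cf. mpn/generic/mod_1_4.c, which requires about b < B/4 so the
C folded sums fit in two limbs):
C   cnt = count_leading_zeros(b)
C   bi  = floor((B^2-1)/(b<<cnt)) - B	(reciprocal of the shifted b)
C   cps = {bi, cnt, B^1 mod b, ..., B^5 mod b}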
	ALIGN(16)
PROLOGUE(mpn_mod_1s_4p_cps)
C CAUTION: This is the same code as in k7/mod_1_4.asm
	push	%ebp
	push	%edi
	push	%esi
	push	%ebx
	mov	20(%esp), %ebp		C FIXME: avoid bp for 0-idx
	mov	24(%esp), %ebx
	bsr	%ebx, %ecx
	xor	$31, %ecx
	sal	%cl, %ebx		C b << cnt
	mov	%ebx, %edx
	not	%edx
	mov	$-1, %eax
	div	%ebx
	xor	%edi, %edi
	sub	%ebx, %edi
	mov	$1, %esi
	mov	%eax, (%ebp)		C store bi
	mov	%ecx, 4(%ebp)		C store cnt
	shld	%cl, %eax, %esi
	imul	%edi, %esi
	mov	%eax, %edi
	mul	%esi

	add	%esi, %edx
	shr	%cl, %esi
	mov	%esi, 8(%ebp)		C store B1modb

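C Each of the following stanzas computes B^(k+1) mod b from B^k mod b,
C udiv_rnnd_preinv style: the mul above left a quotient estimate in
C %edx; form the remainder, add back b if the estimate overshot, and
C store the result right-shifted by cnt.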
	not	%edx
	imul	%ebx, %edx
	lea	(%edx,%ebx), %esi
	cmp	%edx, %eax
	cmovnc(	%edx, %esi)
	mov	%edi, %eax
	mul	%esi

	add	%esi, %edx
	shr	%cl, %esi
	mov	%esi, 12(%ebp)		C store B2modb

	not	%edx
	imul	%ebx, %edx
	lea	(%edx,%ebx), %esi
	cmp	%edx, %eax
	cmovnc(	%edx, %esi)
	mov	%edi, %eax
	mul	%esi

	add	%esi, %edx
	shr	%cl, %esi
	mov	%esi, 16(%ebp)		C store B3modb

	not	%edx
	imul	%ebx, %edx
	lea	(%edx,%ebx), %esi
	cmp	%edx, %eax
	cmovnc(	%edx, %esi)
	mov	%edi, %eax
	mul	%esi

	add	%esi, %edx
	shr	%cl, %esi
	mov	%esi, 20(%ebp)		C store B4modb

	not	%edx
	imul	%ebx, %edx
	add	%edx, %ebx
	cmp	%edx, %eax
	cmovnc(	%edx, %ebx)

	shr	%cl, %ebx
	mov	%ebx, 24(%ebp)		C store B5modb

	pop	%ebx
	pop	%esi
	pop	%edi
	pop	%ebp
	ret
EPILOGUE()
