1dnl  Intel Pentium 4 mpn_mod_34lsub1 -- remainder modulo 2^24-1.
2
3dnl  Copyright 2000, 2001, 2002, 2003 Free Software Foundation, Inc.
4dnl
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or
8dnl  modify it under the terms of the GNU Lesser General Public License as
9dnl  published by the Free Software Foundation; either version 3 of the
10dnl  License, or (at your option) any later version.
11dnl
12dnl  The GNU MP Library is distributed in the hope that it will be useful,
13dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
14dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15dnl  Lesser General Public License for more details.
16dnl
17dnl  You should have received a copy of the GNU Lesser General Public License
18dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
20include(`../config.m4')
21
22
23C Pentium4: 1.0 cycles/limb
24
25
26C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size)
27C
28C Enhancements:
29C
30C There might be a couple of cycles to save by using plain integer code for
31C more small sizes.  2 limbs measures about 20 cycles, but 3 limbs jumps to
32C about 46 (inclusive of some function call overheads).
33
C Notes on mpn_mod_34lsub1.
C
C The arguments are read through PARAM_SRC and PARAM_SIZE, declared by the
C defframe lines below.  The result is returned in eax.  It is congruent to
C the size-limb operand modulo 2^24-1 but is not reduced below 2^24-1, for
C instance a one limb operand comes back unchanged.
C
C Limbs are 32 bits here and 2^32 == 2^8 mod 2^24-1, so limb i carries weight
C 2^0, 2^8 or 2^16 according to i mod 3.  Three or more limbs are summed in
C MMX registers with 64-bit paddq, one or two limbs use plain integer code.
34defframe(PARAM_SIZE, 8)
35defframe(PARAM_SRC,  4)
36
37dnl  re-use parameter space (neither SAVE_ name is referenced in the code below)
38define(SAVE_EBX, `PARAM_SRC')
39define(SAVE_ESI, `PARAM_SIZE')
40
41	TEXT
42	ALIGN(16)
43PROLOGUE(mpn_mod_34lsub1)
44deflit(`FRAME',0)
45
46	movl	PARAM_SIZE, %ecx
47	movl	PARAM_SRC, %edx
48	movl	(%edx), %eax		C src[0], returned as is on the L(one) path
49
50	subl	$2, %ecx		C ecx = size-2, one flag result feeds both branches
51	ja	L(three_or_more)	C size > 2, unsigned
52	jne	L(one)			C size is not 2 either, return src[0] unchanged
53
	C Exactly two limbs.  src[0] + src[1]*2^32 is congruent to
	C     src[0] bits 0-23  +  src[0] bits 24-31
	C   + src[1] bits 0-15 times 2^8  +  src[1] bits 16-31
	C The four terms total less than 2^26 so the 32-bit adds cannot wrap.
54	movl	4(%edx), %edx		C src[1]
55	movl	%eax, %ecx
56	shrl	$24, %eax		C src[0] high 8 bits, weight 2^24 == 1
57
58	andl	$0x00FFFFFF, %ecx	C src[0] low 24 bits
59	addl	%ecx, %eax
60
61	movl	%edx, %ecx
62	shll	$8, %edx		C src[1] times 2^8, masked below
63
64	shrl	$16, %ecx		C src[1] high 16 bits, weight 2^48 == 1
65	addl	%ecx, %eax
66
67	andl	$0x00FFFF00, %edx	C src[1] low 16 bits placed at weight 2^8
68	addl	%edx, %eax
69
70L(one):
71	ret
72
73
74L(three_or_more):
	C Clear the three residue-class sums and build the two masks without
	C touching memory: pcmpeqd of a register with itself gives all ones.
75	pxor	%mm0, %mm0
76	pxor	%mm1, %mm1
77	pxor	%mm2, %mm2
78
79	pcmpeqd	%mm7, %mm7
80	psrlq	$32, %mm7	C 0x00000000FFFFFFFF, low 32 bits
81
82	pcmpeqd	%mm6, %mm6
83	psrlq	$40, %mm6	C 0x0000000000FFFFFF, low 24 bits
84
	C Each pass adds three limbs, one into each residue-class sum.  movd
	C zero extends the limb to 64 bits and paddq is a full 64-bit add.
85L(top):
86	C eax	src[0] from entry, not used until overwritten with the result
87	C ebx
88	C ecx	counter, size-2 to 0, -1 or -2
89	C edx	src, incrementing
90	C
91	C mm0	sum 0mod3
92	C mm1	sum 1mod3
93	C mm2	sum 2mod3
94	C mm3
95	C mm4
96	C mm5
97	C mm6	0x0000000000FFFFFF
98	C mm7	0x00000000FFFFFFFF
99
100	movd	(%edx), %mm3
101	paddq	%mm3, %mm0
102
103	movd	4(%edx), %mm3
104	paddq	%mm3, %mm1
105
106	movd	8(%edx), %mm3
107	paddq	%mm3, %mm2
108
109	addl	$12, %edx
110	subl	$3, %ecx
111	ja	L(top)			C loop while the counter stays above zero, no borrow
112
113
114	C ecx is -2, -1 or 0 representing 0, 1 or 2 more limbs, respectively
115
116	addl	$1, %ecx		C sign set for 0 more, zero set for 1 more
117	js	L(combine)		C 0 more
118
119	movd	(%edx), %mm3		C movd and paddq leave eflags alone, so the
120	paddq	%mm3, %mm0		C jz below still tests the addl result
121
122	jz	L(combine)		C 1 more
123
124	movd	4(%edx), %mm3
125	paddq	%mm3, %mm1
126
	C Split each 64-bit sum into 32-bit halves.  A high half is 2^32 == 2^8
	C times heavier than its low half, so it joins the next residue class:
	C mm0 high with mm1 low, mm1 high with mm2 low, mm2 high with mm0 low.
127L(combine):
128	movq	%mm7, %mm3		C low halves
129	pand	%mm0, %mm3
130
131	movq	%mm7, %mm4
132	pand	%mm1, %mm4
133
134	movq	%mm7, %mm5
135	pand	%mm2, %mm5
136
137	psrlq	$32, %mm0		C high halves
138	psrlq	$32, %mm1
139	psrlq	$32, %mm2
140
141	paddq	%mm0, %mm4		C fold high halves to give 33 bits each
142	paddq	%mm1, %mm5
143	paddq	%mm2, %mm3
144
145	psllq	$8, %mm4		C combine at respective offsets
146	psllq	$16, %mm5
147	paddq	%mm4, %mm3
148	paddq	%mm5, %mm3		C 0x000cxxxxxxxxxxxx, 50 bits
149
150	pand	%mm3, %mm6		C fold at 24 bits
151	psrlq	$24, %mm3
152
153	paddq	%mm6, %mm3		C low 24 bits plus the bits above them
154	movd	%mm3, %eax		C return value, low dword of the folded sum
155
	C The result is already in eax, so the check below is free to clobber
	C mm3 and ecx.  ASSERT is defined outside this file, presumably it only
	C emits the check in assertion-enabled builds -- TODO confirm.
156	ASSERT(z,	C nothing left in high dword
157	`psrlq	$32, %mm3
158	movd	%mm3, %ecx
159	orl	%ecx, %ecx')
160
161	emms				C leave MMX state before returning
162	ret
163
164EPILOGUE()
165