dnl  mod_34lsub1.asm revision 1.1.1.2
dnl  Generic x86 mpn_mod_34lsub1 -- mpn remainder modulo 2^24-1.

dnl  Copyright 2000, 2001, 2002, 2004 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C      cycles/limb
C P5	  3.0
C P6	  3.66
C K6	  3.0
C K7	  1.3
C P4	  9


C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size)
C
C Return a limb congruent to {src,size} modulo 2^24-1.  The value is only
C congruent, not fully reduced: for size==1 src[0] is returned unchanged.
C
C Method: 2^24 == 1 mod 2^24-1, so the 32-bit limb at index i has weight
C 2^(8*(i mod 3)).  Limbs are summed into three accumulators selected by
C i mod 3 (eax, ebx, esi).  The carry out of each adcl is fed to the next
C adcl, which is the accumulator of the next higher weight; the carry out of
C the 2mod3 accumulator has weight 2^96 == 1 and so feeds the 0mod3
C accumulator of the following iteration.  Finally each accumulator is split
C at its 24-bit boundary and the pieces are added with the matching shift.
C
C Arguments are read from the stack through PARAM_SRC and PARAM_SIZE, the
C result is left in eax.  ebx, esi and edi are saved and restored on the
C three_or_more path; the short path uses only eax, ecx and edx.
C
C NOTE(review): src[0] is loaded before size==1 is distinguished, so size is
C assumed to be at least 1 -- confirm against callers.

defframe(PARAM_SIZE, 8)
defframe(PARAM_SRC,  4)

dnl  re-use parameter space: ebx is stored in the src argument slot once src
dnl  has been loaded into edx
define(SAVE_EBX, `PARAM_SRC')

	TEXT
	ALIGN(16)
PROLOGUE(mpn_mod_34lsub1)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %ecx
	movl	PARAM_SRC, %edx

	subl	$2, %ecx
	ja	L(three_or_more)

	movl	(%edx), %eax		C movl leaves the subl flags intact
	jb	L(one)			C size==1, return src[0] as is

	C size==2

	movl	4(%edx), %ecx
	movl	%eax, %edx
	shrl	$24, %eax		C src[0] high

	andl	$0xFFFFFF, %edx		C src[0] low
	addl	%edx, %eax
	movl	%ecx, %edx

	andl	$0xFFFF, %ecx
	shrl	$16, %edx		C src[1] high
	addl	%edx, %eax

	shll	$8, %ecx		C src[1] low
	addl	%ecx, %eax

L(one):
	ret


L(three_or_more):
	C eax
	C ebx
	C ecx	size-2
	C edx	src
	C esi
	C edi
	C ebp

	movl	%ebx, SAVE_EBX		C and arrange 16-byte loop alignment
	xorl	%ebx, %ebx

	pushl	%esi	FRAME_pushl()
	xorl	%esi, %esi

	pushl	%edi	FRAME_pushl()
	xorl	%eax, %eax		C and clear carry flag


	C offset 0x40 here
L(top):
	C eax	acc 0mod3
	C ebx	acc 1mod3
	C ecx	counter, limbs
	C edx	src
	C esi	acc 2mod3
	C edi
	C ebp
	C
	C Three limbs per iteration.  leal and decl are used for the pointer
	C and counter updates because neither modifies the carry flag, so the
	C adcl chain continues unbroken from one iteration to the next.

	leal	12(%edx), %edx
	leal	-2(%ecx), %ecx

	adcl	-12(%edx), %eax
	adcl	-8(%edx), %ebx
	adcl	-4(%edx), %esi

	decl	%ecx
	jg	L(top)


	C ecx is -2, -1 or 0 representing 0, 1 or 2 more limbs, respectively
	C
	C edi is set to minus the weight of a carry out of the last adcl
	C executed: 2^96 == 1 after a 2mod3 limb (mask 0xFFFFFFFF), 2^32 == 2^8
	C after a 0mod3 limb (mask 0xFFFFFF00), 2^64 == 2^16 after a 1mod3 limb
	C (mask 0xFFFF0000).  movl, incl and decl do not modify the carry flag,
	C so the pending carry is still there for the sbbl at L(combine).

	movl	$0xFFFFFFFF, %edi
	incl	%ecx
	js	L(combine)

	adcl	(%edx), %eax
	movl	$0xFFFFFF00, %edi
	decl	%ecx
	js	L(combine)

	adcl	4(%edx), %ebx
	movl	$0xFFFF0000, %edi


L(combine):
	C eax	acc 0mod3
	C ebx	acc 1mod3
	C ecx
	C edx
	C esi	acc 2mod3
	C edi	mask
	C ebp

	sbbl	%ecx, %ecx		C carry, 0 or -1
	movl	%eax, %edx		C 0mod3

	shrl	$24, %eax		C 0mod3 high
	andl	%edi, %ecx		C carry masked, 0 or minus its weight

	subl	%ecx, %eax		C apply carry
	movl	%ebx, %edi		C 1mod3

	shrl	$16, %ebx		C 1mod3 high
	andl	$0x00FFFFFF, %edx	C 0mod3 low

	addl	%edx, %eax		C apply 0mod3 low
	andl	$0xFFFF, %edi

	shll	$8, %edi		C 1mod3 low
	addl	%ebx, %eax		C apply 1mod3 high

	addl	%edi, %eax		C apply 1mod3 low
	movl	%esi, %edx		C 2mod3

	shrl	$8, %esi		C 2mod3 high
	andl	$0xFF, %edx		C 2mod3 low

	shll	$16, %edx		C 2mod3 low
	addl	%esi, %eax		C apply 2mod3 high

	addl	%edx, %eax		C apply 2mod3 low
	popl	%edi	FRAME_popl()

	movl	SAVE_EBX, %ebx
	popl	%esi	FRAME_popl()

	ret

EPILOGUE()
