dnl  mod_34lsub1.asm revision 1.1.1.1
dnl  Intel P6 mpn_mod_34lsub1 -- remainder modulo 2^24-1.

dnl  Copyright 2000, 2001, 2002, 2004 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C P6: 2.0 cycles/limb

C TODO
C  Experiments with more unrolling indicate that 1.5 c/l is possible on P6-13
C  with the current carry handling scheme.

C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size)
C
C Groups of three limbs are handled, with carry bits from 0mod3 into 1mod3
C into 2mod3, but at that point going into a separate carries total so we
C don't keep the carry flag live across the loop control.  Avoiding decl
C lets us get to 2.0 c/l, as compared to the generic x86 code at 3.66.
C

dnl  Names for the two stack arguments.  The second defframe argument is the
dnl  byte offset of the slot; it is presumably relative to the stack pointer
dnl  at function entry (return address at offset 0) and adjusted by FRAME as
dnl  pushes are recorded -- confirm against defframe in x86-defs.m4.
defframe(PARAM_SIZE, 8)
defframe(PARAM_SRC,  4)

dnl  re-use parameter space
dnl
dnl  The function copies both parameters into registers before it stores
dnl  anything, so their stack slots are free to hold the saved ebx and esi.
define(SAVE_EBX, `PARAM_SIZE')
define(SAVE_ESI, `PARAM_SRC')

C -----------------------------------------------------------------------------
C mpn_mod_34lsub1
C
C In:   PARAM_SRC  = src	pointer to the limbs, least significant first
C       PARAM_SIZE = size	number of 32-bit limbs
C Out:  eax = a value congruent to {src,size} modulo 2^24-1.  It is a plain
C       sum of at most eight pieces, each below 2^24, so it fits in 32 bits
C       but is not reduced to below 2^24-1.  For size==1 it is src[0] as is.
C Uses: ecx, edx and flags as scratch.  ebx, esi and edi are used only on
C       the three_or_more path, where they are saved and restored.
C
C src[0] is loaded unconditionally, so size is presumably required to be at
C least 1 -- confirm against the callers.
C
C The folding relies on 2^24 == 1 mod 2^24-1, which gives
C       2^32 == 2^8,   2^64 == 2^16,   2^96 == 1.
C So a limb at position 0mod3 is split 8:24, one at 1mod3 is split 16:16 with
C the low part shifted up 8, and one at 2mod3 is split 24:8 with the low part
C shifted up 16.
C -----------------------------------------------------------------------------

	TEXT
	ALIGN(16)
PROLOGUE(mpn_mod_34lsub1)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %ecx
	movl	PARAM_SRC, %edx

	C The flags set by this subl feed both branches below; the movl in
	C between does not modify flags.
	subl	$2, %ecx		C size-2
	movl	(%edx), %eax		C src[0]
	ja	L(three_or_more)	C size > 2
	jb	L(one)			C size < 2, return src[0] unchanged

	C size==2

	movl	4(%edx), %ecx		C src[1]

	movl	%eax, %edx		C src[0]
	shrl	$24, %eax		C src[0] high

	andl	$0xFFFFFF, %edx		C src[0] low

	addl	%edx, %eax
	movl	%ecx, %edx		C src[1]
	shrl	$16, %ecx		C src[1] high

	andl	$0xFFFF, %edx
	addl	%ecx, %eax

	shll	$8, %edx		C src[1] low

	addl	%edx, %eax
L(one):
	ret


L(three_or_more):
	C eax	src[0], initial acc 0mod3
	C ebx
	C ecx	size-2
	C edx	src
	C esi
	C edi
	C ebp

	C The flags from the subl below must reach the jng.  Everything in
	C between is movl or pushl, neither of which modifies flags, and that
	C is why edi is cleared with movl $0 rather than xorl.

	movl	%ebx, SAVE_EBX
	movl	4(%edx), %ebx		C src[1], initial 1mod3
	subl	$3, %ecx		C size-5

	movl	%esi, SAVE_ESI
	movl	8(%edx), %esi		C src[2], initial 2mod3

	pushl	%edi	FRAME_pushl()
	movl	$0, %edi		C initial carries 0mod3
	jng	L(done)			C if size < 6


L(top):
	C eax	acc 0mod3
	C ebx	acc 1mod3
	C ecx	counter, limbs
	C edx	src
	C esi	acc 2mod3
	C edi	carries into 0mod3
	C ebp

	C The carry ripples 0mod3 -> 1mod3 -> 2mod3 and is then counted in
	C edi.  src is advanced with leal, which leaves the carry flag from
	C the adcl chain intact for the final adcl.

	addl	12(%edx), %eax
	adcl	16(%edx), %ebx
	adcl	20(%edx), %esi
	leal	12(%edx), %edx
	adcl	$0, %edi

	subl	$3, %ecx
	jg	L(top)			C at least 3 more to process


L(done):
	C ecx is -2, -1 or 0 representing 0, 1 or 2 more limbs respectively
	cmpl	$-1, %ecx
	jl	L(done_0)		C if -2, meaning 0 more limbs

	C 1 or 2 more limbs
	C
	C ecx is cleared with movl, not xorl, so the flags from the cmpl
	C above are still valid for the je.  With only one limb left ecx
	C stays 0 and the adcl into ebx just propagates the carry.
	movl	$0, %ecx
	je	L(done_1)		C if -1, meaning 1 more limb only
	movl	16(%edx), %ecx
L(done_1):
	addl	12(%edx), %eax		C 0mod3
	adcl	%ecx, %ebx		C 1mod3
	adcl	$0, %esi		C 2mod3
	adcl	$0, %edi		C carries 0mod3

L(done_0):
	C eax	acc 0mod3
	C ebx	acc 1mod3
	C ecx
	C edx
	C esi	acc 2mod3
	C edi	carries 0mod3
	C ebp

	C Fold the four totals into eax.  Each carry counted in edi came out
	C of bit 96, and 2^96 == 1, so edi is folded like a 0mod3 value.
	C
	C SAVE_EBX is read while edi is still pushed and SAVE_ESI after the
	C pop; FRAME_pushl and FRAME_popl are presumably what keep those
	C frame-relative offsets right -- see x86-defs.m4.

	movl	%eax, %ecx		C 0mod3
	shrl	$24, %eax		C 0mod3 high initial total

	andl	$0xFFFFFF, %ecx		C 0mod3 low
	movl	%edi, %edx		C carries
	shrl	$24, %edi		C carries high

	addl	%ecx, %eax		C add 0mod3 low
	andl	$0xFFFFFF, %edx		C carries 0mod3 low
	movl	%ebx, %ecx		C 1mod3

	shrl	$16, %ebx		C 1mod3 high
	addl	%edi, %eax		C add carries high
	addl	%edx, %eax		C add carries 0mod3 low

	andl	$0xFFFF, %ecx		C 1mod3 low mask
	addl	%ebx, %eax		C add 1mod3 high
	movl	SAVE_EBX, %ebx

	shll	$8, %ecx		C 1mod3 low
	movl	%esi, %edx		C 2mod3
	popl	%edi	FRAME_popl()

	shrl	$8, %esi		C 2mod3 high
	andl	$0xFF, %edx		C 2mod3 low mask
	addl	%ecx, %eax		C add 1mod3 low

	shll	$16, %edx		C 2mod3 low
	addl	%esi, %eax		C add 2mod3 high
	movl	SAVE_ESI, %esi

	addl	%edx, %eax		C add 2mod3 low

	ret

EPILOGUE()
