1dnl  Generic x86 mpn_mod_34lsub1 -- mpn remainder modulo 2^24-1.
2
3dnl  Copyright 2000-2002, 2004 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33
34C      cycles/limb
35C P5	  3.0
36C P6	  3.66
37C K6	  3.0
38C K7	  1.3
39C P4	  9
40
41
42C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size)
43C
44
C Names for the stack slots of the two arguments listed in the prototype.
C NOTE(review): defframe is not defined in this file; presumably it comes
C from the included m4 setup and turns each name into a stack-pointer
C relative operand that follows the current FRAME value -- confirm.
45defframe(PARAM_SIZE, 8)
46defframe(PARAM_SRC,  4)
47
48dnl  re-use parameter space: SAVE_EBX is the PARAM_SRC slot.  The function
dnl  copies src into edx before it stores ebx there, so the slot is free.
49define(SAVE_EBX, `PARAM_SRC')
50
C mpn_mod_34lsub1 -- fold the size-limb number at src into one 32-bit value
C that is congruent to it modulo 2^24-1.
C
C Since 2^24 is congruent to 1 mod 2^24-1, a limb at index i (weight 2^(32*i))
C has an effective weight of 1, 2^8 or 2^16 according to i mod 3.  The code
C sums the limbs of each class, then splits every sum at the bit where its
C weight crosses 2^24 and adds the pieces together.
C
C In:   PARAM_SRC = src, PARAM_SIZE = size (limb count).
C       NOTE(review): size is presumably >= 1; size 0 is not distinguished
C       from size 1 below and would still load src[0] -- confirm with callers.
C Out:  eax = result.  It is not reduced to the range 0 .. 2^24-2; for
C       size 1 the limb is returned unchanged.
C Regs: ecx and edx are scratch on every path; ebx, esi and edi are touched
C       only on the three-or-more path, which saves and restores them.
51	TEXT
52	ALIGN(16)
53PROLOGUE(mpn_mod_34lsub1)
54deflit(`FRAME',0)
55
56	movl	PARAM_SIZE, %ecx
57	movl	PARAM_SRC, %edx
58
	C One subtraction classifies size: above 2 goes to the general loop,
	C exactly 2 falls through both branches, below 2 takes the jb.  The
	C movl between the two branches leaves the flags from subl intact.
59	subl	$2, %ecx
60	ja	L(three_or_more)
61
62	movl	(%edx), %eax
63	jb	L(one)
64
	C size == 2:
	C   result = src[0] high 8 bits  + src[0] low 24 bits
	C          + src[1] high 16 bits + (src[1] low 16 bits << 8)
65	movl	4(%edx), %ecx
66	movl	%eax, %edx
67	shrl	$24, %eax		C src[0] high 8 bits
68
69	andl	$0xFFFFFF, %edx		C src[0] low 24 bits
70	addl	%edx, %eax
71	movl	%ecx, %edx
72
73	andl	$0xFFFF, %ecx		C src[1] low 16 bits
74	shrl	$16, %edx		C src[1] high 16 bits
75	addl	%edx, %eax
76
77	shll	$8, %ecx		C src[1] low, weighted by 2^8
78	addl	%ecx, %eax
79
	C size below 2 jumps here with eax = src[0]; size 2 falls in with the
	C folded sum in eax.
80L(one):
81	ret
82
83
84L(three_or_more):
85	C eax
86	C ebx
87	C ecx	size-2
88	C edx	src
89	C esi
90	C edi
91	C ebp
92
	C Save ebx in the src parameter slot, push esi and edi, and zero the
	C three accumulators.  pushl does not modify flags, so the carry
	C cleared by the last xorl is the one seen by the first adcl.
93	movl	%ebx, SAVE_EBX		C and arrange 16-byte loop alignment
94	xorl	%ebx, %ebx
95
96	pushl	%esi	FRAME_pushl()
97	xorl	%esi, %esi
98
99	pushl	%edi	FRAME_pushl()
100	xorl	%eax, %eax		C and clear carry flag
101
102
103	C offset 0x40 here
104L(top):
105	C eax	acc 0mod3
106	C ebx	acc 1mod3
107	C ecx	counter, limbs
108	C edx	src
109	C esi	acc 2mod3
110	C edi
111	C ebp
112
	C Each pass consumes three limbs with one adc chain.  The carry out
	C of one accumulator is added into the next one, whose weight is 2^32
	C times larger; the carry out of esi wraps around to eax on the next
	C pass because 2^96 is congruent to 1.  The pointer and counter are
	C stepped with leal, and the last decrement is a decl, because neither
	C disturbs the carry flag that links the passes.
113	leal	12(%edx), %edx
114	leal	-2(%ecx), %ecx
115
116	adcl	-12(%edx), %eax
117	adcl	-8(%edx), %ebx
118	adcl	-4(%edx), %esi
119
	C ecx has dropped by 3 in total for this pass; jg tests the flags
	C produced by decl.
120	decl	%ecx
121	jg	L(top)
122
123
124	C ecx is -2, -1 or 0 representing 0, 1 or 2 more limbs, respectively
125
	C edi is set to a mask matching the weight of the carry that is still
	C pending when the chain stops: all ones for weight 1 (stopped after
	C esi), 0xFFFFFF00 for 2^8 (stopped after eax), 0xFFFF0000 for 2^16
	C (stopped after ebx).  movl, incl, decl and js leave the carry flag
	C alone, so the adc chain continues across them.
126	movl	$0xFFFFFFFF, %edi
127	incl	%ecx
128	js	L(combine)
129
130	adcl	(%edx), %eax
131	movl	$0xFFFFFF00, %edi
132	decl	%ecx
133	js	L(combine)
134
135	adcl	4(%edx), %ebx
136	movl	$0xFFFF0000, %edi
137
138
139L(combine):
140	C eax	acc 0mod3
141	C ebx	acc 1mod3
142	C ecx
143	C edx
144	C esi	acc 2mod3
145	C edi	mask
146	C ebp
147
	C sbbl turns the pending carry into 0 or -1; masking with edi makes
	C that 0 or minus the carry weight, and subtracting it adds the
	C weight to the result.  Each accumulator is then split at the bit
	C where its weight reaches 2^24: eax at bit 24, ebx (weight 2^8) at
	C bit 16, esi (weight 2^16) at bit 8.  The upper piece is added as is
	C and the lower piece is shifted left by the weight of its class.
	C Every piece is below 2^24, so the total cannot overflow 32 bits.
148	sbbl	%ecx, %ecx		C carry
149	movl	%eax, %edx		C 0mod3
150
151	shrl	$24, %eax		C 0mod3 high
152	andl	%edi, %ecx		C carry masked
153
154	subl	%ecx, %eax		C apply carry
155	movl	%ebx, %edi		C 1mod3
156
157	shrl	$16, %ebx		C 1mod3 high
158	andl	$0x00FFFFFF, %edx	C 0mod3 low
159
160	addl	%edx, %eax		C apply 0mod3 low
161	andl	$0xFFFF, %edi
162
163	shll	$8, %edi		C 1mod3 low
164	addl	%ebx, %eax		C apply 1mod3 high
165
166	addl	%edi, %eax		C apply 1mod3 low
167	movl	%esi, %edx		C 2mod3
168
169	shrl	$8, %esi		C 2mod3 high
170	andl	$0xFF, %edx		C 2mod3 low
171
172	shll	$16, %edx		C 2mod3 low
173	addl	%esi, %eax		C apply 2mod3 high
174
175	addl	%edx, %eax		C apply 2mod3 low
176	popl	%edi	FRAME_popl()
177
	C ebx comes back from the parameter slot it was stored in on entry to
	C this path.  NOTE(review): FRAME_pushl and FRAME_popl are defined
	C elsewhere; presumably they keep FRAME in step with the stack pointer
	C so that SAVE_EBX names the same slot here while esi is still
	C pushed -- confirm.
178	movl	SAVE_EBX, %ebx
179	popl	%esi	FRAME_popl()
180
181	ret
182
183EPILOGUE()
184