dnl  AMD K6 mpn_mod_34lsub1 -- mpn remainder modulo 2**24-1.

dnl  Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C K6: 2.66 cycles/limb


C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size)
C
C Returns a value congruent to the size-limb number at src modulo 2^24-1.
C The result is not a fully reduced remainder: for size==1 the limb src[0]
C is returned unchanged, and otherwise the return value is a plain sum of
C pieces of at most 24 bits each.
C
C Limbs are 32 bits, so limb i has weight 2^(32*i), which modulo 2^24-1 is
C 2^(8*(i mod 3)).  The limbs are therefore summed into three accumulators
C selected by i mod 3, and each accumulator is finally split at the bit
C where its weight reaches 2^24 and the two pieces are added to the result.
C
C size is assumed to be at least 1.  (A size of 0 would take the one-limb
C path and read src[0].)
C
C Registers: %eax, %ecx and %edx are used as scratch.  %ebx and %esi are
C saved in the parameter slots once the parameters have been loaded, and
C %edi is pushed on the stack; all three are restored before returning.
C
C An attempt was made to use a loop like
C
C L(top):
C	adcl	(%edx), %eax
C	adcl	4(%edx), %ebx
C	adcl	8(%edx), %esi
C	leal	12(%edx), %edx
C	loop	L(top)
C
C with %ecx starting from floor(size/3), but it still measured 2.66 c/l.
C The form used instead can save about 6 cycles by not dividing by 3.
C
C In the code used, putting the "leal"s at the top of the loop is necessary
C for the claimed speed, anywhere else costs an extra cycle per loop.
C Perhaps a tight loop like this needs short decode instructions at the
C branch target, which would explain the leal/loop form above taking 8
C cycles instead of 7 too.

defframe(PARAM_SIZE, 8)
defframe(PARAM_SRC,  4)

dnl  re-use parameter space: both parameters are copied into registers
dnl  before these slots are overwritten
define(SAVE_EBX, `PARAM_SIZE')
define(SAVE_ESI, `PARAM_SRC')

	TEXT
	ALIGN(16)
PROLOGUE(mpn_mod_34lsub1)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %eax
	movl	PARAM_SRC, %edx

	subl	$2, %eax
	ja	L(three_or_more)	C unsigned size > 2

	C size==1 or size==2 here.  The movl leaves the flags from the subl
	C untouched, so the jne below still tests size-2 against zero.
	C NOTE(review): Zdisp comes from the included m4 definitions and
	C presumably forces an explicit zero displacement to lengthen the
	C instruction - confirm there.

Zdisp(	movl,	0,(%edx), %eax)		C avoid code cache line boundary
	jne	L(one)			C size==1, return src[0] as it is

	C size==2: src[0] has weight 1, src[1] has weight 2^32 which is 2^8
	C modulo 2^24-1

	movl	%eax, %ecx
	movl	4(%edx), %edx

	shrl	$24, %eax		C src[0] high 8 bits
	andl	$0x00FFFFFF, %ecx	C src[0] low 24 bits

	addl	%ecx, %eax
	movl	%edx, %ecx

	shll	$8, %edx
	andl	$0x00FFFF00, %edx	C src[1] low 16 bits, times 2^8

	shrl	$16, %ecx		C src[1] high 16 bits (weight 2^24, ie. 1)
	addl	%ecx, %eax

	addl	%edx, %eax

L(one):
	ret


L(three_or_more):
	C eax	size-2
	C ebx
	C ecx
	C edx	src

	C NOTE(review): FRAME_pushl() is m4 bookkeeping from the included
	C definitions, presumably keeping the defframe offsets right after
	C the push - confirm there.

	movl	%ebx, SAVE_EBX
	xorl	%ebx, %ebx

	movl	%esi, SAVE_ESI
	pushl	%edi	FRAME_pushl()

	xorl	%esi, %esi
	xorl	%edi, %edi		C and clear carry flag

L(top):
	C eax	counter, limbs (remaining limbs less 2, positive here)
	C ebx	acc 0mod3
	C ecx
	C edx	src, incrementing
	C esi	acc 1mod3
	C edi	acc 2mod3
	C ebp
	C
	C Three limbs are added per pass.  The adcl chain carries from each
	C accumulator into the next, and the carry out of the 2mod3
	C accumulator (weight 2^96, which is 1 modulo 2^24-1) is picked up by
	C the 0mod3 adcl of the following pass.  leal does not touch the flags
	C and decl leaves the carry flag alone, so that carry survives the
	C counter update and the loop branch.

	leal	-2(%eax), %eax
	leal	12(%edx), %edx

	adcl	-12(%edx), %ebx
	adcl	-8(%edx), %esi
	adcl	-4(%edx), %edi

	decl	%eax			C counter down by 3 in total per pass
	jg	L(top)


	C eax is -2, -1 or 0 representing 0, 1 or 2 more limbs, respectively
	C
	C cl is set to the bit position (0, 8 or 16) at which the carry out of
	C the last adcl must be added back in.  movb, incl and decl all leave
	C the carry flag as it is.

	movb	$0, %cl
	incl	%eax

	js	L(combine)		C 0 more

Zdisp(	adcl,	0,(%edx), %ebx)		C avoid code cache line crossings

	movb	$8, %cl
	decl	%eax

	js	L(combine)		C 1 more

	adcl	4(%edx), %esi

	movb	$16, %cl


L(combine):
	C ebx	acc 0mod3, weight 1
	C cl	bit position for the pending carry
	C esi	acc 1mod3, weight 2^8
	C edi	acc 2mod3, weight 2^16
	C carry flag	pending carry out of the last adcl

	sbbl	%edx, %edx		C 0, or -1 if a carry is pending

	shll	%cl, %edx		C minus (carry << cl)
	movl	%ebx, %eax		C 0mod3

	shrl	$24, %eax		C 0mod3 high
	andl	$0x00FFFFFF, %ebx	C 0mod3 low

	subl	%edx, %eax		C apply carry, ie. add carry << cl
	movl	%esi, %ecx		C 1mod3

	shrl	$16, %esi		C 1mod3 high
	addl	%ebx, %eax		C apply 0mod3 low

	andl	$0x0000FFFF, %ecx
	addl	%esi, %eax		C apply 1mod3 high

	shll	$8, %ecx		C 1mod3 low
	movl	%edi, %edx		C 2mod3

	shrl	$8, %edx		C 2mod3 high
	addl	%ecx, %eax		C apply 1mod3 low

	addl	%edx, %eax		C apply 2mod3 high
	andl	$0x000000FF, %edi

	shll	$16, %edi		C 2mod3 low
	movl	SAVE_EBX, %ebx

	addl	%edi, %eax		C apply 2mod3 low
	movl	SAVE_ESI, %esi

	popl	%edi

	ret

EPILOGUE()
