1dnl  AMD K7 mpn_modexact_1_odd -- exact division style remainder.
2
3dnl  Copyright 2000, 2001, 2002, 2004, 2007 Free Software Foundation, Inc.
4dnl
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or
8dnl  modify it under the terms of the GNU Lesser General Public License as
9dnl  published by the Free Software Foundation; either version 3 of the
10dnl  License, or (at your option) any later version.
11dnl
12dnl  The GNU MP Library is distributed in the hope that it will be useful,
13dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
14dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15dnl  Lesser General Public License for more details.
16dnl
17dnl  You should have received a copy of the GNU Lesser General Public License
18dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
20include(`../config.m4')
21
22
23C          cycles/limb
24C Athlon:     11.0
25C Hammer:      7.0
26
27
28C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size,
29C                               mp_limb_t divisor);
30C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
31C                                mp_limb_t divisor, mp_limb_t carry);
32C
33C With the loop running at just 11 cycles it doesn't seem worth bothering to
34C check for high<divisor to save one step.
35C
36C Using a divl for size==1 measures slower than the modexact method, which
37C is not too surprising since for the latter it's only about 24 cycles to
38C calculate the modular inverse.
39
C Stack-frame slot names used by both entry points below.
C
C The PARAM_* offsets follow the argument order in the prototypes above:
C src, size, divisor, then carry.  PARAM_CARRY is read only by the
C mpn_modexact_1c_odd entry point.
40defframe(PARAM_CARRY,  16)
41defframe(PARAM_DIVISOR,12)
42defframe(PARAM_SIZE,   8)
43defframe(PARAM_SRC,    4)
44
C Save slots for ebx, esi, edi and ebp, which are stored near the start of
C the shared body and reloaded before ret.  The negative offsets are
C presumably relative to the stack pointer at function entry; defframe is
C defined outside this file -- TODO confirm there.
45defframe(SAVE_EBX,     -4)
46defframe(SAVE_ESI,     -8)
47defframe(SAVE_EDI,    -12)
48defframe(SAVE_EBP,    -16)
49
C Bytes subtracted from esp on entry and added back before ret; this is
C four 4-byte save slots.
50deflit(STACK_SPACE, 16)
51
52	TEXT
53
54	ALIGN(16)
C mpn_modexact_1c_odd -- entry point taking an initial carry.
C
C This is only a stub: it loads the carry argument into ecx and jumps to
C L(start_1c) inside mpn_modexact_1_odd, so both entry points share one
C body.  No stack space is allocated here, which is why FRAME is declared as
C 0 for the PARAM_CARRY read.
55PROLOGUE(mpn_modexact_1c_odd)
56deflit(`FRAME',0)
57
58	movl	PARAM_CARRY, %ecx
59	jmp	L(start_1c)
60
61EPILOGUE()
62
63
64	ALIGN(16)
C mpn_modexact_1_odd -- entry point with no initial carry.
C
C Outline of the code below:
C   1. Build an inverse of the divisor modulo 2^GMP_LIMB_BITS, starting from
C      a byte looked up in binvert_limb_table and refining it twice.
C   2. Walk the source limbs from low to high.  Each step subtracts the
C      running carry from the limb, multiplies by the inverse, and takes the
C      high half of that result times the divisor as the next carry limb.
C   3. Return carry bit plus carry limb in eax.
C
C ebx, esi, edi and ebp are saved to the stack frame and restored before
C returning.  eax, ecx and edx are overwritten.
C
C NOTE(review): the name says the divisor is odd, and the table index below
C discards the low bit of the divisor, but nothing here checks that it is
C odd -- presumably a caller requirement, confirm.
65PROLOGUE(mpn_modexact_1_odd)
66deflit(`FRAME',0)
67
C No carry argument for this entry point, so start from zero.
68	xorl	%ecx, %ecx
C Shared body.  mpn_modexact_1c_odd jumps here with its carry argument
C already in ecx.
69L(start_1c):
70	movl	PARAM_DIVISOR, %eax
71	subl	$STACK_SPACE, %esp	FRAME_subl_esp(STACK_SPACE)
72
73	movl	%esi, SAVE_ESI
74	movl	PARAM_DIVISOR, %esi
75
76	movl	%edi, SAVE_EDI
77
C Table index is (d >> 1) & 127, that is bits 1 to 7 of the divisor.
78	shrl	%eax			C d/2
79
80	andl	$127, %eax
81
82ifdef(`PIC',`
83	LEA(	binvert_limb_table, %edi)
84	movzbl	(%eax,%edi), %edi		C inv 8 bits
85',`
86	movzbl	binvert_limb_table(%eax), %edi	C inv 8 bits
87')
88
C Two refinement steps of inv = 2*inv - inv*inv*d follow.  The first leaves
C its result in eax, the second in edi.  The remaining register saves and
C parameter loads are interleaved between the multiplies.
89	xorl	%edx, %edx		C initial extra carry
90	leal	(%edi,%edi), %eax	C 2*inv
91
92	imull	%edi, %edi		C inv*inv
93
94	movl	%ebp, SAVE_EBP
95	movl	PARAM_SIZE, %ebp
96
97	movl	%ebx, SAVE_EBX
98	movl	PARAM_SRC, %ebx
99
100	imull	%esi, %edi		C inv*inv*d
101
102	subl	%edi, %eax		C inv = 2*inv - inv*inv*d
103	leal	(%eax,%eax), %edi	C 2*inv
104
105	imull	%eax, %eax		C inv*inv
106
107	imull	%esi, %eax		C inv*inv*d
108
C ebx is moved to one past the last limb and ebp negated, so that
C (ebx,ebp,4) addresses the first limb and ebp counts up towards zero.
109	leal	(%ebx,%ebp,4), %ebx	C src end
110	negl	%ebp			C -size
111
112	subl	%eax, %edi		C inv = 2*inv - inv*inv*d
113
114	ASSERT(e,`	C d*inv == 1 mod 2^GMP_LIMB_BITS
115	movl	%esi, %eax
116	imull	%edi, %eax
117	cmpl	$1, %eax')
118
119
120C The dependent chain here is
121C
122C                            cycles
123C	subl	%edx, %eax	1
124C	imull	%edi, %eax	4
125C	mull	%esi		6  (high limb)
126C			      ----
127C       total		       11
128C
129C Out of order execution hides the load latency for the source data, so no
130C special scheduling is required.
131
132L(top):
133	C eax	src limb
134	C ebx	src end ptr
135	C ecx	next carry bit, 0 or 1 (or initial carry param)
136	C edx	carry limb, high of last product
137	C esi	divisor
138	C edi	inverse
139	C ebp	counter, limbs, negative
140
141	movl	(%ebx,%ebp,4), %eax
142
143	subl	%ecx, %eax		C apply carry bit
	C movl rather than xorl is used to clear ecx, since movl leaves the
	C carry flag from the subl above intact for the setc below.
144	movl	$0, %ecx
145
146	setc	%cl			C new carry bit
147
148	subl	%edx, %eax		C apply carry limb
	C Fold the borrow from the second subtract into the carry bit.
149	adcl	$0, %ecx
150
	C eax = (limb - carries) * inverse, low 32 bits only.
151	imull	%edi, %eax
152
	C edx:eax = eax * divisor.  Only edx, the high half, is used from here
	C on; it is the carry limb for the next iteration.
153	mull	%esi
154
	C Count up from -size to zero.  The test comes after the loop body, so a
	C size of zero is not handled here (presumably callers pass size >= 1 --
	C TODO confirm).
155	incl	%ebp
156	jnz	L(top)
157
158
C Return value in eax is carry bit plus carry limb, formed with leal.  The
C saved registers are reloaded and the save area released before ret.
159	movl	SAVE_ESI, %esi
160	movl	SAVE_EDI, %edi
161	leal	(%ecx,%edx), %eax
162
163	movl	SAVE_EBX, %ebx
164	movl	SAVE_EBP, %ebp
165	addl	$STACK_SPACE, %esp
166
167	ret
168
169EPILOGUE()
170