mode1o.asm revision 1.1.1.1
1dnl Intel Pentium mpn_modexact_1_odd -- exact division style remainder. 2 3dnl Copyright 2000, 2001, 2002 Free Software Foundation, Inc. 4dnl 5dnl This file is part of the GNU MP Library. 6dnl 7dnl The GNU MP Library is free software; you can redistribute it and/or 8dnl modify it under the terms of the GNU Lesser General Public License as 9dnl published by the Free Software Foundation; either version 3 of the 10dnl License, or (at your option) any later version. 11dnl 12dnl The GNU MP Library is distributed in the hope that it will be useful, 13dnl but WITHOUT ANY WARRANTY; without even the implied warranty of 14dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15dnl Lesser General Public License for more details. 16dnl 17dnl You should have received a copy of the GNU Lesser General Public License 18dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 19 20include(`../config.m4') 21 22 23C P5: 23.0 cycles/limb 24 25 26C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size, 27C mp_limb_t divisor); 28C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, 29C mp_limb_t divisor, mp_limb_t carry); 30C 31C There seems no way to pair up the two lone instructions in the main loop. 32C 33C The special case for size==1 saves about 20 cycles (non-PIC), making it 34C the same as mpn_mod_1, and in fact making modexact faster than mod_1 at 35C all sizes. 36C 37C Alternatives: 38C 39C Using mmx for the multiplies might be possible, with pmullw and pmulhw 40C having just 3 cycle latencies, but carry bit handling would probably be 41C complicated. 42 43defframe(PARAM_CARRY, 16) 44defframe(PARAM_DIVISOR,12) 45defframe(PARAM_SIZE, 8) 46defframe(PARAM_SRC, 4) 47 48dnl re-using parameter space 49define(VAR_INVERSE,`PARAM_SIZE') 50 51 TEXT 52 53 ALIGN(16) 54PROLOGUE(mpn_modexact_1c_odd) 55deflit(`FRAME',0) 56 57 movl PARAM_DIVISOR, %eax 58 movl PARAM_CARRY, %edx 59 60 jmp L(start_1c) 61 62EPILOGUE() 63 64 ALIGN(16) 65PROLOGUE(mpn_modexact_1_odd) 66deflit(`FRAME',0) 67 68 movl PARAM_DIVISOR, %eax 69 xorl %edx, %edx C carry 70 71L(start_1c): 72 73ifdef(`PIC',` 74 call L(here) FRAME_pushl() 75L(here): 76 77 shrl %eax C d/2 78 movl (%esp), %ecx C eip 79 80 addl $_GLOBAL_OFFSET_TABLE_+[.-L(here)], %ecx 81 movl %ebx, (%esp) C push ebx 82 83 andl $127, %eax 84 movl PARAM_SIZE, %ebx 85 86 movl binvert_limb_table@GOT(%ecx), %ecx 87 subl $2, %ebx 88 89 movb (%eax,%ecx), %cl C inv 8 bits 90 jc L(one_limb) 91 92',` 93dnl non-PIC 94 shrl %eax C d/2 95 pushl %ebx FRAME_pushl() 96 97 movl PARAM_SIZE, %ebx 98 andl $127, %eax 99 100 subl $2, %ebx 101 jc L(one_limb) 102 103 movb binvert_limb_table(%eax), %cl C inv 8 bits 104') 105 106 movl %ecx, %eax 107 addl %ecx, %ecx C 2*inv 108 109 imull %eax, %eax C inv*inv 110 111 imull PARAM_DIVISOR, %eax C inv*inv*d 112 113 subl %eax, %ecx C inv = 2*inv - inv*inv*d 114 115 movl %ecx, %eax 116 addl %ecx, %ecx C 2*inv 117 118 imull %eax, %eax C inv*inv 119 120 imull PARAM_DIVISOR, %eax C inv*inv*d 121 122 subl %eax, %ecx C inv = 2*inv - inv*inv*d 123 pushl %esi FRAME_pushl() 124 125 ASSERT(e,` C d*inv == 1 mod 2^GMP_LIMB_BITS 126 movl %ecx, %eax 127 imull PARAM_DIVISOR, %eax 128 cmpl $1, %eax') 129 130 movl PARAM_SRC, %esi 131 movl %ecx, VAR_INVERSE 132 133 movl (%esi), %eax C src[0] 134 leal 4(%esi,%ebx,4), %esi C &src[size-1] 135 136 xorl $-1, %ebx C -(size-1) 137 ASSERT(nz) 138 jmp L(entry) 139 140 141C The use of VAR_INVERSE means only a store is needed for that value, rather 142C than a push and pop of say %edi. 143 144 ALIGN(16) 145L(top): 146 C eax scratch, low product 147 C ebx counter, limbs, negative 148 C ecx carry bit 149 C edx scratch, high product 150 C esi &src[size-1] 151 C edi 152 C ebp 153 154 mull PARAM_DIVISOR C h:dummy = q*d 155 156 movl (%esi,%ebx,4), %eax C src[i] 157 subl %ecx, %edx C h -= -c 158 159L(entry): 160 subl %edx, %eax C s = src[i] - h 161 162 sbbl %ecx, %ecx C new -c (0 or -1) 163 164 imull VAR_INVERSE, %eax C q = s*i 165 166 incl %ebx 167 jnz L(top) 168 169 170 mull PARAM_DIVISOR 171 172 movl (%esi), %eax C src high 173 subl %ecx, %edx C h -= -c 174 175 cmpl PARAM_DIVISOR, %eax 176 177 jbe L(skip_last) 178deflit(FRAME_LAST,FRAME) 179 180 181 subl %edx, %eax C s = src[i] - h 182 popl %esi FRAME_popl() 183 184 sbbl %ecx, %ecx C c (0 or -1) 185 popl %ebx FRAME_popl() 186 187 imull VAR_INVERSE, %eax C q = s*i 188 189 mull PARAM_DIVISOR C h:dummy = q*d 190 191 movl %edx, %eax 192 193 subl %ecx, %eax 194 195 ret 196 197 198C When high<divisor can skip last step. 199 200L(skip_last): 201deflit(`FRAME',FRAME_LAST) 202 C eax src high 203 C ebx 204 C ecx 205 C edx r 206 C esi 207 208 subl %eax, %edx C r-s 209 popl %esi FRAME_popl() 210 211 sbbl %eax, %eax C -1 if underflow 212 movl PARAM_DIVISOR, %ebx 213 214 andl %ebx, %eax C divisor if underflow 215 popl %ebx FRAME_popl() 216 217 addl %edx, %eax C addback if underflow 218 219 ret 220 221 222C Special case for size==1 using a division for r = c-a mod d. 223C Could look for a-c<d and save a division sometimes, but that doesn't seem 224C worth bothering about. 225 226L(one_limb): 227deflit(`FRAME',4) 228 C eax 229 C ebx size-2 (==-1) 230 C ecx 231 C edx carry 232 C esi src end 233 C edi 234 C ebp 235 236 movl %edx, %eax 237 movl PARAM_SRC, %edx 238 239 movl PARAM_DIVISOR, %ecx 240 popl %ebx FRAME_popl() 241 242 subl (%edx), %eax C c-a 243 244 sbbl %edx, %edx 245 decl %ecx C d-1 246 247 andl %ecx, %edx C b*d+c-a if c<a, or c-a if c>=a 248 249 divl PARAM_DIVISOR 250 251 movl %edx, %eax 252 253 ret 254 255EPILOGUE() 256