1dnl Intel Pentium mpn_modexact_1_odd -- exact division style remainder. 2 3dnl Copyright 2000-2002, 2014 Free Software Foundation, Inc. 4 5dnl This file is part of the GNU MP Library. 6dnl 7dnl The GNU MP Library is free software; you can redistribute it and/or modify 8dnl it under the terms of either: 9dnl 10dnl * the GNU Lesser General Public License as published by the Free 11dnl Software Foundation; either version 3 of the License, or (at your 12dnl option) any later version. 13dnl 14dnl or 15dnl 16dnl * the GNU General Public License as published by the Free Software 17dnl Foundation; either version 2 of the License, or (at your option) any 18dnl later version. 19dnl 20dnl or both in parallel, as here. 21dnl 22dnl The GNU MP Library is distributed in the hope that it will be useful, but 23dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 24dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 25dnl for more details. 26dnl 27dnl You should have received copies of the GNU General Public License and the 28dnl GNU Lesser General Public License along with the GNU MP Library. If not, 29dnl see https://www.gnu.org/licenses/. 30 31include(`../config.m4') 32 33 34C P5: 23.0 cycles/limb 35 36 37C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size, 38C mp_limb_t divisor); 39C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, 40C mp_limb_t divisor, mp_limb_t carry); 41C 42C There seems no way to pair up the two lone instructions in the main loop. 43C 44C The special case for size==1 saves about 20 cycles (non-PIC), making it 45C the same as mpn_mod_1, and in fact making modexact faster than mod_1 at 46C all sizes. 47C 48C Alternatives: 49C 50C Using mmx for the multiplies might be possible, with pmullw and pmulhw 51C having just 3 cycle latencies, but carry bit handling would probably be 52C complicated. 53 54defframe(PARAM_CARRY, 16) 55defframe(PARAM_DIVISOR,12) 56defframe(PARAM_SIZE, 8) 57defframe(PARAM_SRC, 4) 58 59dnl re-using parameter space 60define(VAR_INVERSE,`PARAM_SIZE') 61 62 TEXT 63 64 ALIGN(16) 65PROLOGUE(mpn_modexact_1c_odd) 66deflit(`FRAME',0) 67 68 movl PARAM_DIVISOR, %eax 69 movl PARAM_CARRY, %edx 70 71 jmp L(start_1c) 72 73EPILOGUE() 74 75 ALIGN(16) 76PROLOGUE(mpn_modexact_1_odd) 77deflit(`FRAME',0) 78 79 movl PARAM_DIVISOR, %eax 80 xorl %edx, %edx C carry 81 82L(start_1c): 83 84ifdef(`PIC',` 85ifdef(`DARWIN',` 86 shrl %eax C d/2 87 LEA( binvert_limb_table, %ecx) 88 pushl %ebx FRAME_pushl() 89 movl PARAM_SIZE, %ebx 90 91 andl $127, %eax 92 subl $2, %ebx 93 94 movb (%eax,%ecx), %cl 95 jc L(one_limb) 96',` 97 call L(here) FRAME_pushl() 98L(here): 99 100 shrl %eax C d/2 101 movl (%esp), %ecx C eip 102 103 addl $_GLOBAL_OFFSET_TABLE_+[.-L(here)], %ecx 104 movl %ebx, (%esp) C push ebx 105 106 andl $127, %eax 107 movl PARAM_SIZE, %ebx 108 109 movl binvert_limb_table@GOT(%ecx), %ecx 110 subl $2, %ebx 111 112 movb (%eax,%ecx), %cl C inv 8 bits 113 jc L(one_limb) 114') 115',` 116dnl non-PIC 117 shrl %eax C d/2 118 pushl %ebx FRAME_pushl() 119 120 movl PARAM_SIZE, %ebx 121 andl $127, %eax 122 123 subl $2, %ebx 124 jc L(one_limb) 125 126 movb binvert_limb_table(%eax), %cl C inv 8 bits 127') 128 129 movl %ecx, %eax 130 addl %ecx, %ecx C 2*inv 131 132 imull %eax, %eax C inv*inv 133 134 imull PARAM_DIVISOR, %eax C inv*inv*d 135 136 subl %eax, %ecx C inv = 2*inv - inv*inv*d 137 138 movl %ecx, %eax 139 addl %ecx, %ecx C 2*inv 140 141 imull %eax, %eax C inv*inv 142 143 imull PARAM_DIVISOR, %eax C inv*inv*d 144 145 subl %eax, %ecx C inv = 2*inv - inv*inv*d 146 pushl %esi FRAME_pushl() 147 148 ASSERT(e,` C d*inv == 1 mod 2^GMP_LIMB_BITS 149 movl %ecx, %eax 150 imull PARAM_DIVISOR, %eax 151 cmpl $1, %eax') 152 153 movl PARAM_SRC, %esi 154 movl %ecx, VAR_INVERSE 155 156 movl (%esi), %eax C src[0] 157 leal 4(%esi,%ebx,4), %esi C &src[size-1] 158 159 xorl $-1, %ebx C -(size-1) 160 ASSERT(nz) 161 jmp L(entry) 162 163 164C The use of VAR_INVERSE means only a store is needed for that value, rather 165C than a push and pop of say %edi. 166 167 ALIGN(16) 168L(top): 169 C eax scratch, low product 170 C ebx counter, limbs, negative 171 C ecx carry bit 172 C edx scratch, high product 173 C esi &src[size-1] 174 C edi 175 C ebp 176 177 mull PARAM_DIVISOR C h:dummy = q*d 178 179 movl (%esi,%ebx,4), %eax C src[i] 180 subl %ecx, %edx C h -= -c 181 182L(entry): 183 subl %edx, %eax C s = src[i] - h 184 185 sbbl %ecx, %ecx C new -c (0 or -1) 186 187 imull VAR_INVERSE, %eax C q = s*i 188 189 incl %ebx 190 jnz L(top) 191 192 193 mull PARAM_DIVISOR 194 195 movl (%esi), %eax C src high 196 subl %ecx, %edx C h -= -c 197 198 cmpl PARAM_DIVISOR, %eax 199 200 jbe L(skip_last) 201deflit(FRAME_LAST,FRAME) 202 203 204 subl %edx, %eax C s = src[i] - h 205 popl %esi FRAME_popl() 206 207 sbbl %ecx, %ecx C c (0 or -1) 208 popl %ebx FRAME_popl() 209 210 imull VAR_INVERSE, %eax C q = s*i 211 212 mull PARAM_DIVISOR C h:dummy = q*d 213 214 movl %edx, %eax 215 216 subl %ecx, %eax 217 218 ret 219 220 221C When high<divisor can skip last step. 222 223L(skip_last): 224deflit(`FRAME',FRAME_LAST) 225 C eax src high 226 C ebx 227 C ecx 228 C edx r 229 C esi 230 231 subl %eax, %edx C r-s 232 popl %esi FRAME_popl() 233 234 sbbl %eax, %eax C -1 if underflow 235 movl PARAM_DIVISOR, %ebx 236 237 andl %ebx, %eax C divisor if underflow 238 popl %ebx FRAME_popl() 239 240 addl %edx, %eax C addback if underflow 241 242 ret 243 244 245C Special case for size==1 using a division for r = c-a mod d. 246C Could look for a-c<d and save a division sometimes, but that doesn't seem 247C worth bothering about. 248 249L(one_limb): 250deflit(`FRAME',4) 251 C eax 252 C ebx size-2 (==-1) 253 C ecx 254 C edx carry 255 C esi src end 256 C edi 257 C ebp 258 259 movl %edx, %eax 260 movl PARAM_SRC, %edx 261 262 movl PARAM_DIVISOR, %ecx 263 popl %ebx FRAME_popl() 264 265 subl (%edx), %eax C c-a 266 267 sbbl %edx, %edx 268 decl %ecx C d-1 269 270 andl %ecx, %edx C b*d+c-a if c<a, or c-a if c>=a 271 272 divl PARAM_DIVISOR 273 274 movl %edx, %eax 275 276 ret 277 278EPILOGUE() 279ASM_END() 280