1dnl AMD K7 mpn_gcd_1 -- mpn by 1 gcd. 2 3dnl Copyright 2000, 2001, 2002, 2009 Free Software Foundation, Inc. 4dnl 5dnl This file is part of the GNU MP Library. 6dnl 7dnl The GNU MP Library is free software; you can redistribute it and/or 8dnl modify it under the terms of the GNU Lesser General Public License as 9dnl published by the Free Software Foundation; either version 3 of the 10dnl License, or (at your option) any later version. 11dnl 12dnl The GNU MP Library is distributed in the hope that it will be useful, 13dnl but WITHOUT ANY WARRANTY; without even the implied warranty of 14dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15dnl Lesser General Public License for more details. 16dnl 17dnl You should have received a copy of the GNU Lesser General Public License 18dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 19 20include(`../config.m4') 21 22 23C K7: 6.75 cycles/bit (approx) 1x1 gcd 24C 11.0 cycles/limb Nx1 reduction (modexact_1_odd) 25 26 27dnl Reduce using x%y if x is more than DIV_THRESHOLD bits bigger than y, 28dnl where x is the larger of the two. See tune/README for more. 29dnl 30dnl divl at 40 cycles compared to the gcd at about 7 cycles/bitpair 31dnl suggests 40/7*2=11.4 but 7 seems to be about right. 32 33deflit(DIV_THRESHOLD, 7) 34 35 36C table[n] is the number of trailing zeros on n, or MAXSHIFT if n==0. 37C 38C This is mixed in with the code, but as per the k7 optimization manual it's 39C a full cache line and suitably aligned so it won't get swapped between 40C code and data. Having it in TEXT rather than RODATA saves needing a GOT 41C entry when PIC. 42C 43C Actually, there doesn't seem to be a measurable difference between this in 44C it's own cache line or plonked in the middle of the code. Presumably 45C since TEXT is read-only there's no worries about coherency. 46 47deflit(MAXSHIFT, 6) 48deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1)) 49 50 TEXT 51 ALIGN(64) 52L(table): 53 .byte MAXSHIFT 54forloop(i,1,MASK, 55` .byte m4_count_trailing_zeros(i) 56') 57 58 59C mp_limb_t mpn_gcd_1 (mp_srcptr src, mp_size_t size, mp_limb_t limb); 60C 61 62defframe(PARAM_LIMB, 12) 63defframe(PARAM_SIZE, 8) 64defframe(PARAM_SRC, 4) 65 66defframe(SAVE_EBX, -4) 67defframe(SAVE_ESI, -8) 68defframe(SAVE_EDI, -12) 69defframe(SAVE_EBP, -16) 70defframe(CALL_DIVISOR,-20) 71defframe(CALL_SIZE, -24) 72defframe(CALL_SRC, -28) 73 74deflit(STACK_SPACE, 28) 75 76 TEXT 77 ALIGN(16) 78 79PROLOGUE(mpn_gcd_1) 80deflit(`FRAME',0) 81 82 ASSERT(ne, `cmpl $0, PARAM_LIMB') C y!=0 83 ASSERT(ae, `cmpl $1, PARAM_SIZE') C size>=1 84 85 mov PARAM_SRC, %eax 86 mov PARAM_LIMB, %edx 87 sub $STACK_SPACE, %esp deflit(`FRAME',STACK_SPACE) 88 89 mov %esi, SAVE_ESI 90 mov %ebx, SAVE_EBX 91 92 mov (%eax), %esi C src low limb 93 94ifdef(`PIC',` 95 mov %edi, SAVE_EDI 96 call L(movl_eip_to_edi) 97L(here): 98 add $L(table)-L(here), %edi 99') 100 101 mov %esi, %ebx 102 or %edx, %esi C x|y 103 mov $-1, %ecx 104 105L(twos): 106 inc %ecx 107 shr %esi 108 jnc L(twos) C 3/4 chance of x or y odd already 109 110 shr %cl, %ebx 111 shr %cl, %edx 112 mov %ecx, %esi C common twos 113 114 mov PARAM_SIZE, %ecx 115 cmp $1, %ecx 116 ja L(divide) 117 118 119 C eax 120 C ebx x 121 C ecx 122 C edx y 123 C esi common twos 124 C edi [PIC] L(table) 125 C ebp 126 127 mov %edx, %eax 128 cmp %ebx, %edx 129 130 cmovb( %ebx, %eax) C swap to make x bigger than y 131 cmovb( %edx, %ebx) 132 133 134L(strip_y): 135 C eax x 136 C ebx y 137 C ecx 138 C edx 139 C esi common twos 140 C edi [PIC] L(table) 141 C ebp 142 143 ASSERT(nz,`orl %ebx,%ebx') 144 shr %ebx 145 jnc L(strip_y) 146 rcl %ebx 147 148 149 C eax x 150 C ebx y (odd) 151 C ecx 152 C edx 153 C esi common twos 154 C edi [PIC] L(table) 155 C ebp 156 157 mov %eax, %ecx 158 mov %ebx, %edx 159 shr $DIV_THRESHOLD, %eax 160 161 cmp %eax, %ebx 162 mov %ecx, %eax 163 ja L(strip_x_entry) C do x%y if x much bigger than y 164 165 166 xor %edx, %edx 167 168 div %ebx 169 170 or %edx, %edx 171 mov %edx, %ecx C remainder -> x 172 mov %ebx, %edx C y 173 174 jz L(done_ebx) 175 jmp L(strip_x) 176 177 178 C Offset 0x9D here for non-PIC. About 0.4 cycles/bit is saved by 179 C ensuring the end of the jnz at the end of this loop doesn't cross 180 C into the next cache line at 0xC0. 181 C 182 C PIC on the other hand is offset 0xAC here and extends to 0xC9, so 183 C it crosses but doesn't suffer any measurable slowdown. 184 185L(top): 186 C eax x 187 C ebx y-x 188 C ecx x-y 189 C edx y 190 C esi twos, for use at end 191 C edi [PIC] L(table) 192 193 cmovc( %ebx, %ecx) C if x-y gave carry, use x and y-x 194 cmovc( %eax, %edx) 195 196L(strip_x): 197 mov %ecx, %eax 198L(strip_x_entry): 199 and $MASK, %ecx 200 201 ASSERT(nz, `orl %eax, %eax') 202 203ifdef(`PIC',` 204 mov (%ecx,%edi), %cl 205',` 206 mov L(table) (%ecx), %cl 207') 208 209 shr %cl, %eax 210 cmp $MAXSHIFT, %cl 211 212 mov %eax, %ecx 213 mov %edx, %ebx 214 je L(strip_x) 215 216 ASSERT(nz, `test $1, %eax') C both odd 217 ASSERT(nz, `test $1, %edx') 218 219 sub %eax, %ebx 220 sub %edx, %ecx 221 jnz L(top) 222 223 224L(done): 225 mov %esi, %ecx 226 mov SAVE_ESI, %esi 227ifdef(`PIC',` 228 mov SAVE_EDI, %edi 229') 230 231 shl %cl, %eax 232 mov SAVE_EBX, %ebx 233 add $FRAME, %esp 234 235 ret 236 237 238 239C ----------------------------------------------------------------------------- 240C two or more limbs 241 242dnl MODEXACT_THRESHOLD is the size at which it's better to call 243dnl mpn_modexact_1_odd than do an inline loop. 244 245deflit(MODEXACT_THRESHOLD, ifdef(`PIC',6,5)) 246 247L(divide): 248 C eax src 249 C ebx 250 C ecx size 251 C edx y 252 C esi common twos 253 C edi [PIC] L(table) 254 C ebp 255 256L(divide_strip_y): 257 ASSERT(nz,`or %edx,%edx') 258 shr %edx 259 jnc L(divide_strip_y) 260 lea 1(%edx,%edx), %ebx C y now odd 261 262 mov %ebp, SAVE_EBP 263 mov %eax, %ebp 264 mov -4(%eax,%ecx,4), %eax C src high limb 265 266 cmp $MODEXACT_THRESHOLD, %ecx 267 jae L(modexact) 268 269 cmp %ebx, %eax C high cmp divisor 270 mov $0, %edx 271 272 cmovc( %eax, %edx) C skip a div if high<divisor 273 sbb $0, %ecx 274 275 276L(divide_top): 277 C eax scratch (quotient) 278 C ebx y 279 C ecx counter (size to 1, inclusive) 280 C edx carry (remainder) 281 C esi common twos 282 C edi [PIC] L(table) 283 C ebp src 284 285 mov -4(%ebp,%ecx,4), %eax 286 287 div %ebx 288 289 dec %ecx 290 jnz L(divide_top) 291 292 293 C eax 294 C ebx y (odd) 295 C ecx 296 C edx x 297 C esi common twos 298 C edi [PIC] L(table) 299 C ebp 300 301 or %edx, %edx 302 mov SAVE_EBP, %ebp 303 mov %edx, %eax 304 305 mov %edx, %ecx 306 mov %ebx, %edx 307 jnz L(strip_x_entry) 308 309 310L(done_ebx): 311 mov %ebx, %eax 312 jmp L(done) 313 314 315 316L(modexact): 317 C eax 318 C ebx y 319 C ecx size 320 C edx 321 C esi common twos 322 C edi [PIC] L(table) 323 C ebp src 324 325ifdef(`PIC',` 326 mov %ebp, CALL_SRC 327 mov %ebx, %ebp C y 328 mov %edi, %ebx C L(table) 329 330 add $_GLOBAL_OFFSET_TABLE_+[.-L(table)], %ebx 331 mov %ebp, CALL_DIVISOR 332 mov %ecx, CALL_SIZE 333 334 call GSYM_PREFIX`'mpn_modexact_1_odd@PLT 335',` 336dnl non-PIC 337 mov %ebx, CALL_DIVISOR 338 mov %ebp, CALL_SRC 339 mov %ecx, CALL_SIZE 340 341 call GSYM_PREFIX`'mpn_modexact_1_odd 342') 343 344 C eax x 345 C ebx [non-PIC] y 346 C ecx 347 C edx 348 C esi common twos 349 C edi [PIC] L(table) 350 C ebp [PIC] y 351 352 or %eax, %eax 353 mov ifdef(`PIC',`%ebp',`%ebx'), %edx 354 mov SAVE_EBP, %ebp 355 356 mov %eax, %ecx 357 jnz L(strip_x_entry) 358 359 mov %edx, %eax 360 jmp L(done) 361 362 363ifdef(`PIC', ` 364L(movl_eip_to_edi): 365 mov (%esp), %edi 366 ret_internal 367') 368 369EPILOGUE() 370