dnl  x86 mpn_gcd_11 optimised for AMD K7.

dnl  Contributed to the GNU project by Kevin Ryde.  Rehacked by Torbjorn
dnl  Granlund.

dnl  Copyright 2000-2002, 2005, 2009, 2011, 2012, 2014, 2015 Free Software
dnl  Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C               cycles/bit (approx)
C AMD K7         5.31
C AMD K8,K9      5.33
C AMD K10        5.30
C AMD bd1        ?
C AMD bobcat     7.02
C Intel P4-2    10.1
C Intel P4-3/4  10.0
C Intel P6/13    5.88
C Intel core2    6.26
C Intel NHM      6.83
C Intel SBR      8.50
C Intel atom     8.90
C VIA nano       ?
C Numbers measured with: speed -CD -s16-32 -t16 mpn_gcd_1


C ctz_table[n] is the number of trailing zeros on n, or MAXSHIFT if n==0.
C MAXSHIFT is the largest shift count a single table lookup can produce.
C MASK selects the low MAXSHIFT bits of a value, giving the table index.

deflit(MAXSHIFT, 6)
deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1))

C One byte per index 0..MASK.  Index 0 stores MAXSHIFT, every other index
C stores the value produced by m4_count_trailing_zeros for that index.

DEF_OBJECT(ctz_table,64)
	.byte	MAXSHIFT
forloop(i,1,MASK,
`	.byte	m4_count_trailing_zeros(i)
')
END_OBJECT(ctz_table)


C Register roles
C   u0   (eax)  operand u, also the return register
C   v0   (edx)  operand v
C   ecx         v - u, then table index, then shift count
C   edi         copy of u taken before the subtraction
C   esi         address of ctz_table
C
C The two operands are read from the first two stack argument slots and
C the result is returned in eax.  edi and esi are saved and restored,
C ecx and edx are overwritten.
C
C NOTE(review): the loop is entered at the subtraction without stripping
C any low zero bits first, so presumably both operands must be odd and
C nonzero on entry.  Confirm against the callers.

define(`u0',	`%eax')
define(`v0',	`%edx')

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_gcd_11)
	push	%edi
	push	%esi

	mov	12(%esp), u0		C first argument, above the two pushes
	mov	16(%esp), v0		C second argument

	LEAL(	ctz_table, %esi)
	jmp	L(sub)

	ALIGN(16)
L(loop):
	cmovc(	%ecx, u0)		C borrow from u - v: take v - u instead
	cmovc(	%edi, v0)		C borrow from u - v: v becomes the old u
L(strip):
	and	$MASK, %ecx		C table index, ZF set when it is zero
	movzbl	(%esi,%ecx), %ecx	C shift count, flags left untouched
	jz	L(far)			C index was zero, more bits to strip
	shr	%cl, u0			C drop the low zero bits of u
L(sub):
	mov	u0, %edi		C keep u for the conditional move
	mov	v0, %ecx
	sub	u0, %ecx		C ecx = v - u
	sub	v0, u0			C u = u - v, sets CF and ZF
	jnz	L(loop)			C operands differ, go round again

L(done):
	mov	v0, %eax		C operands are equal, return v
	pop	%esi
	pop	%edi
	ret

L(far):
	shr	$MAXSHIFT, u0		C strip MAXSHIFT bits in one go
	mov	u0, %ecx		C and look up the remaining low bits
	jmp	L(strip)
EPILOGUE()
ASM_END()