1dnl AMD K7 mpn_divexact_1 -- mpn by limb exact division. 2 3dnl Copyright 2001, 2002, 2004, 2007 Free Software Foundation, Inc. 4dnl 5dnl This file is part of the GNU MP Library. 6dnl 7dnl The GNU MP Library is free software; you can redistribute it and/or 8dnl modify it under the terms of the GNU Lesser General Public License as 9dnl published by the Free Software Foundation; either version 3 of the 10dnl License, or (at your option) any later version. 11dnl 12dnl The GNU MP Library is distributed in the hope that it will be useful, 13dnl but WITHOUT ANY WARRANTY; without even the implied warranty of 14dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15dnl Lesser General Public License for more details. 16dnl 17dnl You should have received a copy of the GNU Lesser General Public License 18dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 19 20include(`../config.m4') 21 22 23C cycles/limb 24C Athlon: 11.0 25C Hammer: 9.0 26 27 28C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, 29C mp_limb_t divisor); 30C 31C The dependent chain is mul+imul+sub for 11 cycles and that speed is 32C achieved with no special effort. The load and shrld latencies are hidden 33C by out of order execution. 34C 35C It's a touch faster on size==1 to use the mul-by-inverse than divl. 

C Stack frame slots.  The PARAM_ slots are the four incoming arguments and
C the SAVE_ and VAR_ slots are six 4-byte locals, which together account for
C the STACK_SPACE bytes subtracted from esp on entry.
C NOTE(review): defframe is defined outside this file, the offsets are
C presumably relative to esp as it stands at function entry -- confirm.
defframe(PARAM_DIVISOR,16)
defframe(PARAM_SIZE,   12)
defframe(PARAM_SRC,    8)
defframe(PARAM_DST,    4)

defframe(SAVE_EBX,     -4)
defframe(SAVE_ESI,     -8)
defframe(SAVE_EDI,    -12)
defframe(SAVE_EBP,    -16)
defframe(VAR_INVERSE, -20)
defframe(VAR_DST_END, -24)

deflit(STACK_SPACE, 24)

	TEXT

	ALIGN(16)
PROLOGUE(mpn_divexact_1)
deflit(`FRAME',0)

	C Outline of the routine:
	C   1. Strip trailing zero bits from the divisor, counting them in ecx.
	C   2. Compute the inverse of the remaining odd divisor modulo 2^32
	C      from an 8-bit table entry refined by two Newton steps.
	C   3. Walk the limbs low to high.  Each source limb is shifted right
	C      by ecx, pulling in bits from the next limb, the carry from the
	C      previous limb is subtracted, and the result is multiplied by
	C      the inverse to give the quotient limb.
	C
	C ebx, esi, edi and ebp are saved in the frame and restored on both
	C exit paths.  eax, ecx and edx are used without being saved.

	movl	PARAM_DIVISOR, %eax
	subl	$STACK_SPACE, %esp	deflit(`FRAME',STACK_SPACE)
	movl	$-1, %ecx		C shift count

	movl	%ebp, SAVE_EBP
	movl	PARAM_SIZE, %ebp

	movl	%esi, SAVE_ESI
	movl	%edi, SAVE_EDI

	C If there's usually only one or two trailing zero bits then this
	C should be faster than bsfl.
	C
	C ecx starts at -1 and is incremented once per bit shifted out, so
	C when the first 1 bit lands in the carry flag ecx holds the number
	C of trailing zero bits and eax holds the odd part of d shifted right
	C one more place.  A zero divisor would never set carry and so would
	C never leave this loop.
L(strip_twos):
	incl	%ecx
	shrl	%eax
	jnc	L(strip_twos)

	movl	%ebx, SAVE_EBX
	leal	1(%eax,%eax), %ebx	C d without twos
	andl	$127, %eax		C d/2, 7 bits

	C Fetch the 8-bit starting inverse, indexed by the 7 bits in eax.
	C The PIC variant loads the table address into edx first.
ifdef(`PIC',`
	LEA(	binvert_limb_table, %edx)
	movzbl	(%eax,%edx), %eax		C inv 8 bits
',`
	movzbl	binvert_limb_table(%eax), %eax	C inv 8 bits
')

	C Two Newton steps inv = 2*inv - inv*inv*d, taking the 8-bit table
	C value to a full limb inverse (checked by the ASSERT below).  The
	C independent pointer setup is interleaved between the multiplies.

	leal	(%eax,%eax), %edx	C 2*inv
	movl	%ebx, PARAM_DIVISOR	C d without twos

	imull	%eax, %eax		C inv*inv

	movl	PARAM_SRC, %esi
	movl	PARAM_DST, %edi

	imull	%ebx, %eax		C inv*inv*d

	subl	%eax, %edx		C inv = 2*inv - inv*inv*d
	leal	(%edx,%edx), %eax	C 2*inv

	imull	%edx, %edx		C inv*inv

	C From here on src and dst are addressed as end pointer plus a
	C negative limb index in ebp which counts up towards zero.
	leal	(%esi,%ebp,4), %esi	C src end
	leal	(%edi,%ebp,4), %edi	C dst end
	negl	%ebp			C -size

	imull	%ebx, %edx		C inv*inv*d

	subl	%edx, %eax		C inv = 2*inv - inv*inv*d

	ASSERT(e,`	C expect d*inv == 1 mod 2^GMP_LIMB_BITS
	pushl	%eax	FRAME_pushl()
	imull	PARAM_DIVISOR, %eax
	cmpl	$1, %eax
	popl	%eax	FRAME_popl()')

	movl	%eax, VAR_INVERSE
	movl	(%esi,%ebp,4), %eax	C src[0]

	incl	%ebp
	jz	L(one)			C size==1, no neighbouring limb to shift in

	movl	(%esi,%ebp,4), %edx	C src[1]

	C eax = src[0] shifted right by cl with the low bits of src[1]
	C shifted in at the top.
	C NOTE(review): shrdl is written as a macro call here, its
	C definition is outside this file.
	shrdl(	%cl, %edx, %eax)

	C edi is reused as a scratch register inside the loop, so keep the
	C dst end pointer in the frame for reloading.
	movl	%edi, VAR_DST_END
	xorl	%ebx, %ebx		C no carry bit into the first limb
	jmp	L(entry)

	ALIGN(8)
L(top):
	C eax	q
	C ebx	carry bit, 0 or 1
	C ecx	shift
	C edx	scratch, overwritten by the mull below
	C esi	src end
	C edi	dst end
	C ebp	counter, limbs, negative

	mull	PARAM_DIVISOR		C carry limb in edx

	movl	-4(%esi,%ebp,4), %eax	C current src limb
	movl	(%esi,%ebp,4), %edi	C next src limb, edi used as scratch

	shrdl(	%cl, %edi, %eax)

	subl	%ebx, %eax		C apply carry bit
	setc	%bl			C borrow from that subtract
	movl	VAR_DST_END, %edi	C restore dst end

	subl	%edx, %eax		C apply carry limb
	adcl	$0, %ebx		C add in the borrow from this subtract

L(entry):
	imull	VAR_INVERSE, %eax	C quotient limb

	movl	%eax, -4(%edi,%ebp,4)	C store to dst
	incl	%ebp
	jnz	L(top)


	C Last limb.  Same steps as the loop body except that there is no
	C higher src limb, so a plain shrl is used, and no borrow is recorded
	C since nothing follows.  Register restores are interleaved with the
	C arithmetic.

	mull	PARAM_DIVISOR		C carry limb in edx

	movl	-4(%esi), %eax		C src high limb
	shrl	%cl, %eax
	movl	SAVE_ESI, %esi

	subl	%ebx, %eax		C apply carry bit
	movl	SAVE_EBX, %ebx
	movl	SAVE_EBP, %ebp

	subl	%edx, %eax		C apply carry limb

	imull	VAR_INVERSE, %eax

	movl	%eax, -4(%edi)		C dst high limb
	movl	SAVE_EDI, %edi
	addl	$STACK_SPACE, %esp

	ret


	C size==1.  eax holds src[0] and ebp is already zero, the quotient
	C is just the shifted limb times the inverse.
L(one):
	shrl	%cl, %eax
	movl	SAVE_ESI, %esi
	movl	SAVE_EBX, %ebx

	imull	VAR_INVERSE, %eax

	movl	SAVE_EBP, %ebp
	movl	%eax, -4(%edi)		C dst[0]

	movl	SAVE_EDI, %edi
	addl	$STACK_SPACE, %esp

	ret

EPILOGUE()