dive_1.asm revision 1.1.1.1
1dnl Intel P6 mpn_modexact_1_odd -- exact division style remainder. 2 3dnl Copyright 2001, 2002, 2007 Free Software Foundation, Inc. 4dnl 5dnl This file is part of the GNU MP Library. 6dnl 7dnl The GNU MP Library is free software; you can redistribute it and/or 8dnl modify it under the terms of the GNU Lesser General Public License as 9dnl published by the Free Software Foundation; either version 3 of the 10dnl License, or (at your option) any later version. 11dnl 12dnl The GNU MP Library is distributed in the hope that it will be useful, 13dnl but WITHOUT ANY WARRANTY; without even the implied warranty of 14dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15dnl Lesser General Public License for more details. 16dnl 17dnl You should have received a copy of the GNU Lesser General Public License 18dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 19 20include(`../config.m4') 21 22 23C odd even divisor 24C P6: 10.0 12.0 cycles/limb 25 26 27C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, 28C mp_limb_t divisor); 29C 30C The odd case is basically the same as mpn_modexact_1_odd, just with an 31C extra store, and it runs at the same 10 cycles which is the dependent 32C chain. 33C 34C The shifts for the even case aren't on the dependent chain so in principle 35C it could run the same too, but nothing running at 10 has been found. 36C Perhaps there's too many uops (an extra 4 over the odd case). 37 38defframe(PARAM_DIVISOR,16) 39defframe(PARAM_SIZE, 12) 40defframe(PARAM_SRC, 8) 41defframe(PARAM_DST, 4) 42 43defframe(SAVE_EBX, -4) 44defframe(SAVE_ESI, -8) 45defframe(SAVE_EDI, -12) 46defframe(SAVE_EBP, -16) 47defframe(VAR_INVERSE, -20) 48deflit(STACK_SPACE, 20) 49 50 TEXT 51 52 ALIGN(16) 53PROLOGUE(mpn_divexact_1) 54deflit(`FRAME',0) 55 56 movl PARAM_DIVISOR, %eax 57 subl $STACK_SPACE, %esp FRAME_subl_esp(STACK_SPACE) 58 59 movl %esi, SAVE_ESI 60 movl PARAM_SRC, %esi 61 62 movl %ebx, SAVE_EBX 63 movl PARAM_SIZE, %ebx 64 65 bsfl %eax, %ecx C trailing twos 66 67 movl %ebp, SAVE_EBP 68 69 shrl %cl, %eax C d without twos 70 71 movl %eax, %edx 72 shrl %eax C d/2 without twos 73 74 movl %edx, PARAM_DIVISOR 75 andl $127, %eax 76 77ifdef(`PIC',` 78 LEA( binvert_limb_table, %ebp) 79 movzbl (%eax,%ebp), %ebp C inv 8 bits 80',` 81 movzbl binvert_limb_table(%eax), %ebp C inv 8 bits 82') 83 84 leal (%ebp,%ebp), %eax C 2*inv 85 86 imull %ebp, %ebp C inv*inv 87 88 movl %edi, SAVE_EDI 89 movl PARAM_DST, %edi 90 91 leal (%esi,%ebx,4), %esi C src end 92 93 imull PARAM_DIVISOR, %ebp C inv*inv*d 94 95 subl %ebp, %eax C inv = 2*inv - inv*inv*d 96 leal (%eax,%eax), %ebp C 2*inv 97 98 imull %eax, %eax C inv*inv 99 100 leal (%edi,%ebx,4), %edi C dst end 101 negl %ebx C -size 102 103 movl %edi, PARAM_DST 104 105 imull PARAM_DIVISOR, %eax C inv*inv*d 106 107 subl %eax, %ebp C inv = 2*inv - inv*inv*d 108 109 ASSERT(e,` C d*inv == 1 mod 2^GMP_LIMB_BITS 110 movl PARAM_DIVISOR, %eax 111 imull %ebp, %eax 112 cmpl $1, %eax') 113 114 movl %ebp, VAR_INVERSE 115 movl (%esi,%ebx,4), %eax C src[0] 116 117 orl %ecx, %ecx 118 jnz L(even) 119 120 C ecx initial carry is zero 121 jmp L(odd_entry) 122 123 124C The dependent chain here is 125C 126C subl %edx, %eax 1 127C imull %ebp, %eax 4 128C mull PARAM_DIVISOR 5 129C ---- 130C total 10 131C 132C and this is the measured speed. No special scheduling is necessary, out 133C of order execution hides the load latency. 134 135L(odd_top): 136 C eax scratch (src limb) 137 C ebx counter, limbs, negative 138 C ecx carry bit 139 C edx carry limb, high of last product 140 C esi &src[size] 141 C edi &dst[size] 142 C ebp 143 144 mull PARAM_DIVISOR 145 146 movl (%esi,%ebx,4), %eax 147 subl %ecx, %eax 148 149 sbbl %ecx, %ecx 150 subl %edx, %eax 151 152 sbbl $0, %ecx 153 154L(odd_entry): 155 imull VAR_INVERSE, %eax 156 157 movl %eax, (%edi,%ebx,4) 158 negl %ecx 159 160 incl %ebx 161 jnz L(odd_top) 162 163 164 movl SAVE_ESI, %esi 165 166 movl SAVE_EDI, %edi 167 168 movl SAVE_EBP, %ebp 169 170 movl SAVE_EBX, %ebx 171 addl $STACK_SPACE, %esp 172 173 ret 174 175 176L(even): 177 C eax src[0] 178 C ebx counter, limbs, negative 179 C ecx shift 180 C edx 181 C esi 182 C edi 183 C ebp 184 185 xorl %ebp, %ebp C initial carry bit 186 xorl %edx, %edx C initial carry limb (for size==1) 187 188 incl %ebx 189 jz L(even_one) 190 191 movl (%esi,%ebx,4), %edi C src[1] 192 193 shrdl( %cl, %edi, %eax) 194 195 jmp L(even_entry) 196 197 198L(even_top): 199 C eax scratch 200 C ebx counter, limbs, negative 201 C ecx shift 202 C edx scratch 203 C esi &src[size] 204 C edi &dst[size] and scratch 205 C ebp carry bit 206 207 movl (%esi,%ebx,4), %edi 208 209 mull PARAM_DIVISOR 210 211 movl -4(%esi,%ebx,4), %eax 212 shrdl( %cl, %edi, %eax) 213 214 subl %ebp, %eax 215 216 sbbl %ebp, %ebp 217 subl %edx, %eax 218 219 sbbl $0, %ebp 220 221L(even_entry): 222 imull VAR_INVERSE, %eax 223 224 movl PARAM_DST, %edi 225 negl %ebp 226 227 movl %eax, -4(%edi,%ebx,4) 228 incl %ebx 229 jnz L(even_top) 230 231 232 233 mull PARAM_DIVISOR 234 235 movl -4(%esi), %eax 236 237L(even_one): 238 shrl %cl, %eax 239 movl SAVE_ESI, %esi 240 241 subl %ebp, %eax 242 movl SAVE_EBP, %ebp 243 244 subl %edx, %eax 245 movl SAVE_EBX, %ebx 246 247 imull VAR_INVERSE, %eax 248 249 movl %eax, -4(%edi) 250 movl SAVE_EDI, %edi 251 addl $STACK_SPACE, %esp 252 253 ret 254 255EPILOGUE() 256