bdiv_q_1.asm revision 1.1.1.1
1dnl Intel P6 mpn_modexact_1_odd -- exact division style remainder. 2 3dnl Copyright 2001, 2002, 2007, 2011 Free Software Foundation, Inc. 4dnl 5dnl This file is part of the GNU MP Library. 6dnl 7dnl Rearranged from mpn/x86/p6/dive_1.asm by Marco Bodrato. 8dnl 9dnl The GNU MP Library is free software; you can redistribute it and/or 10dnl modify it under the terms of the GNU Lesser General Public License as 11dnl published by the Free Software Foundation; either version 3 of the 12dnl License, or (at your option) any later version. 13dnl 14dnl The GNU MP Library is distributed in the hope that it will be useful, 15dnl but WITHOUT ANY WARRANTY; without even the implied warranty of 16dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 17dnl Lesser General Public License for more details. 18dnl 19dnl You should have received a copy of the GNU Lesser General Public License 20dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 21 22include(`../config.m4') 23 24 25C odd even divisor 26C P6: 10.0 12.0 cycles/limb 27 28C MULFUNC_PROLOGUE(mpn_bdiv_q_1 mpn_pi1_bdiv_q_1) 29 30C The odd case is basically the same as mpn_modexact_1_odd, just with an 31C extra store, and it runs at the same 10 cycles which is the dependent 32C chain. 33C 34C The shifts for the even case aren't on the dependent chain so in principle 35C it could run the same too, but nothing running at 10 has been found. 36C Perhaps there's too many uops (an extra 4 over the odd case). 37 38defframe(PARAM_SHIFT, 24) 39defframe(PARAM_INVERSE,20) 40defframe(PARAM_DIVISOR,16) 41defframe(PARAM_SIZE, 12) 42defframe(PARAM_SRC, 8) 43defframe(PARAM_DST, 4) 44 45defframe(SAVE_EBX, -4) 46defframe(SAVE_ESI, -8) 47defframe(SAVE_EDI, -12) 48defframe(SAVE_EBP, -16) 49deflit(STACK_SPACE, 16) 50 51dnl re-use parameter space 52define(VAR_INVERSE,`PARAM_SRC') 53 54 TEXT 55 56C mp_limb_t 57C mpn_pi1_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor, 58C mp_limb_t inverse, int shift) 59 60 ALIGN(16) 61PROLOGUE(mpn_pi1_bdiv_q_1) 62deflit(`FRAME',0) 63 64 subl $STACK_SPACE, %esp FRAME_subl_esp(STACK_SPACE) 65 66 movl %esi, SAVE_ESI 67 movl PARAM_SRC, %esi 68 69 movl %ebx, SAVE_EBX 70 movl PARAM_SIZE, %ebx 71 72 movl %ebp, SAVE_EBP 73 movl PARAM_INVERSE, %ebp 74 75 movl PARAM_SHIFT, %ecx C trailing twos 76 77L(common): 78 movl %edi, SAVE_EDI 79 movl PARAM_DST, %edi 80 81 leal (%esi,%ebx,4), %esi C src end 82 83 leal (%edi,%ebx,4), %edi C dst end 84 negl %ebx C -size 85 86 movl (%esi,%ebx,4), %eax C src[0] 87 88 orl %ecx, %ecx 89 jz L(odd_entry) 90 91 movl %edi, PARAM_DST 92 movl %ebp, VAR_INVERSE 93 94L(even): 95 C eax src[0] 96 C ebx counter, limbs, negative 97 C ecx shift 98 C edx 99 C esi 100 C edi 101 C ebp 102 103 xorl %ebp, %ebp C initial carry bit 104 xorl %edx, %edx C initial carry limb (for size==1) 105 106 incl %ebx 107 jz L(even_one) 108 109 movl (%esi,%ebx,4), %edi C src[1] 110 111 shrdl( %cl, %edi, %eax) 112 113 jmp L(even_entry) 114 115 116L(even_top): 117 C eax scratch 118 C ebx counter, limbs, negative 119 C ecx shift 120 C edx scratch 121 C esi &src[size] 122 C edi &dst[size] and scratch 123 C ebp carry bit 124 125 movl (%esi,%ebx,4), %edi 126 127 mull PARAM_DIVISOR 128 129 movl -4(%esi,%ebx,4), %eax 130 shrdl( %cl, %edi, %eax) 131 132 subl %ebp, %eax 133 134 sbbl %ebp, %ebp 135 subl %edx, %eax 136 137 sbbl $0, %ebp 138 139L(even_entry): 140 imull VAR_INVERSE, %eax 141 142 movl PARAM_DST, %edi 143 negl %ebp 144 145 movl %eax, -4(%edi,%ebx,4) 146 incl %ebx 147 jnz L(even_top) 148 149 mull PARAM_DIVISOR 150 151 movl -4(%esi), %eax 152 153L(even_one): 154 shrl %cl, %eax 155 movl SAVE_ESI, %esi 156 157 subl %ebp, %eax 158 movl SAVE_EBP, %ebp 159 160 subl %edx, %eax 161 movl SAVE_EBX, %ebx 162 163 imull VAR_INVERSE, %eax 164 165 movl %eax, -4(%edi) 166 movl SAVE_EDI, %edi 167 addl $STACK_SPACE, %esp 168 169 ret 170 171C The dependent chain here is 172C 173C subl %edx, %eax 1 174C imull %ebp, %eax 4 175C mull PARAM_DIVISOR 5 176C ---- 177C total 10 178C 179C and this is the measured speed. No special scheduling is necessary, out 180C of order execution hides the load latency. 181 182L(odd_top): 183 C eax scratch (src limb) 184 C ebx counter, limbs, negative 185 C ecx carry bit 186 C edx carry limb, high of last product 187 C esi &src[size] 188 C edi &dst[size] 189 C ebp inverse 190 191 mull PARAM_DIVISOR 192 193 movl (%esi,%ebx,4), %eax 194 subl %ecx, %eax 195 196 sbbl %ecx, %ecx 197 subl %edx, %eax 198 199 sbbl $0, %ecx 200 201L(odd_entry): 202 imull %ebp, %eax 203 204 movl %eax, (%edi,%ebx,4) 205 negl %ecx 206 207 incl %ebx 208 jnz L(odd_top) 209 210 211 movl SAVE_ESI, %esi 212 213 movl SAVE_EDI, %edi 214 215 movl SAVE_EBP, %ebp 216 217 movl SAVE_EBX, %ebx 218 addl $STACK_SPACE, %esp 219 220 ret 221 222EPILOGUE() 223 224C mp_limb_t mpn_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, 225C mp_limb_t divisor); 226C 227 228 ALIGN(16) 229PROLOGUE(mpn_bdiv_q_1) 230deflit(`FRAME',0) 231 232 movl PARAM_DIVISOR, %eax 233 subl $STACK_SPACE, %esp FRAME_subl_esp(STACK_SPACE) 234 235 movl %esi, SAVE_ESI 236 movl PARAM_SRC, %esi 237 238 movl %ebx, SAVE_EBX 239 movl PARAM_SIZE, %ebx 240 241 bsfl %eax, %ecx C trailing twos 242 243 movl %ebp, SAVE_EBP 244 245 shrl %cl, %eax C d without twos 246 247 movl %eax, %edx 248 shrl %eax C d/2 without twos 249 250 movl %edx, PARAM_DIVISOR 251 andl $127, %eax 252 253ifdef(`PIC',` 254 LEA( binvert_limb_table, %ebp) 255 movzbl (%eax,%ebp), %ebp C inv 8 bits 256',` 257 movzbl binvert_limb_table(%eax), %ebp C inv 8 bits 258') 259 260 leal (%ebp,%ebp), %eax C 2*inv 261 262 imull %ebp, %ebp C inv*inv 263 imull %edx, %ebp C inv*inv*d 264 265 subl %ebp, %eax C inv = 2*inv - inv*inv*d 266 leal (%eax,%eax), %ebp C 2*inv 267 268 imull %eax, %eax C inv*inv 269 imull %edx, %eax C inv*inv*d 270 271 subl %eax, %ebp C inv = 2*inv - inv*inv*d 272 273 jmp L(common) 274 275EPILOGUE() 276