dnl  aorsmul_1.asm revision 1.1.1.1
dnl  Intel P6 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple.

dnl  Copyright 1999, 2000, 2001, 2002, 2005 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C                           cycles/limb
C P5:
C P6 model 0-8,10-12         6.44
C P6 model 9  (Banias)
C P6 model 13 (Dothan)       6.11
C P4 model 0  (Willamette)
C P4 model 1  (?)
C P4 model 2  (Northwood)
C P4 model 3  (Prescott)
C P4 model 4  (Nocona)
C K6:
C K7:
C K8:


dnl  P6 UNROLL_COUNT cycles/limb
dnl          8           6.7
dnl         16           6.35
dnl         32           6.3
dnl         64           6.3
dnl  Maximum possible with the current code is 64.

dnl  Number of limbs handled per pass of the unrolled loop.  The loop body is
dnl  generated in chunks of two limbs, so this should be even.
deflit(UNROLL_COUNT, 16)


dnl  Select the add or subtract flavour.  Exactly one of OPERATION_addmul_1
dnl  or OPERATION_submul_1 must be defined by the build; otherwise m4 stops
dnl  with an error.
dnl
dnl    M4_inst         instruction applied to the destination limb
dnl    M4_function_1   entry point without a carry-in parameter
dnl    M4_function_1c  entry point with a carry-in parameter
dnl    M4_description  wording used in the interface comment below
dnl    M4_desc_retval  wording used in the interface comment below
ifdef(`OPERATION_addmul_1', `
	define(M4_inst,        addl)
	define(M4_function_1,  mpn_addmul_1)
	define(M4_function_1c, mpn_addmul_1c)
	define(M4_description, add it to)
	define(M4_desc_retval, carry)
',`ifdef(`OPERATION_submul_1', `
	define(M4_inst,        subl)
	define(M4_function_1,  mpn_submul_1)
	define(M4_function_1c, mpn_submul_1c)
	define(M4_description, subtract it from)
	define(M4_desc_retval, borrow)
',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
')')')

MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)


C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                            mp_limb_t mult);
C mp_limb_t M4_function_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                             mp_limb_t mult, mp_limb_t carry);
C
C Calculate src,size multiplied by mult and M4_description dst,size.
C Return the M4_desc_retval limb from the top of the result.
C
C This code is pretty much the same as the K6 code.  The unrolled loop is
C the same, but there are just a few scheduling tweaks in the setups and the
C simple loop.
C
C A number of variations have been tried for the unrolled loop, with one or
C two carries, and with loads scheduled earlier, but nothing faster than 6
C cycles/limb has been found.

dnl  Operand sizes at or above UNROLL_THRESHOLD limbs use the unrolled loop,
dnl  smaller ones use the simple loop.  The PIC and non-PIC values are
dnl  currently identical (presumably kept separate so they can be tuned
dnl  independently).
ifdef(`PIC',`
deflit(UNROLL_THRESHOLD, 5)
',`
deflit(UNROLL_THRESHOLD, 5)
')

dnl  Stack offsets of the parameters as seen on entry (after the return
dnl  address).  PARAM_CARRY is only read by the M4_function_1c entry point.
defframe(PARAM_CARRY,     20)
defframe(PARAM_MULTIPLIER,16)
defframe(PARAM_SIZE,      12)
defframe(PARAM_SRC,       8)
defframe(PARAM_DST,       4)

	TEXT
	ALIGN(32)

C------------------------------------------------------------------------------
C M4_function_1c entry point: load the caller-supplied carry limb into %ebx
C and join the common code at L(start_nc).  %ebx is saved here because the
C common code restores it on exit.
C
C FRAME is updated after every push (presumably so that the defframe-based
C PARAM_* references keep resolving to the right stack offset).

PROLOGUE(M4_function_1c)
	pushl	%ebx
deflit(`FRAME',4)
	movl	PARAM_CARRY, %ebx
	jmp	L(start_nc)
EPILOGUE()

C------------------------------------------------------------------------------
C M4_function_1 entry point: same as above with an initial carry of zero.
C
C Result: %eax holds the M4_desc_retval limb.  %ebx, %esi, %edi and %ebp are
C saved and restored; %ecx and %edx are used as scratch.

PROLOGUE(M4_function_1)
	pushl	%ebx
deflit(`FRAME',4)
	xorl	%ebx, %ebx	C initial carry

L(start_nc):
	movl	PARAM_SIZE, %ecx
	pushl	%esi
deflit(`FRAME',8)

	movl	PARAM_SRC, %esi
	pushl	%edi
deflit(`FRAME',12)

	movl	PARAM_DST, %edi
	pushl	%ebp
deflit(`FRAME',16)
	cmpl	$UNROLL_THRESHOLD, %ecx

	movl	PARAM_MULTIPLIER, %ebp	C movl leaves the cmpl flags intact
	jae	L(unroll)		C unsigned: size >= UNROLL_THRESHOLD


	C simple loop
	C this is offset 0x22, so close enough to aligned
L(simple):
	C eax	scratch
	C ebx	carry
	C ecx	counter
	C edx	scratch
	C esi	src
	C edi	dst
	C ebp	multiplier

	movl	(%esi), %eax
	addl	$4, %edi		C dst advanced early, hence -4 below

	mull	%ebp			C edx:eax = src limb * multiplier

	addl	%ebx, %eax		C add carry-in to the low product limb
	adcl	$0, %edx

	M4_inst	%eax, -4(%edi)
	movl	%edx, %ebx

	adcl	$0, %ebx		C carry/borrow out of M4_inst
	decl	%ecx

	leal	4(%esi), %esi		C leal keeps the decl flags for jnz
	jnz	L(simple)


	popl	%ebp
	popl	%edi

	popl	%esi
	movl	%ebx, %eax		C return value

	popl	%ebx
	ret



C------------------------------------------------------------------------------
C VAR_JUMP holds the computed jump temporarily because there are not enough
C registers when doing the mul for the initial two carry limbs.
C
C The add/adc for the initial carry in %ebx is necessary only for the
C mpn_add/submul_1c entry points.  Duplicating the startup code to
C eliminate this for the plain mpn_add/submul_1 does not seem like a good
C idea.

dnl  overlapping with parameters already fetched
define(VAR_COUNTER,`PARAM_SIZE')
define(VAR_JUMP,   `PARAM_DST')

	C this is offset 0x43, so close enough to aligned
L(unroll):
	C eax
	C ebx	initial carry
	C ecx	size
	C edx
	C esi	src
	C edi	dst
	C ebp

	movl	%ecx, %edx
	decl	%ecx

	subl	$2, %edx
	negl	%ecx

	shrl	$UNROLL_LOG2, %edx	C (size-2) >> UNROLL_LOG2
	andl	$UNROLL_MASK, %ecx	C (-(size-1)) & UNROLL_MASK

	movl	%edx, VAR_COUNTER
	movl	%ecx, %edx

	C 15 code bytes per limb
	C The entry offset is 16*n - n, formed by the shll and negl below.
ifdef(`PIC',`
	call	L(pic_calc)
L(here):
',`
	shll	$4, %edx
	negl	%ecx

	leal	L(entry) (%edx,%ecx,1), %edx
')
	movl	(%esi), %eax		C src low limb

	movl	%edx, VAR_JUMP		C edx is about to be clobbered by mull
	leal	ifelse(UNROLL_BYTES,256,128+) 4(%esi,%ecx,4), %esi

	mull	%ebp

	addl	%ebx, %eax	C initial carry (from _1c)
	adcl	$0, %edx

	movl	%edx, %ebx	C high carry
	leal	ifelse(UNROLL_BYTES,256,128) (%edi,%ecx,4), %edi

	movl	VAR_JUMP, %edx
	testl	$1, %ecx
	movl	%eax, %ecx	C low carry

	cmovnz(	%ebx, %ecx)	C high,low carry other way around
	cmovnz(	%eax, %ebx)

	jmp	*%edx


ifdef(`PIC',`
L(pic_calc):
	C On entry edx and ecx are as left by the setup code above, and
	C the return address on the stack is L(here).  Returns the
	C absolute jump target in edx and the negated ecx.
	shll	$4, %edx
	negl	%ecx

	C See mpn/x86/README about old gas bugs
	leal	(%edx,%ecx,1), %edx
	addl	$L(entry)-L(here), %edx

	addl	(%esp), %edx

	ret_internal
')


C -----------------------------------------------------------
	ALIGN(32)
L(top):
deflit(`FRAME',16)
	C eax	scratch
	C ebx	carry hi
	C ecx	carry lo
	C edx	scratch
	C esi	src
	C edi	dst
	C ebp	multiplier
	C
	C VAR_COUNTER	loop counter
	C
	C 15 code bytes per limb

	addl	$UNROLL_BYTES, %edi

L(entry):
deflit(CHUNK_COUNT,2)
forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
	deflit(`disp0', eval(i*4*CHUNK_COUNT ifelse(UNROLL_BYTES,256,-128)))
	deflit(`disp1', eval(disp0 + 4))

Zdisp(	movl,	disp0,(%esi), %eax)
	mull	%ebp
Zdisp(	M4_inst,%ecx, disp0,(%edi))
	adcl	%eax, %ebx
	movl	%edx, %ecx
	adcl	$0, %ecx

	movl	disp1(%esi), %eax
	mull	%ebp
	M4_inst	%ebx, disp1(%edi)
	adcl	%eax, %ecx
	movl	%edx, %ebx
	adcl	$0, %ebx
')

	decl	VAR_COUNTER
	leal	UNROLL_BYTES(%esi), %esi	C leal keeps the decl flags

	jns	L(top)


deflit(`disp0',	eval(UNROLL_BYTES ifelse(UNROLL_BYTES,256,-128)))

	C Apply the pending low carry to the last destination limb; the
	C high carry plus the resulting carry flag is the return value.
	M4_inst	%ecx, disp0(%edi)
	movl	%ebx, %eax

	popl	%ebp
	popl	%edi

	popl	%esi
	popl	%ebx
	adcl	$0, %eax	C popl does not alter flags

	ret

EPILOGUE()