dnl  AMD K6 mpn_mul_1 -- mpn by limb multiply.

dnl  Copyright 1999, 2000, 2002, 2005 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C                            cycles/limb
C P5:
C P6 model 0-8,10-12:            5.5
C P6 model 9  (Banias)
C P6 model 13 (Dothan)           4.87
C P4 model 0  (Willamette)
C P4 model 1  (?)
C P4 model 2  (Northwood)
C P4 model 3  (Prescott)
C P4 model 4  (Nocona)
C K6:                            6.25
C K7:
C K8:


C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                      mp_limb_t multiplier);
C mp_limb_t mpn_mul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                       mp_limb_t multiplier, mp_limb_t carry);
C
C Multiply src,size by mult and store the result in dst,size.
C Return the carry limb from the top of the result.
C
C mpn_mul_1c() accepts an initial carry for the calculation, it's added into
C the low limb of the result.
dnl  Stack-frame offsets of the parameters, in bytes above the return
dnl  address.  FRAME / FRAME_pushl() track register pushes so these stay
dnl  valid as the prologue grows the frame.
defframe(PARAM_CARRY, 20)
defframe(PARAM_MULTIPLIER,16)
defframe(PARAM_SIZE, 12)
defframe(PARAM_SRC, 8)
defframe(PARAM_DST, 4)

dnl  minimum 5 because the unrolled code can't handle less
deflit(UNROLL_THRESHOLD, 5)

	TEXT
	ALIGN(32)

C mpn_mul_1c entry: load the caller-supplied initial carry into %esi, then
C join the common code at L(start_nc).
PROLOGUE(mpn_mul_1c)
	pushl	%esi
deflit(`FRAME',4)
	movl	PARAM_CARRY, %esi
	jmp	L(start_nc)
EPILOGUE()


C mpn_mul_1 entry: identical to mpn_mul_1c but with a zero initial carry.
PROLOGUE(mpn_mul_1)
	push	%esi
deflit(`FRAME',4)
	xorl	%esi, %esi	C initial carry

L(start_nc):
	mov	PARAM_SIZE, %ecx
	push	%ebx
FRAME_pushl()

	movl	PARAM_SRC, %ebx
	push	%edi
FRAME_pushl()

	movl	PARAM_DST, %edi
	pushl	%ebp
FRAME_pushl()

	cmpl	$UNROLL_THRESHOLD, %ecx
	movl	PARAM_MULTIPLIER, %ebp

	jae	L(unroll)	C size >= 5 uses the unrolled loop


	C code offset 0x22 here, close enough to aligned
L(simple):
	C eax	scratch
	C ebx	src
	C ecx	counter
	C edx	scratch
	C esi	carry
	C edi	dst
	C ebp	multiplier
	C
	C this loop 8 cycles/limb

	movl	(%ebx), %eax	C fetch next source limb
	addl	$4, %ebx

	mull	%ebp		C edx:eax = limb * multiplier

	addl	%esi, %eax	C add carry-in to low product limb
	movl	$0, %esi

	adcl	%edx, %esi	C esi = high limb + carry-out, for next limb

	movl	%eax, (%edi)
	addl	$4, %edi

	loop	L(simple)	C loop insn is fine on K6 (this file is K6-only)


	popl	%ebp

	popl	%edi
	popl	%ebx

	movl	%esi, %eax	C return the final carry limb
	popl	%esi

	ret


C -----------------------------------------------------------------------------
C The code for each limb is 6 cycles, with instruction decoding being the
C limiting factor.  At 4 limbs/loop and 1 cycle/loop of overhead it's 6.25
C cycles/limb in total.
C
C The secret ingredient to get 6.25 is to start the loop with the mul and
C have the load/store pair at the end.  Rotating the load/store to the top
C is an 0.5 c/l slowdown.  (Some address generation effect probably.)
C
C The whole unrolled loop fits nicely in exactly 80 bytes.


	ALIGN(16)	C already aligned to 16 here actually
L(unroll):
	C Point ebx,edi at &src[size-4],&dst[size-4] and turn ecx into a
	C negative index counting up by 4 towards zero; when the loop exits
	C (ecx in 0..3) the finish code below handles the last 1 to 4 limbs.
	movl	(%ebx), %eax	C src[0], feeds the loop's leading mul
	leal	-16(%ebx,%ecx,4), %ebx

	leal	-16(%edi,%ecx,4), %edi
	subl	$4, %ecx

	negl	%ecx		C ecx = -(size-4)


	ALIGN(16)	C one byte nop for this alignment
L(top):
	C eax	scratch
	C ebx	&src[size-4]
	C ecx	counter
	C edx	scratch
	C esi	carry
	C edi	&dst[size-4]
	C ebp	multiplier

	mull	%ebp		C limb 0 of 4: edx:eax = limb * multiplier

	addl	%esi, %eax	C add carry-in
	movl	$0, %esi

	adcl	%edx, %esi	C carry for next limb

	movl	%eax, (%edi,%ecx,4)
	movl	4(%ebx,%ecx,4), %eax


	mull	%ebp		C limb 1 of 4

	addl	%esi, %eax
	movl	$0, %esi

	adcl	%edx, %esi

	movl	%eax, 4(%edi,%ecx,4)
	movl	8(%ebx,%ecx,4), %eax


	mull	%ebp		C limb 2 of 4

	addl	%esi, %eax
	movl	$0, %esi

	adcl	%edx, %esi

	movl	%eax, 8(%edi,%ecx,4)
	movl	12(%ebx,%ecx,4), %eax


	mull	%ebp		C limb 3 of 4

	addl	%esi, %eax
	movl	$0, %esi

	adcl	%edx, %esi

	movl	%eax, 12(%edi,%ecx,4)
	movl	16(%ebx,%ecx,4), %eax	C load for next iteration (or finish code)


	addl	$4, %ecx
	js	L(top)



	C eax	next src limb
	C ecx	0 to 3 representing respectively 4 to 1 further limbs
	C edx
	C esi	carry
	C edi	&dst[size-4]

	testb	$2, %cl		C ecx 0 or 1 means 4 or 3 limbs left: do two here
	jnz	L(finish_not_two)

	mull	%ebp

	addl	%esi, %eax
	movl	$0, %esi

	adcl	%edx, %esi

	movl	%eax, (%edi,%ecx,4)
	movl	4(%ebx,%ecx,4), %eax


	mull	%ebp

	addl	%esi, %eax
	movl	$0, %esi

	adcl	%edx, %esi

	movl	%eax, 4(%edi,%ecx,4)
	movl	8(%ebx,%ecx,4), %eax

	addl	$2, %ecx
L(finish_not_two):


	testb	$1, %cl		C cl==2 here means 2 limbs left: do the penultimate
	jnz	L(finish_not_one)

	mull	%ebp

	addl	%esi, %eax
	movl	$0, %esi

	adcl	%edx, %esi

	movl	%eax, 8(%edi)	C dst[size-2]
	movl	12(%ebx), %eax	C src[size-1], the last limb
L(finish_not_one):


	C Last limb: fold the carry-out straight into edx, which becomes the
	C returned carry; registers are popped in between to fill mul latency.
	mull	%ebp

	addl	%esi, %eax
	popl	%ebp

	adcl	$0, %edx

	movl	%eax, 12(%edi)	C dst[size-1]
	popl	%edi

	popl	%ebx
	movl	%edx, %eax	C return the high/carry limb

	popl	%esi

	ret

EPILOGUE()