dnl  AMD K6 mpn_mul_1 -- mpn by limb multiply.

dnl  Copyright 1999, 2000, 2002, 2005 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C                            cycles/limb
C P5
C P6 model 0-8,10-12             5.5
C P6 model 9  (Banias)
C P6 model 13 (Dothan)           4.87
C P4 model 0  (Willamette)
C P4 model 1  (?)
C P4 model 2  (Northwood)
C P4 model 3  (Prescott)
C P4 model 4  (Nocona)
C AMD K6                         6.25
C AMD K7
C AMD K8


C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                      mp_limb_t multiplier);
C mp_limb_t mpn_mul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                       mp_limb_t multiplier, mp_limb_t carry);
C
C Multiply src,size by mult and store the result in dst,size.
C Return the carry limb from the top of the result.
C
C mpn_mul_1c() accepts an initial carry for the calculation; it is added into
C the low limb of the result.
C -----------------------------------------------------------------------------
C Stack offsets of the arguments named in the C prototypes of mpn_mul_1 and
C mpn_mul_1c.  FRAME is set to 4 after the first push and FRAME_pushl()
C follows every later push; presumably that bookkeeping is what keeps the
C PARAM_* references correct as the stack grows (confirm against the
C defframe definition).

defframe(PARAM_CARRY,     20)
defframe(PARAM_MULTIPLIER,16)
defframe(PARAM_SIZE,      12)
defframe(PARAM_SRC,       8)
defframe(PARAM_DST,       4)

dnl  minimum 5 because the unrolled code can't handle less: L(top) always
dnl  runs one full pass of four limbs before testing its counter, and the
dnl  tail code unconditionally processes at least one more limb.
deflit(UNROLL_THRESHOLD, 5)

	TEXT
	ALIGN(32)

C mpn_mul_1c entry point: save esi, load the caller supplied initial carry
C into it, then join the common code inside mpn_mul_1.

PROLOGUE(mpn_mul_1c)
	pushl	%esi
deflit(`FRAME',4)
	movl	PARAM_CARRY, %esi
	jmp	L(start_nc)
EPILOGUE()


C mpn_mul_1 entry point: same as mpn_mul_1c but with an initial carry of 0.
C
C Both entry points reach L(start_nc) with esi already pushed and holding
C the initial carry.  ebx, edi and ebp are pushed below and all four are
C popped again before each ret.  The carry out of the top limb is returned
C in eax.

PROLOGUE(mpn_mul_1)
	push	%esi
deflit(`FRAME',4)
	xorl	%esi, %esi	C initial carry

L(start_nc):
	mov	PARAM_SIZE, %ecx
	push	%ebx
FRAME_pushl()

	movl	PARAM_SRC, %ebx
	push	%edi
FRAME_pushl()

	movl	PARAM_DST, %edi
	pushl	%ebp
FRAME_pushl()

	cmpl	$UNROLL_THRESHOLD, %ecx
	movl	PARAM_MULTIPLIER, %ebp	C mov leaves the cmpl flags intact

	jae	L(unroll)		C unsigned: size >= UNROLL_THRESHOLD


	C code offset 0x22 here, close enough to aligned
L(simple):
	C eax	scratch
	C ebx	src
	C ecx	counter
	C edx	scratch
	C esi	carry
	C edi	dst
	C ebp	multiplier
	C
	C this loop 8 cycles/limb
	C
	C Reached only for size < UNROLL_THRESHOLD.  One limb per iteration,
	C with ecx counting the limbs still to do.  size is assumed to be at
	C least 1 here (NOTE(review): with ecx == 0 the loop instruction would
	C wrap the counter -- confirm callers never pass size 0).

	movl	(%ebx), %eax
	addl	$4, %ebx

	mull	%ebp			C edx:eax = src limb * multiplier

	addl	%esi, %eax		C low half + incoming carry, sets CF
	movl	$0, %esi		C must be mov, not xor: CF is still live

	adcl	%edx, %esi		C new carry = high half + CF

	movl	%eax, (%edi)
	addl	$4, %edi

	loop	L(simple)


	popl	%ebp

	popl	%edi
	popl	%ebx

	movl	%esi, %eax		C return the final carry
	popl	%esi

	ret


C -----------------------------------------------------------------------------
C The code for each limb is 6 cycles, with instruction decoding being the
C limiting factor.  At 4 limbs/loop and 1 cycle/loop of overhead it's 6.25
C cycles/limb in total.
C
C The secret ingredient to get 6.25 is to start the loop with the mul and
C have the load/store pair at the end.  Rotating the load/store to the top
C is an 0.5 c/l slowdown.  (Some address generation effect probably.)
C
C The whole unrolled loop fits nicely in exactly 80 bytes.
C
C Because of that layout, each step below multiplies the limb already in eax,
C stores the low half of the result and then loads the next source limb.


	ALIGN(16)	C already aligned to 16 here actually
L(unroll):
	C Point ebx and edi at the last four limbs and make ecx a negative
	C counter, ecx = -(size-4), so that (%edi,%ecx,4) starts out at dst[0]
	C and the index climbs towards zero.

	movl	(%ebx), %eax		C preload src[0]
	leal	-16(%ebx,%ecx,4), %ebx	C ebx = &src[size-4]

	leal	-16(%edi,%ecx,4), %edi	C edi = &dst[size-4]
	subl	$4, %ecx

	negl	%ecx			C ecx = -(size-4), at most -1 here


	ALIGN(16)	C one byte nop for this alignment
L(top):
	C eax	scratch
	C ebx	&src[size-4]
	C ecx	counter
	C edx	scratch
	C esi	carry
	C edi	&dst[size-4]
	C ebp	multiplier

	mull	%ebp

	addl	%esi, %eax
	movl	$0, %esi		C mov keeps CF for the adcl below

	adcl	%edx, %esi

	movl	%eax, (%edi,%ecx,4)
	movl	4(%ebx,%ecx,4), %eax


	mull	%ebp

	addl	%esi, %eax
	movl	$0, %esi

	adcl	%edx, %esi

	movl	%eax, 4(%edi,%ecx,4)
	movl	8(%ebx,%ecx,4), %eax


	mull	%ebp

	addl	%esi, %eax
	movl	$0, %esi

	adcl	%edx, %esi

	movl	%eax, 8(%edi,%ecx,4)
	movl	12(%ebx,%ecx,4), %eax


	mull	%ebp

	addl	%esi, %eax
	movl	$0, %esi

	adcl	%edx, %esi

	movl	%eax, 12(%edi,%ecx,4)
	movl	16(%ebx,%ecx,4), %eax	C next limb; it always exists since
					C at least one limb is still to do


	addl	$4, %ecx
	js	L(top)			C loop while the counter is negative



	C eax	next src limb
	C ebx	&src[size-4]
	C ecx	0 to 3 representing respectively 4 to 1 further limbs
	C edx
	C esi	carry
	C edi	&dst[size-4]
	C
	C Bit 1 of ecx clear (ecx is 0 or 1) means at least three limbs
	C remain, so two of them are done here.

	testb	$2, %cl
	jnz	L(finish_not_two)

	mull	%ebp

	addl	%esi, %eax
	movl	$0, %esi

	adcl	%edx, %esi

	movl	%eax, (%edi,%ecx,4)
	movl	4(%ebx,%ecx,4), %eax


	mull	%ebp

	addl	%esi, %eax
	movl	$0, %esi

	adcl	%edx, %esi

	movl	%eax, 4(%edi,%ecx,4)
	movl	8(%ebx,%ecx,4), %eax

	addl	$2, %ecx
L(finish_not_two):


	C ecx is now 2 or 3, meaning two or one limbs remain.  Bit 0 clear
	C (ecx is 2) means the limb at index size-2 is still to be done; it
	C is addressed with fixed offsets from ebx/edi rather than via ecx.

	testb	$1, %cl
	jnz	L(finish_not_one)

	mull	%ebp

	addl	%esi, %eax
	movl	$0, %esi

	adcl	%edx, %esi

	movl	%eax, 8(%edi)		C dst[size-2]
	movl	12(%ebx), %eax		C src[size-1]
L(finish_not_one):


	C Last limb.  The carry out is formed directly in edx for the return
	C value.  The pops are interleaved with the arithmetic; pop does not
	C alter flags, so CF from the addl survives to the adcl, and ebp (the
	C multiplier) is no longer needed after this final mull.

	mull	%ebp

	addl	%esi, %eax
	popl	%ebp

	adcl	$0, %edx

	movl	%eax, 12(%edi)		C dst[size-1]
	popl	%edi

	popl	%ebx
	movl	%edx, %eax		C return the final carry

	popl	%esi

	ret

EPILOGUE()