dnl  AMD64 mpn_mul_basecase.

dnl  Contributed to the GNU project by Torbjorn Granlund and David Harvey.

dnl  Copyright 2008 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C	    cycles/limb
C K8,K9:	 2.375
C K10:		 2.375
C P4:		 ?
C P6-15:	 4.45

C The inner loops of this code are the result of running a code generation and
C optimization tool suite written by David Harvey and Torbjorn Granlund.

C TODO
C  * Use fewer registers.  (how??? I cannot see it -- david)
C  * Avoid some "mov $0,r" and instead use "xor r,r".
C  * Can the top of each L(addmul_outer_n) prologue be folded into the
C    mul_1/mul_2 prologues, saving a LEA (%rip)?  It would slow down the
C    case where vn = 1 or 2; is it worth it?
39 40C INPUT PARAMETERS 41define(`rp', `%rdi') 42define(`up', `%rsi') 43define(`un_param',`%rdx') 44define(`vp', `%rcx') 45define(`vn', `%r8') 46 47define(`v0', `%r12') 48define(`v1', `%r9') 49 50define(`w0', `%rbx') 51define(`w1', `%r15') 52define(`w2', `%rbp') 53define(`w3', `%r10') 54 55define(`n', `%r11') 56define(`outer_addr', `%r14') 57define(`un', `%r13') 58 59ASM_START() 60 TEXT 61 ALIGN(16) 62PROLOGUE(mpn_mul_basecase) 63 push %rbx 64 push %rbp 65 push %r12 66 push %r13 67 push %r14 68 push %r15 69 70 xor R32(un), R32(un) 71 mov (up), %rax 72 mov (vp), v0 73 74 sub un_param, un C rdx used by mul 75 mov un, n 76 mov R32(un_param), R32(w0) 77 78 lea (rp,un_param,8), rp 79 lea (up,un_param,8), up 80 81 mul v0 82 83 test $1, R8(vn) 84 jz L(mul_2) 85 86C =========================================================== 87C mul_1 for vp[0] if vn is odd 88 89L(mul_1): 90 and $3, R32(w0) 91 jz L(mul_1_prologue_0) 92 cmp $2, R32(w0) 93 jc L(mul_1_prologue_1) 94 jz L(mul_1_prologue_2) 95 jmp L(mul_1_prologue_3) 96 97L(mul_1_prologue_0): 98 mov %rax, w2 99 mov %rdx, w3 C note: already w0 == 0 100 lea L(addmul_outer_0)(%rip), outer_addr 101 jmp L(mul_1_entry_0) 102 103L(mul_1_prologue_1): 104 cmp $-1, un 105 jne 2f 106 mov %rax, -8(rp) 107 mov %rdx, (rp) 108 jmp L(ret) 1092: add $1, n 110 lea L(addmul_outer_1)(%rip), outer_addr 111 mov %rax, w1 112 mov %rdx, w2 113 xor R32(w3), R32(w3) 114 mov (up,n,8), %rax 115 jmp L(mul_1_entry_1) 116 117L(mul_1_prologue_2): 118 add $-2, n 119 lea L(addmul_outer_2)(%rip), outer_addr 120 mov %rax, w0 121 mov %rdx, w1 122 mov 24(up,n,8), %rax 123 xor R32(w2), R32(w2) 124 xor R32(w3), R32(w3) 125 jmp L(mul_1_entry_2) 126 127L(mul_1_prologue_3): 128 add $-1, n 129 lea L(addmul_outer_3)(%rip), outer_addr 130 mov %rax, w3 131 mov %rdx, w0 132 jmp L(mul_1_entry_3) 133 134 135 C this loop is 10 c/loop = 2.5 c/l on K8, for all up/rp alignments 136 137 ALIGN(16) 138L(mul_1_top): 139 mov w0, -16(rp,n,8) 140 add %rax, w1 141 mov (up,n,8), %rax 142 
adc %rdx, w2 143L(mul_1_entry_1): 144 xor R32(w0), R32(w0) 145 mul v0 146 mov w1, -8(rp,n,8) 147 add %rax, w2 148 adc %rdx, w3 149L(mul_1_entry_0): 150 mov 8(up,n,8), %rax 151 mul v0 152 mov w2, (rp,n,8) 153 add %rax, w3 154 adc %rdx, w0 155L(mul_1_entry_3): 156 mov 16(up,n,8), %rax 157 mul v0 158 mov w3, 8(rp,n,8) 159 xor R32(w2), R32(w2) C zero 160 mov w2, w3 C zero 161 add %rax, w0 162 mov 24(up,n,8), %rax 163 mov w2, w1 C zero 164 adc %rdx, w1 165L(mul_1_entry_2): 166 mul v0 167 add $4, n 168 js L(mul_1_top) 169 170 mov w0, -16(rp) 171 add %rax, w1 172 mov w1, -8(rp) 173 adc %rdx, w2 174 mov w2, (rp) 175 176 add $-1, vn C vn -= 1 177 jz L(ret) 178 179 mov 8(vp), v0 180 mov 16(vp), v1 181 182 lea 8(vp), vp C vp += 1 183 lea 8(rp), rp C rp += 1 184 185 jmp *outer_addr 186 187C =========================================================== 188C mul_2 for vp[0], vp[1] if vn is even 189 190 ALIGN(16) 191L(mul_2): 192 mov 8(vp), v1 193 194 and $3, R32(w0) 195 jz L(mul_2_prologue_0) 196 cmp $2, R32(w0) 197 jz L(mul_2_prologue_2) 198 jc L(mul_2_prologue_1) 199 200L(mul_2_prologue_3): 201 lea L(addmul_outer_3)(%rip), outer_addr 202 add $2, n 203 mov %rax, -16(rp,n,8) 204 mov %rdx, w2 205 xor R32(w3), R32(w3) 206 xor R32(w0), R32(w0) 207 mov -16(up,n,8), %rax 208 jmp L(mul_2_entry_3) 209 210 ALIGN(16) 211L(mul_2_prologue_0): 212 add $3, n 213 mov %rax, w0 214 mov %rdx, w1 215 xor R32(w2), R32(w2) 216 mov -24(up,n,8), %rax 217 lea L(addmul_outer_0)(%rip), outer_addr 218 jmp L(mul_2_entry_0) 219 220 ALIGN(16) 221L(mul_2_prologue_1): 222 mov %rax, w3 223 mov %rdx, w0 224 xor R32(w1), R32(w1) 225 lea L(addmul_outer_1)(%rip), outer_addr 226 jmp L(mul_2_entry_1) 227 228 ALIGN(16) 229L(mul_2_prologue_2): 230 add $1, n 231 lea L(addmul_outer_2)(%rip), outer_addr 232 mov $0, R32(w0) 233 mov $0, R32(w1) 234 mov %rax, w2 235 mov -8(up,n,8), %rax 236 mov %rdx, w3 237 jmp L(mul_2_entry_2) 238 239 C this loop is 18 c/loop = 2.25 c/l on K8, for all up/rp alignments 240 241 ALIGN(16) 
242L(mul_2_top): 243 mov -32(up,n,8), %rax 244 mul v1 245 add %rax, w0 246 adc %rdx, w1 247 mov -24(up,n,8), %rax 248 xor R32(w2), R32(w2) 249 mul v0 250 add %rax, w0 251 mov -24(up,n,8), %rax 252 adc %rdx, w1 253 adc $0, R32(w2) 254L(mul_2_entry_0): 255 mul v1 256 add %rax, w1 257 mov w0, -24(rp,n,8) 258 adc %rdx, w2 259 mov -16(up,n,8), %rax 260 mul v0 261 mov $0, R32(w3) 262 add %rax, w1 263 adc %rdx, w2 264 mov -16(up,n,8), %rax 265 adc $0, R32(w3) 266 mov $0, R32(w0) 267 mov w1, -16(rp,n,8) 268L(mul_2_entry_3): 269 mul v1 270 add %rax, w2 271 mov -8(up,n,8), %rax 272 adc %rdx, w3 273 mov $0, R32(w1) 274 mul v0 275 add %rax, w2 276 mov -8(up,n,8), %rax 277 adc %rdx, w3 278 adc R32(w1), R32(w0) C adc $0, w0 279L(mul_2_entry_2): 280 mul v1 281 add %rax, w3 282 mov w2, -8(rp,n,8) 283 adc %rdx, w0 284 mov (up,n,8), %rax 285 mul v0 286 add %rax, w3 287 adc %rdx, w0 288 adc $0, R32(w1) 289L(mul_2_entry_1): 290 add $4, n 291 mov w3, -32(rp,n,8) 292 js L(mul_2_top) 293 294 mov -32(up,n,8), %rax 295 mul v1 296 add %rax, w0 297 mov w0, (rp) 298 adc %rdx, w1 299 mov w1, 8(rp) 300 301 add $-2, vn C vn -= 2 302 jz L(ret) 303 304 mov 16(vp), v0 305 mov 24(vp), v1 306 307 lea 16(vp), vp C vp += 2 308 lea 16(rp), rp C rp += 2 309 310 jmp *outer_addr 311 312 313C =========================================================== 314C addmul_2 for remaining vp's 315 316 C in the following prologues, we reuse un to store the 317 C adjusted value of n that is reloaded on each iteration 318 319L(addmul_outer_0): 320 add $3, un 321 lea 0(%rip), outer_addr 322 323 mov un, n 324 mov -24(up,un,8), %rax 325 mul v0 326 mov %rax, w0 327 mov -24(up,un,8), %rax 328 mov %rdx, w1 329 xor R32(w2), R32(w2) 330 jmp L(addmul_entry_0) 331 332L(addmul_outer_1): 333 mov un, n 334 mov (up,un,8), %rax 335 mul v0 336 mov %rax, w3 337 mov (up,un,8), %rax 338 mov %rdx, w0 339 xor R32(w1), R32(w1) 340 jmp L(addmul_entry_1) 341 342L(addmul_outer_2): 343 add $1, un 344 lea 0(%rip), outer_addr 345 346 mov un, n 347 
mov -8(up,un,8), %rax 348 mul v0 349 xor R32(w0), R32(w0) 350 mov %rax, w2 351 xor R32(w1), R32(w1) 352 mov %rdx, w3 353 mov -8(up,un,8), %rax 354 jmp L(addmul_entry_2) 355 356L(addmul_outer_3): 357 add $2, un 358 lea 0(%rip), outer_addr 359 360 mov un, n 361 mov -16(up,un,8), %rax 362 xor R32(w3), R32(w3) 363 mul v0 364 mov %rax, w1 365 mov -16(up,un,8), %rax 366 mov %rdx, w2 367 jmp L(addmul_entry_3) 368 369 C this loop is 19 c/loop = 2.375 c/l on K8, for all up/rp alignments 370 371 ALIGN(16) 372L(addmul_top): 373 add w3, -32(rp,n,8) 374 adc %rax, w0 375 mov -24(up,n,8), %rax 376 adc %rdx, w1 377 xor R32(w2), R32(w2) 378 mul v0 379 add %rax, w0 380 mov -24(up,n,8), %rax 381 adc %rdx, w1 382 adc R32(w2), R32(w2) C adc $0, w2 383L(addmul_entry_0): 384 mul v1 385 xor R32(w3), R32(w3) 386 add w0, -24(rp,n,8) 387 adc %rax, w1 388 mov -16(up,n,8), %rax 389 adc %rdx, w2 390 mul v0 391 add %rax, w1 392 mov -16(up,n,8), %rax 393 adc %rdx, w2 394 adc $0, R32(w3) 395L(addmul_entry_3): 396 mul v1 397 add w1, -16(rp,n,8) 398 adc %rax, w2 399 mov -8(up,n,8), %rax 400 adc %rdx, w3 401 mul v0 402 xor R32(w0), R32(w0) 403 add %rax, w2 404 adc %rdx, w3 405 mov $0, R32(w1) 406 mov -8(up,n,8), %rax 407 adc R32(w1), R32(w0) C adc $0, w0 408L(addmul_entry_2): 409 mul v1 410 add w2, -8(rp,n,8) 411 adc %rax, w3 412 adc %rdx, w0 413 mov (up,n,8), %rax 414 mul v0 415 add %rax, w3 416 mov (up,n,8), %rax 417 adc %rdx, w0 418 adc $0, R32(w1) 419L(addmul_entry_1): 420 mul v1 421 add $4, n 422 js L(addmul_top) 423 424 add w3, -8(rp) 425 adc %rax, w0 426 mov w0, (rp) 427 adc %rdx, w1 428 mov w1, 8(rp) 429 430 add $-2, vn C vn -= 2 431 jz L(ret) 432 433 lea 16(rp), rp C rp += 2 434 lea 16(vp), vp C vp += 2 435 436 mov (vp), v0 437 mov 8(vp), v1 438 439 jmp *outer_addr 440 441 ALIGN(16) 442L(ret): pop %r15 443 pop %r14 444 pop %r13 445 pop %r12 446 pop %rbp 447 pop %rbx 448 ret 449 450EPILOGUE() 451