dnl  AMD64 mpn_mullo_basecase optimised for AMD Zen.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2017 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C The inner loops of this code are the result of running a code generation and
C optimisation tool suite written by David Harvey and Torbjorn Granlund.

define(`rp',       `%rdi')
define(`up',       `%rsi')
define(`vp_param', `%rdx')
define(`n',        `%rcx')

define(`vp',       `%r11')
define(`nn',       `%rbp')

C TODO
C  * Rearrange feed-in jumps for short branch forms.
C  * Roll out the heavy artillery and 4-way unroll outer loop.  Since feed-in
C    code implodes, the blow-up will not be more than perhaps 2.5x.
C  * Micro-optimise critical lead-in code blocks.
C  * Clean up register use, e.g. r15 vs vp, disuse of nn, etc.
C  * Write n < 4 code specifically for Zen (current code is for Haswell).
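C For reference, mpn_mullo_basecase computes the n least significant limbs of
C the full 2n-limb product {up,n} * {vp,n}.  A minimal C sketch of those
C semantics follows; mullo_ref is a hypothetical helper for illustration only
C (it assumes 64-bit limbs and a compiler providing unsigned __int128), not
C GMP code:
C
C     void
C     mullo_ref (mp_limb_t *rp, const mp_limb_t *up, const mp_limb_t *vp,
C                mp_size_t n)
C     {
C       for (mp_size_t i = 0; i < n; i++)
C         rp[i] = 0;
C       for (mp_size_t i = 0; i < n; i++)     /* one row per v limb */
C         {
C           mp_limb_t cy = 0;
C           for (mp_size_t j = 0; i + j < n; j++)  /* truncate at limb n */
C             {
C               unsigned __int128 p = (unsigned __int128) up[j] * vp[i]
C                                     + rp[i + j] + cy;
C               rp[i + j] = (mp_limb_t) p;
C               cy = (mp_limb_t) (p >> 64);
C             }
C         }    /* cy and all limbs at index >= n are discarded */
C     }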
ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_mullo_basecase)
	FUNC_ENTRY(4)
	cmp	$4, R32(n)
	jae	L(big)

	mov	vp_param, vp
	mov	(up), %rdx

	cmp	$2, R32(n)
	jae	L(gt1)
L(n1):	imul	(vp), %rdx
	mov	%rdx, (rp)
	FUNC_EXIT()
	ret
L(gt1):	ja	L(gt2)
L(n2):	mov	(vp), %r9
	mulx(	%r9, %rax, %rdx)
	mov	%rax, (rp)
	mov	8(up), %rax
	imul	%r9, %rax
	add	%rax, %rdx
	mov	8(vp), %r9
	mov	(up), %rcx
	imul	%r9, %rcx
	add	%rcx, %rdx
	mov	%rdx, 8(rp)
	FUNC_EXIT()
	ret
L(gt2):
L(n3):	mov	(vp), %r9
	mulx(	%r9, %rax, %r10)	C u0 x v0
	mov	%rax, (rp)
	mov	8(up), %rdx
	mulx(	%r9, %rax, %rdx)	C u1 x v0
	imul	16(up), %r9		C u2 x v0
	add	%rax, %r10
	adc	%rdx, %r9
	mov	8(vp), %r8
	mov	(up), %rdx
	mulx(	%r8, %rax, %rdx)	C u0 x v1
	add	%rax, %r10
	adc	%rdx, %r9
	imul	8(up), %r8		C u1 x v1
	add	%r8, %r9
	mov	%r10, 8(rp)
	mov	16(vp), %r10
	mov	(up), %rax
	imul	%rax, %r10		C u0 x v2
	add	%r10, %r9
	mov	%r9, 16(rp)
	FUNC_EXIT()
	ret

	ALIGN(16)
L(big):	push	%r15
	push	%r14
	push	%r13
	push	%r12
	push	%rbp
	push	%rbx

	mov	(up), %r9
	lea	-8(up,n,8), up
	lea	-40(rp,n,8), rp

	mov	$4, R32(%r14)
	sub	n, %r14
	mov	-8(vp_param,n,8), %rbp
	imul	%r9, %rbp
	lea	8(vp_param), %r15
	mov	(vp_param), %rdx

	test	$1, R8(%r14)
	jnz	L(mx0)
L(mx1):	test	$2, R8(%r14)
	jz	L(mb3)

L(mb1):	mulx(	%r9, %rbx, %rax)
	lea	-2(%r14), n
	.byte	0xc4,0x22,0xb3,0xf6,0x44,0xf6,0xf0	C mulx -0x10(%rsi,%r14,8),%r9,%r8
	.byte	0xc4,0x22,0xa3,0xf6,0x54,0xf6,0xf8	C mulx -0x8(%rsi,%r14,8),%r11,%r10
	jmp	L(mlo1)

L(mb3):	mulx(	%r9, %r11, %r10)
	.byte	0xc4,0x22,0x93,0xf6,0x64,0xf6,0xf0	C mulx -0x10(%rsi,%r14,8),%r13,%r12
	.byte	0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0xf8	C mulx -0x8(%rsi,%r14,8),%rbx,%rax
	lea	(%r14), n
	jrcxz	L(x)
	jmp	L(mlo3)
L(x):	jmp	L(mcor)

L(mb2):	mulx(	%r9, %r13, %r12)
	.byte	0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0xf0	C mulx -0x10(%rsi,%r14,8),%rbx,%rax
	lea	-1(%r14), n
	.byte	0xc4,0x22,0xb3,0xf6,0x44,0xf6,0xf8	C mulx -0x8(%rsi,%r14,8),%r9,%r8
	jmp	L(mlo2)

L(mx0):	test	$2, R8(%r14)
	jz	L(mb2)

L(mb0):	mulx(	%r9, %r9, %r8)
	.byte	0xc4,0x22,0xa3,0xf6,0x54,0xf6,0xf0	C mulx -0x10(%rsi,%r14,8),%r11,%r10
	.byte	0xc4,0x22,0x93,0xf6,0x64,0xf6,0xf8	C mulx -0x8(%rsi,%r14,8),%r13,%r12
	lea	-3(%r14), n
	jmp	L(mlo0)

	ALIGN(16)
L(mtop):jrcxz	L(mend)
	adc	%r8, %r11
	mov	%r9, (rp,n,8)
L(mlo3):.byte	0xc4,0x62,0xb3,0xf6,0x04,0xce	C mulx (up,n,8), %r9, %r8
	adc	%r10, %r13
	mov	%r11, 8(rp,n,8)
L(mlo2):.byte	0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08	C mulx 8(up,n,8), %r11, %r10
	adc	%r12, %rbx
	mov	%r13, 16(rp,n,8)
L(mlo1):.byte	0xc4,0x62,0x93,0xf6,0x64,0xce,0x10	C mulx 16(up,n,8), %r13, %r12
	adc	%rax, %r9
	mov	%rbx, 24(rp,n,8)
L(mlo0):.byte	0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18	C mulx 24(up,n,8), %rbx, %rax
	lea	4(n), n
	jmp	L(mtop)

L(mend):mov	%r9, (rp)
	adc	%r8, %r11
	mov	%r11, 8(rp)
	adc	%r10, %r13
	mov	%r13, 16(rp)
	adc	%r12, %rbx
	mov	%rbx, 24(rp)

L(outer):
	mulx(	(up), %r10, %r8)	C FIXME r8 unused (use imul?)
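C (Reading of the step above, stated here as an assumption from the code, not
C from any GMP documentation: the mulx forms v[j] x u[top], and only its low
C half %r10 lands inside the truncated product, in the top-limb accumulator
C %rbp; the high half %r8 would belong to limb n, which mullo discards.  That
C is what the FIXME refers to: an imul would compute just the low half.)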
	adc	%rax, %rbp
	add	%r10, %rbp
	mov	(%r15), %rdx
	add	$8, %r15
	mov	-24(up,%r14,8), %r8
	lea	-8(up), up

	test	$1, R8(%r14)
	jz	L(x0)
L(x1):	test	$2, R8(%r14)
	jnz	L(b3)

L(b1):	mulx(	%r8, %rbx, %rax)
	lea	-1(%r14), n
	.byte	0xc4,0x62,0xb3,0xf6,0x04,0xce	C mulx (%rsi,%rcx,8),%r9,%r8
	.byte	0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08	C mulx 0x8(%rsi,%rcx,8),%r11,%r10
	jmp	L(lo1)

L(x0):	test	$2, R8(%r14)
	jz	L(b2)

L(b0):	mulx(	%r8, %r9, %r8)
	lea	-2(%r14), n
	.byte	0xc4,0x22,0xa3,0xf6,0x54,0xf6,0xf8	C mulx -0x8(%rsi,%r14,8),%r11,%r10
	.byte	0xc4,0x22,0x93,0xf6,0x24,0xf6	C mulx (%rsi,%r14,8),%r13,%r12
	jmp	L(lo0)

L(b3):	mulx(	%r8, %r11, %r10)
	lea	1(%r14), n
	.byte	0xc4,0x22,0x93,0xf6,0x64,0xf6,0xf8	C mulx -0x8(%rsi,%r14,8),%r13,%r12
	.byte	0xc4,0xa2,0xe3,0xf6,0x04,0xf6	C mulx (%rsi,%r14,8),%rbx,%rax
	add	%r10, %r13
	adc	%r12, %rbx
	adc	$0, %rax
	jrcxz	L(cor)
	jmp	L(lo3)

L(cor):	add	8(rp), %r11
	mov	16(rp), %r10
	mov	24(rp), %r12
L(mcor):mov	%r11, 8(rp)
	adc	%r10, %r13
	adc	%r12, %rbx
	mulx(	(up), %r10, %r8)	C FIXME r8 unused (use imul?)
	adc	%rax, %rbp
	add	%r10, %rbp
	mov	(%r15), %rdx
	mov	-24(up), %r8
	mulx(	%r8, %r9, %r12)
	mulx(	-16,(up), %r14, %rax)
	add	%r12, %r14
	adc	$0, %rax
	adc	%r9, %r13
	mov	%r13, 16(rp)
	adc	%r14, %rbx
	mulx(	-8,(up), %r10, %r8)	C FIXME r8 unused (use imul?)
	adc	%rax, %rbp
	add	%r10, %rbp
	mov	8(%r15), %rdx
	mulx(	-24,(up), %r14, %rax)
	add	%r14, %rbx
	mov	%rbx, 24(rp)
	mulx(	-16,(up), %r10, %r8)	C FIXME r8 unused (use imul?)
	adc	%rax, %rbp
	add	%r10, %rbp
	mov	%rbp, 32(rp)
	pop	%rbx
	pop	%rbp
	pop	%r12
	pop	%r13
	pop	%r14
	pop	%r15
	FUNC_EXIT()
	ret

L(b2):	mulx(	%r8, %r13, %r12)
	lea	(%r14), n
	.byte	0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0xf8	C mulx -0x8(%rsi,%r14,8),%rbx,%rax
	add	%r12, %rbx
	adc	$0, %rax
	.byte	0xc4,0x22,0xb3,0xf6,0x04,0xf6	C mulx (%rsi,%r14,8),%r9,%r8
	jmp	L(lo2)

	ALIGN(16)
L(top):	add	%r9, (rp,n,8)
L(lo3):	.byte	0xc4,0x62,0xb3,0xf6,0x04,0xce	C mulx (up,n,8), %r9, %r8
	adc	%r11, 8(rp,n,8)
L(lo2):	.byte	0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08	C mulx 8(up,n,8), %r11, %r10
	adc	%r13, 16(rp,n,8)
L(lo1):	.byte	0xc4,0x62,0x93,0xf6,0x64,0xce,0x10	C mulx 16(up,n,8), %r13, %r12
	adc	%rbx, 24(rp,n,8)
	adc	%rax, %r9
L(lo0):	.byte	0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18	C mulx 24(up,n,8), %rbx, %rax
	adc	%r8, %r11
	adc	%r10, %r13
	adc	%r12, %rbx
	adc	$0, %rax
	add	$4, n
	js	L(top)

	add	%r9, (rp)
	adc	%r11, 8(rp)
	adc	%r13, 16(rp)
	adc	%rbx, 24(rp)
	inc	%r14
	jmp	L(outer)
EPILOGUE()
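C Call-site sketch, for illustration only.  Assumptions not taken from this
C file: mpn_mullo_basecase is GMP-internal (declared in gmp-impl.h), rp must
C not overlap the inputs, and n >= 1.
C
C     mp_limb_t up[8], vp[8], rp[8];
C     /* ... fill up[] and vp[] ... */
C     mpn_mullo_basecase (rp, up, vp, 8);  /* rp <- low 8 limbs of u*v */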