dnl  AMD64 mpn_addmul_2 optimised for AMD Bulldozer.

dnl  Copyright 2017 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C AMD K8,K9
C AMD K10
C AMD bd1	 4.2
C AMD bd2	 4.4
C AMD bd3
C AMD bd4
C AMD zen
C AMD bt1
C AMD bt2
C Intel P4
C Intel PNR
C Intel NHM
C Intel SBR
C Intel IBR
C Intel HWL
C Intel BWL
C Intel SKL
C Intel atom
C Intel SLM
C VIA nano

C The loop of this code is the result of running a code generation and
C optimisation tool suite written by David Harvey and Torbjorn Granlund.
57 58define(`rp', `%rdi') C rcx 59define(`up', `%rsi') C rdx 60define(`n_param', `%rdx') C r8 61define(`vp', `%rcx') C r9 62 63define(`n', `%rcx') 64define(`v0', `%rbx') 65define(`v1', `%rbp') 66define(`X0', `%r12') 67define(`X1', `%r13') 68 69define(`w0', `%r8') 70define(`w1', `%r9') 71define(`w2', `%r10') 72define(`w3', `%r11') 73 74ABI_SUPPORT(DOS64) 75ABI_SUPPORT(STD64) 76 77ASM_START() 78 TEXT 79 ALIGN(32) 80PROLOGUE(mpn_addmul_2) 81 FUNC_ENTRY(4) 82 push %rbx 83 push %rbp 84 push %r12 85 push %r13 86 87 mov (vp), v0 88 mov 8(vp), v1 89 90 mov (up), %rax 91 mov $0, R32(w2) C abuse w2 92 93 lea (up,n_param,8), up 94 lea (rp,n_param,8), rp 95 sub n_param, w2 96 mul v0 97 98 test $1, R8(w2) 99 jnz L(bx1) 100 101L(bx0): mov %rdx, X0 102 mov %rax, X1 103 test $2, R8(w2) 104 jnz L(b10) 105 106L(b00): lea (w2), n C un = 4, 8, 12, ... 107 mov (up,w2,8), %rax 108 mov (rp,w2,8), w3 109 mul v1 110 mov %rax, w0 111 mov 8(up,w2,8), %rax 112 mov %rdx, w1 113 jmp L(lo0) 114 115L(b10): lea 2(w2), n C un = 2, 6, 10, ... 116 mov (up,w2,8), %rax 117 mov (rp,w2,8), w1 118 mul v1 119 mov %rdx, w3 120 mov %rax, w2 121 mov -8(up,n,8), %rax 122 test n, n 123 jz L(end) 124 jmp L(top) 125 126L(bx1): mov %rax, X0 127 mov %rdx, X1 128 test $2, R8(w2) 129 jz L(b11) 130 131L(b01): lea 1(w2), n C un = 1, 5, 9, ... 132 mov (up,w2,8), %rax 133 mul v1 134 mov (rp,w2,8), w2 135 mov %rdx, w0 136 mov %rax, w3 137 jmp L(lo1) 138 139L(b11): lea -1(w2), n C un = 3, 7, 11, ... 
140 mov (up,w2,8), %rax 141 mul v1 142 mov (rp,w2,8), w0 143 mov %rax, w1 144 mov 8(up,w2,8), %rax 145 mov %rdx, w2 146 jmp L(lo3) 147 148 ALIGN(32) 149L(top): 150L(lo2): mul v0 151 add w1, X1 152 mov X1, -16(rp,n,8) 153 mov %rdx, X1 154 adc %rax, X0 155 adc $0, X1 156 mov -8(up,n,8), %rax 157 mul v1 158 mov -8(rp,n,8), w1 159 mov %rdx, w0 160 add w1, w2 161 adc %rax, w3 162 adc $0, w0 163L(lo1): mov (up,n,8), %rax 164 mul v0 165 add w2, X0 166 mov X0, -8(rp,n,8) 167 mov %rdx, X0 168 adc %rax, X1 169 mov (up,n,8), %rax 170 adc $0, X0 171 mov (rp,n,8), w2 172 mul v1 173 add w2, w3 174 adc %rax, w0 175 mov 8(up,n,8), %rax 176 mov %rdx, w1 177 adc $0, w1 178L(lo0): mul v0 179 add w3, X1 180 mov X1, (rp,n,8) 181 adc %rax, X0 182 mov 8(up,n,8), %rax 183 mov %rdx, X1 184 adc $0, X1 185 mov 8(rp,n,8), w3 186 mul v1 187 add w3, w0 188 adc %rax, w1 189 mov 16(up,n,8), %rax 190 mov %rdx, w2 191 adc $0, w2 192L(lo3): mul v0 193 add w0, X0 194 mov X0, 8(rp,n,8) 195 mov %rdx, X0 196 adc %rax, X1 197 adc $0, X0 198 mov 16(up,n,8), %rax 199 mov 16(rp,n,8), w0 200 mul v1 201 mov %rdx, w3 202 add w0, w1 203 adc %rax, w2 204 adc $0, w3 205 mov 24(up,n,8), %rax 206 add $4, n 207 jnc L(top) 208 209L(end): mul v0 210 add w1, X1 211 mov X1, -16(rp) 212 mov %rdx, X1 213 adc %rax, X0 214 adc $0, X1 215 mov -8(up), %rax 216 mul v1 217 mov -8(rp), w1 218 add w1, w2 219 adc %rax, w3 220 adc $0, %rdx 221 add w2, X0 222 adc $0, X1 223 mov X0, -8(rp) 224 add w3, X1 225 mov X1, (rp) 226 adc $0, %rdx 227 mov %rdx, %rax 228 229 pop %r13 230 pop %r12 231 pop %rbp 232 pop %rbx 233 FUNC_EXIT() 234 ret 235EPILOGUE() 236