dnl  AMD64 mpn_addmul_2 optimised for Intel Haswell.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  Copyright 2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C AMD K8,K9	n/a
C AMD K10	n/a
C AMD bull	n/a
C AMD pile	n/a
C AMD steam	n/a
C AMD excavator	 ?
C AMD bobcat	n/a
C AMD jaguar	n/a
C Intel P4	n/a
C Intel core	n/a
C Intel NHM	n/a
C Intel SBR	n/a
C Intel IBR	n/a
C Intel HWL	 2.15
C Intel BWL	 2.33
C Intel SKL	 2.22
C Intel atom	n/a
C Intel SLM	n/a
C VIA nano	n/a

C The loop of this code is the result of running a code generation and
C optimisation tool suite written by David Harvey and Torbjörn Granlund.
58 59define(`rp', `%rdi') 60define(`up', `%rsi') 61define(`n_param',`%rdx') 62define(`vp', `%rcx') 63 64define(`v0', `%r8') 65define(`v1', `%r9') 66define(`w0', `%rbx') 67define(`w1', `%rcx') 68define(`w2', `%rbp') 69define(`w3', `%r10') 70define(`n', `%r11') 71define(`X0', `%r12') 72define(`X1', `%r13') 73 74ABI_SUPPORT(DOS64) 75ABI_SUPPORT(STD64) 76 77ASM_START() 78 TEXT 79 ALIGN(32) 80PROLOGUE(mpn_addmul_2) 81 FUNC_ENTRY(4) 82 push %rbx 83 push %rbp 84 push %r12 85 push %r13 86 87 mov (vp), v0 88 mov 8(vp), v1 89 90 mov n_param, n 91 shr $2, n 92 93 test $1, R8(n_param) 94 jnz L(bx1) 95 96L(bx0): mov (rp), X0 97 mov 8(rp), X1 98 test $2, R8(n_param) 99 jnz L(b10) 100 101L(b00): mov (up), %rdx 102 lea 16(up), up 103 mulx( v0, %rax, w1) 104 add %rax, X0 105 mulx( v1, %rax, w2) 106 adc $0, w1 107 mov X0, (rp) 108 add %rax, X1 109 adc $0, w2 110 mov -8(up), %rdx 111 lea 16(rp), rp 112 jmp L(lo0) 113 114L(b10): mov (up), %rdx 115 inc n 116 mulx( v0, %rax, w1) 117 add %rax, X0 118 adc $0, w1 119 mulx( v1, %rax, w2) 120 mov X0, (rp) 121 mov 16(rp), X0 122 add %rax, X1 123 adc $0, w2 124 xor w0, w0 125 jmp L(lo2) 126 127L(bx1): mov (rp), X1 128 mov 8(rp), X0 129 test $2, R8(n_param) 130 jnz L(b11) 131 132L(b01): mov (up), %rdx 133 mulx( v0, %rax, w3) 134 add %rax, X1 135 adc $0, w3 136 mulx( v1, %rax, w0) 137 add %rax, X0 138 adc $0, w0 139 mov 8(up), %rdx 140 mov X1, (rp) 141 mov 16(rp), X1 142 mulx( v0, %rax, w1) 143 lea 24(rp), rp 144 lea 24(up), up 145 jmp L(lo1) 146 147L(b11): mov (up), %rdx 148 inc n 149 mulx( v0, %rax, w3) 150 add %rax, X1 151 adc $0, w3 152 mulx( v1, %rax, w0) 153 add %rax, X0 154 adc $0, w0 155 mov X1, (rp) 156 mov 8(up), %rdx 157 mulx( v0, %rax, w1) 158 lea 8(rp), rp 159 lea 8(up), up 160 jmp L(lo3) 161 162 ALIGN(16) 163L(top): mulx( v0, %rax, w3) 164 add w0, X1 165 adc $0, w2 166 add %rax, X1 167 adc $0, w3 168 mulx( v1, %rax, w0) 169 add %rax, X0 170 adc $0, w0 171 lea 32(rp), rp 172 add w1, X1 173 mov -16(up), %rdx 174 mov X1, -24(rp) 175 
adc $0, w3 176 add w2, X0 177 mov -8(rp), X1 178 mulx( v0, %rax, w1) 179 adc $0, w0 180L(lo1): add %rax, X0 181 mulx( v1, %rax, w2) 182 adc $0, w1 183 add w3, X0 184 mov X0, -16(rp) 185 adc $0, w1 186 add %rax, X1 187 adc $0, w2 188 add w0, X1 189 mov -8(up), %rdx 190 adc $0, w2 191L(lo0): mulx( v0, %rax, w3) 192 add %rax, X1 193 adc $0, w3 194 mov (rp), X0 195 mulx( v1, %rax, w0) 196 add %rax, X0 197 adc $0, w0 198 add w1, X1 199 mov X1, -8(rp) 200 adc $0, w3 201 mov (up), %rdx 202 add w2, X0 203 mulx( v0, %rax, w1) 204 adc $0, w0 205L(lo3): add %rax, X0 206 adc $0, w1 207 mulx( v1, %rax, w2) 208 add w3, X0 209 mov 8(rp), X1 210 mov X0, (rp) 211 mov 16(rp), X0 212 adc $0, w1 213 add %rax, X1 214 adc $0, w2 215L(lo2): mov 8(up), %rdx 216 lea 32(up), up 217 dec n 218 jnz L(top) 219 220L(end): mulx( v0, %rax, w3) 221 add w0, X1 222 adc $0, w2 223 add %rax, X1 224 adc $0, w3 225 mulx( v1, %rdx, %rax) 226 add w1, X1 227 mov X1, 8(rp) 228 adc $0, w3 229 add w2, %rdx 230 adc $0, %rax 231 add w3, %rdx 232 mov %rdx, 16(rp) 233 adc $0, %rax 234 235 pop %r13 236 pop %r12 237 pop %rbp 238 pop %rbx 239 FUNC_EXIT() 240 ret 241EPILOGUE() 242