dnl  AMD64 mpn_mul_1 optimised for AMD bt1/bt2.

dnl  Copyright 2003-2005, 2007, 2008, 2011, 2012, 2019 Free Software
dnl  Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C AMD K8,K9	 4.53		old measurement
C AMD K10	 4.53		old measurement
C AMD bd1	 4.56		old measurement
C AMD bd2	 4.47		old measurement
C AMD bd3	 ?
C AMD bd4	 ?
C AMD zen	 ?
C AMD bt1	 5.12
C AMD bt2	 5.17
C Intel P4	12.6		old measurement
C Intel PNR	 4.53		old measurement
C Intel NHM	 4.36		old measurement
C Intel SBR	 3.0		old measurement
C Intel IBR	 2.55		old measurement
C Intel HWL	 2.28		old measurement
C Intel BWL	 2.36		old measurement
C Intel SKL	 2.39		old measurement
C Intel atom	21.0		old measurement
C Intel SLM	 9		old measurement
C Intel GLM	 ?
C VIA nano	 ?

C The loop of this code is the result of running a code generation and
C optimisation tool suite written by David Harvey and Torbjorn Granlund.
ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

C Register roles used by both entry points below.
C
C   rp       result pointer; moved to one limb past the end of the result
C   up       source pointer; moved to one limb past the end of the source
C   n_param  limb count as passed in by the caller
C   v0       single-limb multiplier
C   cy       carry-in limb; read only by mpn_mul_1c
C   n        negated and biased limb index that counts up towards zero; it
C            lives in %rbx, which both functions push on entry and pop on exit
C   w0-w3    low and high product limbs, rotating through the unrolled loop
C
C In the standard ABI cy and w0 are both %r8, which is why mpn_mul_1c copies
C cy into w2 before anything writes w0.

C Standard parameters
define(`rp',      `%rdi')
define(`up',      `%rsi')
define(`n_param', `%rdx')
define(`v0',      `%rcx')
define(`cy',      `%r8')
C Standard allocations
define(`n',       `%rbx')
define(`w0',      `%r8')
define(`w1',      `%r9')
define(`w2',      `%r10')
define(`w3',      `%r11')

C DOS64 parameters
C The entry code copies the second argument from %rdx into %rsi.  The cy
C operand is a stack slot addressed after %rsi and %rdi have been pushed.
IFDOS(`	define(`rp',      `%rcx')    ') dnl
IFDOS(`	define(`up',      `%rsi')    ') dnl
IFDOS(`	define(`n_param', `%r8')     ') dnl
IFDOS(`	define(`v0',      `%r9')     ') dnl
IFDOS(`	define(`cy',      `56(%rsp)')') dnl
C DOS64 allocations
IFDOS(`	define(`n',       `%rbx')    ') dnl
IFDOS(`	define(`w0',      `%r8')     ') dnl
IFDOS(`	define(`w1',      `%rdi')    ') dnl
IFDOS(`	define(`w2',      `%r10')    ') dnl
IFDOS(`	define(`w3',      `%r11')    ') dnl

C ---------------------------------------------------------------------------
C mpn_mul_1
C
C Multiplies the n_param limbs at up by v0, stores the low limbs of the
C running result at rp, and returns the final high limb in %rax.
C
C NOTE(review): the first source limb is loaded unconditionally, so this
C presumably requires n_param >= 1.  Confirm against the callers.
C
C The loop is unrolled four ways.  The code before L(top) computes the first
C product and then enters the loop at the stage selected by n_param mod 4.
C ---------------------------------------------------------------------------

C The start-of-file and section directives are placed ahead of the first
C function so that every entry point in this file is assembled after them.
ASM_START()
	TEXT
	ALIGN(64)
PROLOGUE(mpn_mul_1)
IFDOS(`	push	%rsi		')
IFDOS(`	push	%rdi		')
IFDOS(`	mov	%rdx, %rsi	')

	push	%rbx
	mov	(up), %rax		C first source limb

	lea	(rp,n_param,8), rp	C point rp and up just past their
	lea	(up,n_param,8), up	C last limbs
	mov	n_param, n

	test	$1, R8(n_param)
	jne	L(bx1)

C Even limb count.  The first product goes to w0 and w1.
L(bx0):	mul	v0
	neg	n
	mov	%rax, w0
	mov	%rdx, w1
	test	$2, R8(n)
	jne	L(L2)			C count is 2 mod 4

C Count is 0 mod 4.
L(b00):	add	$2, n
	jmp	L(L0)

C Count is 3 mod 4.  The first product goes to w2 and w3.
	ALIGN(16)
L(b11):	mov	%rax, w2
	mov	%rdx, w3
	neg	n
	inc	n
	jmp	L(L3)

C Odd limb count.
	ALIGN(16)
L(bx1):	mul	v0
	test	$2, R8(n)
	jne	L(b11)

C Count is 1 mod 4.  The subtraction borrows only when the count is 1, and
C that case is finished at L(n1) without entering the loop.
L(b01):	sub	$3, n
	jc	L(n1)
	mov	%rax, w2
	mov	%rdx, w3
	neg	n

C Each of the four stages loads one source limb, multiplies it by v0, stores
C the low limb completed by the previous stage, then adds the previous high
C limb into the new low limb and carries into the new high limb.
	ALIGN(16)
L(top):	mov	-16(up,n,8), %rax
	mul	v0
	mov	%rax, w0
	mov	%rdx, w1
	mov	w2, -24(rp,n,8)
	add	w3, w0
	adc	$0, w1
L(L0):	mov	-8(up,n,8), %rax
	mul	v0
	mov	%rax, w2
	mov	%rdx, w3
	mov	w0, -16(rp,n,8)
	add	w1, w2
	adc	$0, w3
L(L3):	mov	(up,n,8), %rax
	mul	v0
	mov	%rax, w0
	mov	%rdx, w1
	mov	w2, -8(rp,n,8)
	add	w3, w0
	adc	$0, w1
L(L2):	mov	8(up,n,8), %rax
	mul	v0
	mov	%rax, w2
	mov	%rdx, w3
	mov	w0, (rp,n,8)
	add	w1, w2
	adc	$0, w3
	add	$4, n
	js	L(top)			C loop while the index is negative

C Store the last low limb and return the last high limb.
L(end):	mov	w2, -8(rp)
	mov	w3, %rax
	pop	%rbx
IFDOS(`	pop	%rdi		')
IFDOS(`	pop	%rsi		')
	ret

C Single-limb case: store the low half, return the high half.
	ALIGN(32)
L(n1):	mov	%rax, -8(rp)
	mov	%rdx, %rax
	pop	%rbx
IFDOS(`	pop	%rdi		')
IFDOS(`	pop	%rsi		')
	ret
EPILOGUE()

C ---------------------------------------------------------------------------
C mpn_mul_1c
C
C Same operation as mpn_mul_1, with the extra limb cy added into the first
C product.  After that first product is set up, control jumps into the loop
C of mpn_mul_1 above at L(top), L(L0), L(L2) or L(L3), and the return is
C done through L(end) there.  Only the single-limb case returns from this
C function body, at L(m1).
C ---------------------------------------------------------------------------
	ALIGN(64)
PROLOGUE(mpn_mul_1c)
IFDOS(`	push	%rsi		')
IFDOS(`	push	%rdi		')
IFDOS(`	mov	%rdx, %rsi	')
	mov	cy, w2			C fetch cy before w0 can be written
	push	%rbx
	mov	(up), %rax		C first source limb

	lea	(rp,n_param,8), rp	C point rp and up just past their
	lea	(up,n_param,8), up	C last limbs
	mov	n_param, n

	test	$1, R8(n_param)
	jne	L(cx1)

C Even limb count.  The first product plus cy goes to w0 and w1.
L(cx0):	mul	v0
	neg	n
	mov	%rax, w0
	mov	%rdx, w1
	add	w2, w0
	adc	$0, w1
	test	$2, R8(n)
	jne	L(L2)			C count is 2 mod 4

C Count is 0 mod 4.
L(c00):	add	$2, n
	jmp	L(L0)

C Odd limb count.
	ALIGN(16)
L(cx1):	mul	v0
	test	$2, R8(n)
	je	L(c01)

C Count is 3 mod 4.  The first product plus cy goes to w2 and w3.  The mov
C between add and adc leaves the carry flag untouched.
L(c11):	neg	n
	inc	n
	add	%rax, w2
	mov	%rdx, w3
	adc	$0, w3
	jmp	L(L3)

C Count is 1 mod 4.  A count of exactly 1 is finished at L(m1).
L(c01):	cmp	$1, n
	jz	L(m1)
	neg	n
	add	$3, n
	add	%rax, w2
	mov	%rdx, w3
	adc	$0, w3
	jmp	L(top)

C Single-limb case: store low half plus cy, return high half plus carry.
	ALIGN(32)
L(m1):	add	%rax, w2
	mov	%rdx, %rax
	mov	w2, -8(rp)
	adc	$0, %rax
	pop	%rbx
IFDOS(`	pop	%rdi		')
IFDOS(`	pop	%rsi		')
	ret
EPILOGUE()