1dnl AMD64 mpn_addmul_1 and mpn_submul_1 optimised for AMD bt1/bt2. 2 3dnl Copyright 2003-2005, 2007, 2008, 2011, 2012, 2018-2019 Free Software 4dnl Foundation, Inc. 5 6dnl This file is part of the GNU MP Library. 7dnl 8dnl The GNU MP Library is free software; you can redistribute it and/or modify 9dnl it under the terms of either: 10dnl 11dnl * the GNU Lesser General Public License as published by the Free 12dnl Software Foundation; either version 3 of the License, or (at your 13dnl option) any later version. 14dnl 15dnl or 16dnl 17dnl * the GNU General Public License as published by the Free Software 18dnl Foundation; either version 2 of the License, or (at your option) any 19dnl later version. 20dnl 21dnl or both in parallel, as here. 22dnl 23dnl The GNU MP Library is distributed in the hope that it will be useful, but 24dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 25dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 26dnl for more details. 27dnl 28dnl You should have received copies of the GNU General Public License and the 29dnl GNU Lesser General Public License along with the GNU MP Library. If not, 30dnl see https://www.gnu.org/licenses/. 31 32include(`../config.m4') 33 34C cycles/limb 35C AMD K8,K9 4.52 old measurement 36C AMD K10 4.51 old measurement 37C AMD bd1 4.66 old measurement 38C AMD bd2 4.57 old measurement 39C AMD bd3 ? 40C AMD bd4 ? 41C AMD zen ? 42C AMD bt1 5.04 43C AMD bt2 5.07 44C Intel P4 16.8 18.6 old measurement 45C Intel PNR 5.59 old measurement 46C Intel NHM 5.39 old measurement 47C Intel SBR 3.93 old measurement 48C Intel IBR 3.59 old measurement 49C Intel HWL 3.61 old measurement 50C Intel BWL 2.76 old measurement 51C Intel SKL 2.77 old measurement 52C Intel atom 23 old measurement 53C Intel SLM 8 old measurement 54C Intel GLM ? 55C VIA nano 5.63 old measurement 56 57C The ALIGNment here might look completely ad-hoc. They are not. 
ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

C Pick the exported name and the read-modify-write instruction from
C whichever OPERATION_* symbol is defined for this build.
ifdef(`OPERATION_addmul_1',`
      define(`func',   `mpn_addmul_1')
      define(`ADDSUB', `add')
')
ifdef(`OPERATION_submul_1',`
      define(`func',   `mpn_submul_1')
      define(`ADDSUB', `sub')
')

MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)

C Register roles, STD64.  The four arguments arrive in this order.
define(`rp',      `%rdi')
define(`up',      `%rsi')
define(`n_param', `%rdx')
define(`v0',      `%rcx')
C Working registers, STD64.
define(`n',       `%rbx')
define(`w0',      `%r8')
define(`w1',      `%r9')
define(`w2',      `%r10')
define(`w3',      `%r11')

C Register roles, DOS64.  These override the definitions above.  The
C second argument arrives in rdx and is copied to rsi on entry.
IFDOS(` define(`rp',      `%rcx') ') dnl
IFDOS(` define(`up',      `%rsi') ') dnl
IFDOS(` define(`n_param', `%r8')  ') dnl
IFDOS(` define(`v0',      `%r9')  ') dnl
C Working registers, DOS64.  w1 moves to rdi because r9 holds v0.
IFDOS(` define(`n',       `%rbx') ') dnl
IFDOS(` define(`w0',      `%r8')  ') dnl
IFDOS(` define(`w1',      `%rdi') ') dnl
IFDOS(` define(`w2',      `%r10') ') dnl
IFDOS(` define(`w3',      `%r11') ') dnl

C func is mpn_addmul_1 or mpn_submul_1, as selected above.
C
C For each of the n_param 8-byte limbs, the low word of up[i] * v0,
C plus the high word and carry handed on from limb i-1, is applied to
C rp[i] in place with ADDSUB.  rax returns the high word of the last
C product plus the final carry or borrow.
C
C The first limb is loaded before n_param is examined, so n_param is
C presumably required to be at least 1 -- TODO confirm against callers.
C
C rbx is pushed on entry and popped on both exits.  On DOS64 the same is
C done for rsi and rdi.

ASM_START()
	TEXT
	ALIGN(64)
PROLOGUE(func)
IFDOS(`	push	%rsi		')
IFDOS(`	push	%rdi		')
IFDOS(`	mov	%rdx, %rsi	')

	push	%rbx
	mov	(up), %rax		C first source limb

	lea	(rp,n_param,8), rp	C rp now points just past its last limb
	lea	(up,n_param,8), up	C likewise for up
	mov	n_param, n

	test	$1, R8(n_param)
	jnz	L(odd)

C Even count.  The first product goes to w0,w1 and n becomes the negated
C count.  Bit 1 of the negated count equals bit 1 of an even count, so the
C test below separates counts of the form 4k+2 from those of the form 4k.
L(even):
	mul	v0
	neg	n
	mov	%rax, w0
	mov	%rdx, w1
	test	$2, R8(n)
	jnz	L(in2)

L(mod0):
	add	$2, n
	jmp	L(in0)

C Odd count.  n still holds the positive count when bit 1 is tested.
	ALIGN(16)
L(odd):
	mul	v0
	test	$2, R8(n)
	jz	L(mod1)

L(mod3):
	mov	%rax, w2
	mov	%rdx, w3
	neg	n
	inc	n
	jmp	L(in3)

	ALIGN(16)
L(mod1):
	sub	$3, n
	jc	L(one)			C borrow here means the count was 1
	mov	%rax, w2
	mov	%rdx, w3
	neg	n

C Main loop, four limbs per pass.  n is a negative limb index relative to
C the end pointers and advances by 4 until it is no longer negative.  Each
C stage multiplies one source limb, applies the low word of the previous
C product to rp, and adds the resulting carry or borrow together with the
C previous high word into the new product.  Products alternate between
C the register pairs w0,w1 and w2,w3.
	ALIGN(16)
L(loop):
	mov	-16(up,n,8), %rax
	mul	v0
	mov	%rax, w0
	mov	%rdx, w1
	ADDSUB	w2, -24(rp,n,8)
	adc	w3, w0
	adc	$0, w1
L(in0):
	mov	-8(up,n,8), %rax
	mul	v0
	mov	%rax, w2
	mov	%rdx, w3
	ADDSUB	w0, -16(rp,n,8)
	adc	w1, w2
	adc	$0, w3
L(in3):
	mov	(up,n,8), %rax
	mul	v0
	mov	%rax, w0
	mov	%rdx, w1
	ADDSUB	w2, -8(rp,n,8)
	adc	w3, w0
	adc	$0, w1
L(in2):
	mov	8(up,n,8), %rax
	mul	v0
	mov	%rax, w2
	mov	%rdx, w3
	ADDSUB	w0, (rp,n,8)
	adc	w1, w2
	adc	$0, w3
	add	$4, n
	js	L(loop)

C Wind down.  rax is cleared before the last ADDSUB so that the carry
C flag it produces is still intact for the adc that forms the result.
L(done):
	xor	R32(%rax), R32(%rax)
	ADDSUB	w2, -8(rp)
	adc	w3, %rax
	pop	%rbx
IFDOS(`	pop	%rdi		')
IFDOS(`	pop	%rsi		')
	ret

C Single limb.  rax is zeroed with mov rather than xor because the carry
C flag from ADDSUB has to survive until the adc.
	ALIGN(32)
L(one):
	ADDSUB	%rax, -8(rp)
	mov	$0, R32(%rax)
	adc	%rdx, %rax
	pop	%rbx
IFDOS(`	pop	%rdi		')
IFDOS(`	pop	%rsi		')
	ret
EPILOGUE()