dnl  AMD64 mpn_addmul_1 and mpn_submul_1 optimised for AMD bt1/bt2.

dnl  Copyright 2003-2005, 2007, 2008, 2011, 2012, 2018-2019 Free Software
dnl  Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C AMD K8,K9	 4.52		old measurement
C AMD K10	 4.51		old measurement
C AMD bd1	 4.66		old measurement
C AMD bd2	 4.57		old measurement
C AMD bd3	 ?
C AMD bd4	 ?
C AMD zen	 ?
C AMD bt1	 5.04
C AMD bt2	 5.07
C Intel P4	16.8	18.6	old measurement
C Intel PNR	 5.59		old measurement
C Intel NHM	 5.39		old measurement
C Intel SBR	 3.93		old measurement
C Intel IBR	 3.59		old measurement
C Intel HWL	 3.61		old measurement
C Intel BWL	 2.76		old measurement
C Intel SKL	 2.77		old measurement
C Intel atom	23		old measurement
C Intel SLM	 8		old measurement
C Intel GLM	 ?
C VIA nano	 5.63		old measurement

C The ALIGNments here might look completely ad-hoc.  They are not.

C Syntax: AT&T operand order (source first, destination last), run through m4.
C
C func (rp, up, n, v0)
C
C Multiplies each of the n limbs at up by the single limb v0 and adds the
C products to (mpn_addmul_1) or subtracts them from (mpn_submul_1) the n limbs
C at rp, storing the result back at rp.  The value returned in %rax is the
C high limb of the last product plus the carry (or borrow) out of the last
C ADDSUB.
C
C The first limb is loaded from (up) before n is examined, so the code assumes
C n is at least 1.
C
C Structure:
C   * rp and up are advanced to point just past the end of their operands and
C     n is turned into a negative index that counts up by 4 per iteration; the
C     loop ends when the index is no longer negative.
C   * The loop is unrolled 4 ways.  The low two bits of n select one of four
C     feed-in paths, each of which performs the first multiplication and then
C     jumps to the matching point in the unrolled loop:
C	n mod 4 = 0  ->  L(b00), enters at L(L0)
C	n mod 4 = 1  ->  L(b01), falls into L(top), or L(n1) when n = 1
C	n mod 4 = 2  ->  L(bx0), enters at L(L2)
C	n mod 4 = 3  ->  L(b11), enters at L(L3)
C   * The products alternate between the register pairs w1:w0 and w3:w2.  Each
C     stage applies the low limb of the previous product to rp with ADDSUB and
C     folds the resulting carry plus the previous high limb into the product
C     just computed.

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

C Select the memory-destination instruction and the entry point name according
C to which of the two operations is being built.
ifdef(`OPERATION_addmul_1',`
      define(`ADDSUB',        `add')
      define(`func',  `mpn_addmul_1')
')
ifdef(`OPERATION_submul_1',`
      define(`ADDSUB',        `sub')
      define(`func',  `mpn_submul_1')
')

MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)

C Standard parameters
define(`rp',              `%rdi')
define(`up',              `%rsi')
define(`n_param',         `%rdx')
define(`v0',              `%rcx')
C Standard allocations
define(`n',               `%rbx')
define(`w0',              `%r8')
define(`w1',              `%r9')
define(`w2',              `%r10')
define(`w3',              `%r11')

C DOS64 parameters
IFDOS(` define(`rp',      `%rcx')    ') dnl
IFDOS(` define(`up',      `%rsi')    ') dnl
IFDOS(` define(`n_param', `%r8')     ') dnl
IFDOS(` define(`v0',      `%r9')     ') dnl
C DOS64 allocations
IFDOS(` define(`n',       `%rbx')    ') dnl
IFDOS(` define(`w0',      `%r8')     ') dnl
IFDOS(` define(`w1',      `%rdi')    ') dnl
IFDOS(` define(`w2',      `%r10')    ') dnl
IFDOS(` define(`w3',      `%r11')    ') dnl

ASM_START()
	TEXT
	ALIGN(64)
PROLOGUE(func)
C DOS64 only: %rsi and %rdi are callee-saved there and are used below as up
C and w1, so save them.  The second argument arrives in %rdx, which mul
C overwrites, so it is moved to %rsi.
IFDOS(`	push	%rsi		')
IFDOS(`	push	%rdi		')
IFDOS(`	mov	%rdx, %rsi	')

	push	%rbx			C callee-saved, used as the index n
	mov	(up), %rax		C first source limb, multiplied in the feed-in

	lea	(rp,n_param,8), rp	C rp = one past the last destination limb
	lea	(up,n_param,8), up	C up = one past the last source limb
	mov	n_param, n		C copy n before mul overwrites %rdx

	test	$1, R8(n_param)
	jne	L(bx1)			C n odd

C n even.  First product goes to w1:w0.
L(bx0):	mul	v0
	neg	n
	mov	%rax, w0
	mov	%rdx, w1
	test	$2, R8(n)		C bit 1 of -n equals bit 1 of n for even n
	jne	L(L2)			C n mod 4 = 2, index is -n

L(b00):	add	$2, n			C n mod 4 = 0, index is 2-n
	jmp	L(L0)

	ALIGN(16)
C n odd.  First product is still in %rdx:%rax.
L(bx1):	mul	v0
	test	$2, R8(n)
	je	L(b01)			C n mod 4 = 1

L(b11):	mov	%rax, w2		C n mod 4 = 3, product goes to w3:w2
	mov	%rdx, w3
	neg	n
	inc	n			C index is 1-n
	jmp	L(L3)

	ALIGN(16)
L(b01):	sub	$3, n
	jc	L(n1)			C borrow means n was 1
	mov	%rax, w2
	mov	%rdx, w3
	neg	n			C index is 3-n

	ALIGN(16)
L(top):	mov	-16(up,n,8), %rax
	mul	v0
	mov	%rax, w0
	mov	%rdx, w1
	ADDSUB	w2, -24(rp,n,8)		C apply previous low limb
	adc	w3, w0			C previous high limb + carry into new low
	adc	$0, w1
L(L0):	mov	-8(up,n,8), %rax
	mul	v0
	mov	%rax, w2
	mov	%rdx, w3
	ADDSUB	w0, -16(rp,n,8)
	adc	w1, w2
	adc	$0, w3
L(L3):	mov	(up,n,8), %rax
	mul	v0
	mov	%rax, w0
	mov	%rdx, w1
	ADDSUB	w2, -8(rp,n,8)
	adc	w3, w0
	adc	$0, w1
L(L2):	mov	8(up,n,8), %rax
	mul	v0
	mov	%rax, w2
	mov	%rdx, w3
	ADDSUB	w0, (rp,n,8)
	adc	w1, w2
	adc	$0, w3
	add	$4, n
	js	L(top)			C loop while the index is negative

C Wind-down: apply the last low limb and return high limb + carry.  The xor
C is done before ADDSUB so that it does not destroy the carry flag.
L(end):	xor	R32(%rax), R32(%rax)
	ADDSUB	w2, -8(rp)
	adc	w3, %rax
	pop	%rbx
IFDOS(`	pop	%rdi		')
IFDOS(`	pop	%rsi		')
	ret

	ALIGN(32)
C n = 1: the single product is still in %rdx:%rax.  %rax is cleared with mov
C rather than xor because mov leaves the carry flag from ADDSUB intact for
C the adc that follows.
L(n1):	ADDSUB	%rax, -8(rp)
	mov	$0, R32(%rax)
	adc	%rdx, %rax
	pop	%rbx
IFDOS(`	pop	%rdi		')
IFDOS(`	pop	%rsi		')
	ret
EPILOGUE()