dnl  AMD64 mpn_add_n, mpn_sub_n optimised for Sandy bridge, Ivy bridge, and
dnl  Haswell.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  Copyright 2003-2005, 2007, 2008, 2010-2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C AMD K8,K9	 1.75\2.52
C AMD K10	 1.5
C AMD bd1	 1.69\2.25
C AMD bd2	 1.65
C AMD bd3	 ?
C AMD bd4	 ?
C AMD zen	 1.5
C AMD bt1	 2.67
C AMD bt2	 2.16
C Intel P4	11.54
C Intel PNR	 5
C Intel NHM	 5.5
C Intel SBR	 1.54
C Intel IBR	 1.5
C Intel HWL	 1.32
C Intel BWL	 1.07
C Intel SKL	 1.21
C Intel atom	 4.3
C Intel SLM	 3
C VIA nano	 ?

C The loop of this code was manually written.  It runs close to optimally on
C Intel SBR, IBR, and HWL as far as we know, except for the fluctuation
C problems.  It also runs slightly faster on average on AMD bd1 and bd2.
C
C No micro-optimisation has been done.
C
C N.B.!
C The loop alignment padding insns are executed.  If editing the code,
C make sure the padding does not become excessive.  It is now a 4-byte nop.

C Register roles.  The trailing comment on each define presumably names the
C DOS64 location of the same argument (to be confirmed against FUNC_ENTRY).
define(`rp',	`%rdi')	C rcx
define(`up',	`%rsi')	C rdx
define(`vp',	`%rdx')	C r8
define(`n',	`%rcx')	C r9
define(`cy',	`%r8')	C rsp+40    (mpn_add_nc and mpn_sub_nc)

ifdef(`OPERATION_add_n', `
	define(ADCSBB,	      adc)
	define(func,	      mpn_add_n)
	define(func_nc,	      mpn_add_nc)')
ifdef(`OPERATION_sub_n', `
	define(ADCSBB,	      sbb)
	define(func,	      mpn_sub_n)
	define(func_nc,	      mpn_sub_nc)')

MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

C ---------------------------------------------------------------------------
C func    (mpn_add_n  or mpn_sub_n)   - carry-in is forced to zero
C func_nc (mpn_add_nc or mpn_sub_nc)  - carry-in taken from cy
C
C In:	rp = destination limbs, up and vp = source limbs (8 bytes each),
C	n = limb count, cy = carry-in (func_nc only)
C Out:	rax = carry or borrow out of the last limb, 0 or 1
C Uses:	rax, rcx, r8, r9, r10, r11 and the flags; rdi, rsi, rdx are advanced
C
C Structure: the low two bits of n pick one of four feed-in blocks (b00,
C b10, b01, b11); n div 4 stays in rcx as the loop counter.  Each feed-in
C block handles the leading limbs and enters the 4-way unrolled loop at
C the matching point.  The last two result limbs are held in r10/r11 and
C stored at L(end).
C
C Invariant: the carry flag is live from the neg in the feed-in block until
C the final setc.  Everything in between is mov, lea, jmp, jrcxz, jnz, dec
C or ADCSBB, none of which disturbs the carry chain.  Do not insert
C flag-writing instructions such as add, sub, cmp or test on those paths.
C
C NOTE(review): n = 0 would take the b00 path and touch four limbs; callers
C presumably always pass n >= 1 - confirm against the mpn interface rules.
C ---------------------------------------------------------------------------

ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(func)
	FUNC_ENTRY(4)
	xor	%r8, %r8		C plain entry point: no carry-in

L(ent):	mov	R32(n), R32(%rax)	C keep low bits of n for the dispatch
	shr	$2, n			C rcx = n div 4, the loop counter

	test	$1, R8(%rax)
	jnz	L(bx1)			C n is odd

L(bx0):	test	$2, R8(%rax)
	jnz	L(b10)			C n mod 4 = 2

C n mod 4 = 0: do four limbs, store the first two at L(lo0).
L(b00):	neg	%r8			C CF = 1 iff carry-in is nonzero
	mov	(up), %r8
	mov	8(up), %r9
	ADCSBB	(vp), %r8
	ADCSBB	8(vp), %r9
	mov	16(up), %r10
	mov	24(up), %r11
	lea	32(up), up
	ADCSBB	16(vp), %r10
	ADCSBB	24(vp), %r11
	lea	32(vp), vp
	lea	-16(rp), rp		C bias rp: L(lo0) stores at 16 and 24
	jmp	L(lo0)

C n mod 4 = 2: do two limbs, finish at once if no loop passes remain.
L(b10):	neg	%r8			C CF = 1 iff carry-in is nonzero
	mov	(up), %r10
	mov	8(up), %r11
	ADCSBB	0(vp), %r10
	ADCSBB	8(vp), %r11
	jrcxz	L(e2)			C n = 2; jrcxz leaves the flags alone
	mov	16(up), %r8
	mov	24(up), %r9
	lea	16(up), up
	ADCSBB	16(vp), %r8
	ADCSBB	24(vp), %r9
	lea	16(vp), vp
C	lea	(rp), rp
	jmp	L(lo2)

L(e2):	mov	%r10, (rp)
	mov	%r11, 8(rp)
	setc	R8(%rax)		C eax was 2 here, so rax becomes 0 or 1
	FUNC_EXIT()
	ret

L(bx1):	test	$2, R8(%rax)
	jnz	L(b11)			C n mod 4 = 3

C n mod 4 = 1: do one limb, finish at once if no loop passes remain.
L(b01):	neg	%r8			C CF = 1 iff carry-in is nonzero
	mov	(up), %r11
	ADCSBB	(vp), %r11
	jrcxz	L(e1)			C n = 1
	mov	8(up), %r8
	mov	16(up), %r9
	lea	8(up), up
	lea	-8(rp), rp		C bias rp: L(lo1) stores r11 at 8(rp)
	ADCSBB	8(vp), %r8
	ADCSBB	16(vp), %r9
	lea	8(vp), vp
	jmp	L(lo1)

L(e1):	mov	%r11, (rp)
	setc	R8(%rax)		C eax was 1 here, so rax becomes 0 or 1
	FUNC_EXIT()
	ret

C n mod 4 = 3: do three limbs, store the first, then fall into the loop
C (or skip it when no loop passes remain).
L(b11):	neg	%r8			C CF = 1 iff carry-in is nonzero
	mov	(up), %r9
	ADCSBB	(vp), %r9
	mov	8(up), %r10
	mov	16(up), %r11
	lea	24(up), up
	ADCSBB	8(vp), %r10
	ADCSBB	16(vp), %r11
	lea	24(vp), vp
	mov	%r9, (rp)
	lea	8(rp), rp
	jrcxz	L(end)			C n = 3

C Main loop: four limbs per pass.  r10/r11 from the previous half are
C stored while r8/r9 of the current half are already computed.
	ALIGN(32)
L(top):	mov	(up), %r8
	mov	8(up), %r9
	ADCSBB	(vp), %r8
	ADCSBB	8(vp), %r9
L(lo2):	mov	%r10, (rp)
L(lo1):	mov	%r11, 8(rp)
	mov	16(up), %r10
	mov	24(up), %r11
	lea	32(up), up
	ADCSBB	16(vp), %r10
	ADCSBB	24(vp), %r11
	lea	32(vp), vp
L(lo0):	mov	%r8, 16(rp)
L(lo3):	mov	%r9, 24(rp)
	lea	32(rp), rp
	dec	n			C dec keeps CF, so the carry chain survives
	jnz	L(top)

L(end):	mov	R32(n), R32(%rax)	C zero rax
	mov	%r10, (rp)		C flush the two limbs still in flight
	mov	%r11, 8(rp)
	setc	R8(%rax)		C return carry or borrow out
	FUNC_EXIT()
	ret
EPILOGUE()

C Carry-in entry point.  With STD64 the carry-in is already in r8.  With
C DOS64 it is loaded from the stack; the offset 56 is presumably the
C documented rsp+40 plus 16 bytes pushed by FUNC_ENTRY - confirm against
C the macro definition.
	ALIGN(16)
PROLOGUE(func_nc)
	FUNC_ENTRY(4)
IFDOS(`	mov	56(%rsp), %r8	')
	jmp	L(ent)
EPILOGUE()