dnl  AMD64 mpn_addmul_1 and mpn_submul_1 optimised for AMD Bulldozer.

dnl  Copyright 2003-2005, 2007, 2008, 2011, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C                cycles/limb
C AMD K8,K9       3.30    3.58
C AMD K10         3.09
C AMD bull        4.47    4.72
C AMD pile        4.66
C AMD steam
C AMD excavator
C AMD bobcat      6.30
C AMD jaguar      6.29
C Intel P4       17.3    17.8
C Intel core2     5.13
C Intel NHM       4.85
C Intel SBR       3.83
C Intel IBR       3.75
C Intel HWL       3.45
C Intel BWL       2.56
C Intel SKL       2.53
C Intel atom     20.3
C Intel SLM       9
C VIA nano

C The loop of this code is the result of running a code generation and
C optimisation tool suite written by David Harvey and Torbjorn Granlund.

C TODO
C  * Try to make loop run closer to 4 c/l in Bulldozer and Piledriver.

C ---------------------------------------------------------------------------
C func  (expands to mpn_addmul_1 or mpn_submul_1, chosen by OPERATION_*)
C
C Multiplies each limb read from up by the single limb v0 and adds the
C products into (addmul) or subtracts them from (submul) the limbs at rp,
C propagating carry or borrow between limbs.  The final carry/borrow limb is
C returned in %rax.
C
C Operands (STD64 register, DOS64 entry register in the trailing comment):
C   rp       destination limb vector, read and written
C   up       source limb vector, read only
C   n_param  limb count; the first limb of up is loaded before n is
C            examined, so the code assumes n >= 1
C   v0       multiplier limb
C
C Scratch: %rax and %rdx (mul results), %r8, %r9, %r10, %rbx (rotating
C accumulators), n (loop index).  %rbx is saved and restored here; on DOS64
C %rsi and %rdi are saved and restored as well.
C
C NOTE: accumulators are cleared with mov $0 rather than xor.  Several of
C these clears sit between an ADDSUB and the adc that consumes its carry;
C mov leaves the flags alone whereas xor would destroy the carry.
C ---------------------------------------------------------------------------

define(`rp',      `%rdi')   C rcx
define(`up',      `%rsi')   C rdx
define(`n_param', `%rdx')   C r8
define(`v0',      `%rcx')   C r9

define(`n',       `%r11')

C Select the memory-update instruction and the exported symbol name.
ifdef(`OPERATION_addmul_1',`
      define(`ADDSUB',        `add')
      define(`func',  `mpn_addmul_1')
')
ifdef(`OPERATION_submul_1',`
      define(`ADDSUB',        `sub')
      define(`func',  `mpn_submul_1')
')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)

C DOS64 operand remapping.  rp, v0 and n are used directly in their entry
C registers (rcx, r9, r8); up is copied into rsi by the prologue below.
C The r9 -> rdi and r8 -> r11 renames presumably retarget the scratch uses
C of %r9 and %r8 in the shared code, since those registers hold v0 and n
C under this ABI -- NOTE(review): relies on m4 rescanning, confirm against
C the generated .s file.
IFDOS(` define(`up', ``%rsi'')  ') dnl
IFDOS(` define(`rp', ``%rcx'')  ') dnl
IFDOS(` define(`v0', ``%r9'')   ') dnl
IFDOS(` define(`r9', ``rdi'')   ') dnl
IFDOS(` define(`n',  ``%r8'')   ') dnl
IFDOS(` define(`r8', ``r11'')   ') dnl

ASM_START()
        TEXT
        ALIGN(16)
PROLOGUE(func)
C DOS64 only: preserve rsi and rdi, then move up from rdx into rsi.
IFDOS(``push    %rsi            '')
IFDOS(``push    %rdi            '')
IFDOS(``mov     %rdx, %rsi      '')

        mov     (up), %rax              C read first u limb early
        push    %rbx
IFSTD(` mov     n_param, %rbx   ')      C move away n from rdx, mul uses it
IFDOS(` mov     n, %rbx         ')
        mul     v0                      C rdx:rax = up[0] * v0

IFSTD(` mov     %rbx, n         ')

C Dispatch on n mod 4 into one of four entry points of the 4-way unrolled
C loop.  The lea does not touch the flags, so jz still tests the and result.
        and     $3, R32(%rbx)
        lea     -16(rp,n,8), rp         C rp = rp + 8*n - 16
        jz      L(b0)
        cmp     $2, R32(%rbx)
        jb      L(b1)
        jz      L(b2)

C n mod 4 == 3: first product goes to rbx (low) and r10 (high).
C n becomes -n-1, so 24(rp,n,8) at L3 addresses the first destination limb.
L(b3):  mov     $0, R32(%r8)
        mov     %rax, %rbx
        mov     $0, R32(%r9)
        mov     8(up), %rax
        mov     %rdx, %r10
        lea     (up,n,8), up            C up = up + 8*n
        not     n
        jmp     L(L3)

C n mod 4 == 0: first product goes to r8 (low) and rbx (high).
C n becomes -n, so 16(rp,n,8) at L0 addresses the first destination limb.
L(b0):  mov     $0, R32(%r10)
        mov     %rax, %r8
        mov     %rdx, %rbx
        mov     8(up), %rax
        lea     (up,n,8), up
        neg     n
        jmp     L(L0)

C n mod 4 == 1: a single limb is finished directly at n1; otherwise the
C first product goes to r9 (low) and r8 (high) and n becomes -n+1, so
C 8(rp,n,8) at L1 addresses the first destination limb.
L(b1):  cmp     $1, n
        jz      L(n1)
        mov     %rax, %r9
        mov     8(up), %rax
        mov     %rdx, %r8
        mov     $0, R32(%rbx)
        lea     (up,n,8), up
        neg     n
        inc     n
        jmp     L(L1)

C n mod 4 == 2: first product goes to r10 (low) and r9 (high) and n becomes
C -n+2.  A non-negative result means n was exactly 2, so skip the loop.
L(b2):  mov     $0, R32(%rbx)
        mov     %rax, %r10
        mov     %rdx, %r9
        mov     8(up), %rax
        mov     $0, R32(%r8)
        lea     (up,n,8), up
        neg     n
        add     $2, n
        jns     L(end)

C Main loop, four limbs per iteration.  n is a negative index that counts
C up towards zero; the accumulators rotate through r10, r9, r8 and rbx.
C Instruction order is significant: each adc consumes the carry left by the
C preceding ADDSUB or adc.
        ALIGN(32)
L(top): mul     v0
        ADDSUB  %r10, (rp,n,8)
        adc     %rax, %r9
        mov     (up,n,8), %rax
        adc     %rdx, %r8
L(L1):  mul     v0
        mov     $0, R32(%r10)
        ADDSUB  %r9, 8(rp,n,8)
        adc     %rax, %r8
        adc     %rdx, %rbx
        mov     8(up,n,8), %rax
L(L0):  mul     v0
        ADDSUB  %r8, 16(rp,n,8)
        mov     $0, R32(%r8)
        adc     %rax, %rbx
        mov     $0, R32(%r9)
        mov     16(up,n,8), %rax
        adc     %rdx, %r10
L(L3):  mul     v0
        ADDSUB  %rbx, 24(rp,n,8)
        mov     $0, R32(%rbx)
        adc     %rax, %r10
        adc     %rdx, %r9
        mov     24(up,n,8), %rax
        add     $4, n
        js      L(top)

C Wind-down: the last product is combined with the pending accumulators,
C the final two destination limbs are updated, and the remaining high word
C plus carry is returned in rax.
L(end): mul     v0
        ADDSUB  %r10, (rp)
        adc     %r9, %rax
        adc     %r8, %rdx
L(n1):  ADDSUB  %rax, 8(rp)
        adc     $0, %rdx
        mov     %rdx, %rax

        pop     %rbx
IFDOS(``pop     %rdi            '')
IFDOS(``pop     %rsi            '')
        ret
EPILOGUE()
ASM_END()