dnl  AMD64 mpn_addmul_1 and mpn_submul_1 optimised for Intel Haswell.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  Copyright 2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C AMD K8,K9	-
C AMD K10	-
C AMD bull	-
C AMD pile	-
C AMD steam	-
C AMD excavator	-
C AMD bobcat	-
C AMD jaguar	-
C Intel P4	-
C Intel core2	-
C Intel NHM	-
C Intel SBR	-
C Intel IBR	-
C Intel HWL	 2.32
C Intel BWL	 2.04
C Intel SKL	 1.95
C Intel atom	-
C Intel SLM	-
C VIA nano	-

C The loop of this code is the result of running a code generation and
C optimisation tool suite written by David Harvey and Torbjörn Granlund.

C TODO
C  * Handle small n separately, for lower overhead.

C INTERFACE (as visible from the register assignments below)
C   rp        pointer to the limbs that are updated in place
C   up        pointer to the source limbs (only read)
C   n_param   limb count.  It must be at least 1: with n = 0 the code takes
C             the b00 path and decrements a zero loop counter.
C   v0_param  multiplier limb
C   return    %rax, the limb carried (addmul_1) or borrowed (submul_1) out
C             of the most significant position
C
C Every source limb is multiplied by v0 with mulx, and the double-limb product
C is added to (OPERATION_addmul_1) or subtracted from (OPERATION_submul_1) the
C limbs at rp.  Throughout the code the first mulx destination is consumed as
C the low product limb and the second as the high product limb.
C
C ENTRY DISPATCH
C   The main loop handles four limbs per iteration and the end block always
C   handles two.  The low two bits of n pick the feed-in sequence:
C     n mod 4 = 0   b00, joins the loop at lo0 (two limbs stored on the way)
C     n mod 4 = 1   b01, n = 1 is finished at n1, else gt1 joins at lo1
C                   (three limbs stored on the way)
C     n mod 4 = 2   b10, falls into top, or jumps straight to end when n = 2
C     n mod 4 = 3   b11, joins at lo3 (one limb stored on the way)
C   After shr $2 the register n holds the loop counter.
C
C CARRY HANDLING
C   mulx, mov, lea and dec leave CF alone, so CF survives from the last
C   ADCSBB of one group to the first adc of the next group.  That adc folds
C   the carry or borrow of the rp chain into the product limbs, which is why
C   plain adc is correct for both the add and the sub variant.

C The register named in each trailing comment is presumably the DOS64
C incoming argument register that FUNC_ENTRY remaps -- TODO confirm against
C the FUNC_ENTRY definition.
define(`rp',      `%rdi')   C rcx
define(`up',      `%rsi')   C rdx
define(`n_param', `%rdx')   C r8
define(`v0_param',`%rcx')   C r9

C n is moved out of %rdx because v0 has to live there: mulx takes its
C implicit multiplicand from %rdx.
define(`n',       `%rbp')
define(`v0',      `%rdx')

C Select the add or the subtract flavour of the two accumulate instructions
C and the exported function name.
ifdef(`OPERATION_addmul_1',`
  define(`ADDSUB',	`add')
  define(`ADCSBB',	`adc')
  define(`func',	`mpn_addmul_1')
')
ifdef(`OPERATION_submul_1',`
  define(`ADDSUB',	`sub')
  define(`ADCSBB',	`sbb')
  define(`func',	`mpn_submul_1')
')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(func)
	FUNC_ENTRY(4)
C Save the callee-saved registers used as scratch; restored at ret below.
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13

	mov	n_param, n
	mov	v0_param, v0

C Bit 0 of n is tested here; the following shr $2 leaves bit 1 of n in CF and
C sets ZF when no full loop iteration remains.
	test	$1, R8(n)
	jnz	L(bx1)

L(bx0):	shr	$2, n
	jc	L(b10)

C n mod 4 = 0.  Products of limbs 0 and 1 are combined here, limb 2 is
C started, and the pointers are biased so that lo0 stores to the first limbs.
L(b00):	mulx(	(up), %r13, %r12)
	mulx(	8,(up), %rbx, %rax)
	add	%r12, %rbx
	adc	$0, %rax
	mov	(rp), %r12
	mov	8(rp), %rcx
	mulx(	16,(up), %r9, %r8)
	lea	-16(rp), rp
	lea	16(up), up
	ADDSUB	%r13, %r12
	jmp	L(lo0)

L(bx1):	shr	$2, n
	jc	L(b11)

C n mod 4 = 1.  ZF from the shr is still valid after mulx.
L(b01):	mulx(	(up), %r11, %r10)
	jnz	L(gt1)
C n = 1.  mov is used to zero %rax so that CF from ADDSUB reaches the adc.
L(n1):	ADDSUB	%r11, (rp)
	mov	$0, R32(%rax)
	adc	%r10, %rax
	jmp	L(ret)

C n = 5, 9, ...  Products of limbs 0, 1 and 2 are chained here before lo1.
L(gt1):	mulx(	8,(up), %r13, %r12)
	mulx(	16,(up), %rbx, %rax)
	lea	24(up), up
	add	%r10, %r13
	adc	%r12, %rbx
	adc	$0, %rax
	mov	(rp), %r10
	mov	8(rp), %r12
	mov	16(rp), %rcx
	lea	-8(rp), rp
	ADDSUB	%r11, %r10
	jmp	L(lo1)

C n mod 4 = 3.  lo3 still runs the dec n of the loop tail, so the counter is
C bumped by one to compensate.
L(b11):	mulx(	(up), %rbx, %rax)
	mov	(rp), %rcx
	mulx(	8,(up), %r9, %r8)
	lea	8(up), up
	lea	-24(rp), rp
	inc	n			C adjust n
	ADDSUB	%rbx, %rcx
	jmp	L(lo3)

C n mod 4 = 2.  rp is biased by -32 because top advances it by 32 first and
C end addresses 32(rp) and 40(rp).  %rax = 0 and CF = 0 give a zero carry-in.
L(b10):	mulx(	(up), %r9, %r8)
	mulx(	8,(up), %r11, %r10)
	lea	-32(rp), rp
	mov	$0, R32(%rax)
	clc				C clear cf
	jz	L(end)			C depends on old shift

C Main loop, four limbs per iteration.  On entry at top, %r9:%r8 and
C %r11:%r10 hold the two pending products, %rax the pending high limb and CF
C the carry or borrow from the previous rp chain.
	ALIGN(16)
L(top):	adc	%rax, %r9
	lea	32(rp), rp
	adc	%r8, %r11
	mulx(	16,(up), %r13, %r12)
	mov	(rp), %r8
	mulx(	24,(up), %rbx, %rax)
	lea	32(up), up
	adc	%r10, %r13
	adc	%r12, %rbx
	adc	$0, %rax
	mov	8(rp), %r10
	mov	16(rp), %r12
	ADDSUB	%r9, %r8
	mov	24(rp), %rcx
	mov	%r8, (rp)
	ADCSBB	%r11, %r10
L(lo1):	mulx(	(up), %r9, %r8)
	mov	%r10, 8(rp)
	ADCSBB	%r13, %r12
L(lo0):	mov	%r12, 16(rp)
	ADCSBB	%rbx, %rcx
L(lo3):	mulx(	8,(up), %r11, %r10)
	mov	%rcx, 24(rp)
	dec	n
	jnz	L(top)

C Wind down: finish the last two limbs and form the return limb in %rax from
C the final high product limb plus the two outstanding carries.
L(end):	adc	%rax, %r9
	adc	%r8, %r11
	mov	32(rp), %r8
	mov	%r10, %rax
	adc	$0, %rax
	mov	40(rp), %r10
	ADDSUB	%r9, %r8
	mov	%r8, 32(rp)
	ADCSBB	%r11, %r10
	mov	%r10, 40(rp)
	adc	$0, %rax

L(ret):	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	FUNC_EXIT()
	ret
EPILOGUE()