dnl  AMD64 mpn_mul_2 optimised for Intel Haswell.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  Copyright 2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C		cycles/limb
C AMD K8,K9	 -
C AMD K10	 -
C AMD bull	 -
C AMD pile	 -
C AMD steam	 -
C AMD excavator	 -
C AMD bobcat	 -
C AMD jaguar	 -
C Intel P4	 -
C Intel core	 -
C Intel NHM	 -
C Intel SBR	 -
C Intel IBR	 -
C Intel HWL	 3.74
C Intel BWL	 4.21
C Intel SKL	 4.20
C Intel atom	 -
C Intel SLM	 -
C VIA nano	 -

C The loop of this code is the result of running a code generation and
C optimisation tool suite written by David Harvey and Torbjörn Granlund.

C TODO
C  * Move test and jcc together, for insn fusion.

C Register roles.  The first four names are the incoming arguments, the
C remaining ones are working registers.
C
C   rp       destination pointer; result limbs (64-bit words) are stored here
C   up       source pointer; read one limb at a time
C   n_param  number of limbs at up
C   vp       pointer to the two-limb multiplier, loaded into v0 and v1
C   w0..w3   rotating partial-sum / carry registers of the pipelined loop
C   n        loop trip count, (n_param + 3) / 4
C
C Aliasing to keep in mind when editing:
C   * w1 shares %rcx with vp, so both limbs at vp must be loaded before w1 is
C     first written.
C   * n_param lives in %rdx, which is also loaded with each limb of up ahead
C     of the mulx( ) pairs, so n is derived and the low bits of n_param are
C     tested before %rdx is overwritten.
define(`rp', `%rdi')
define(`up', `%rsi')
define(`n_param',`%rdx')
define(`vp', `%rcx')

define(`v0', `%r8')
define(`v1', `%r9')
define(`w0', `%rbx')
define(`w1', `%rcx')
define(`w2', `%rbp')
define(`w3', `%r10')
define(`n', `%r11')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

C mpn_mul_2 -- multiply the n_param-limb operand at up by the two-limb
C operand at vp.
C
C As written, the routine stores n_param + 1 result limbs starting at rp and
C leaves the most significant limb in %rax at the ret.
C
C NOTE(review): n_param = 0 is not handled: n would start at 0 and the dec at
C the loop bottom would wrap.  Presumably callers guarantee n_param >= 1.
C
C mulx( ), R8( ), L( ), FUNC_ENTRY and FUNC_EXIT are macros supplied by the
C included m4 configuration.  mulx( ) presumably emits the BMI2 MULX
C instruction (implicit multiplicand in %rdx, flags untouched); the code
C below relies on that, since carries are kept live across mulx( ) calls.
ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_mul_2)
	FUNC_ENTRY(4)
	push	%rbx			C w0 lives in %rbx
	push	%rbp			C w2 lives in %rbp

	mov	(vp), v0		C low multiplier limb
	mov	8(vp), v1		C high multiplier limb; vp (%rcx) is dead after this

	lea	3(n_param), n
	shr	$2, n			C n = (n_param + 3) / 4 loop passes

C Dispatch on n_param mod 4.  The loop body handles four limbs per pass and
C has four entry points, lo0..lo3.  Each path below forms the first product
C with v0, clears the register that the entry point will accumulate into,
C and biases rp and up so that the first store at the chosen entry point
C hits the original rp and later loads continue from the original up + 8.
	test	$1, R8(n_param)
	jnz	L(bx1)

L(bx0):	xor	w0, w0			C w0 is accumulated into after lo0
	test	$2, R8(n_param)		C decides the jz below; also leaves CF clear
	mov	(up), %rdx		C n_param is dead from here on
	mulx( v0, w2, w1)
	jz	L(lo0)			C n_param mod 4 == 0: no pointer bias needed

L(b10):	lea	-16(rp), rp		C n_param mod 4 == 2: lo2 stores to 16(rp)
	lea	-16(up), up
	jmp	L(lo2)

L(bx1):	xor	w2, w2			C w2 is accumulated into after lo1 and lo3
	test	$2, R8(n_param)		C decides the jnz below; also leaves CF clear
	mov	(up), %rdx		C n_param is dead from here on
	mulx( v0, w0, w3)
	jnz	L(b11)

L(b01):	lea	-24(rp), rp		C n_param mod 4 == 1: lo1 stores to 24(rp)
	lea	8(up), up		C lo1 sits after the in-loop up += 32, hence +8
	jmp	L(lo1)

L(b11):	lea	-8(rp), rp		C n_param mod 4 == 3: lo3 stores to 8(rp)
	lea	-8(up), up
	jmp	L(lo3)

C Main loop, four limbs per pass.  For each limb of up, the product with v1
C is formed while the limb is still in %rdx, then the next limb is loaded and
C multiplied by v0; both low halves are added into the same result limb and
C the carries are folded into the high halves.  The existing trailing
C annotations (C 0 .. C 4) appear to name the result limb, relative to rp at
C the top of the pass, that each instruction contributes to.
C
C The order of add/adc relative to each other must be preserved: every
C adc $0 consumes the carry of the add immediately preceding it in program
C order among flag-writing instructions.
	ALIGN(16)
L(top):	mulx( v1, %rax, w0)
	add	%rax, w2		C 0
	mov	(up), %rdx
	mulx( v0, %rax, w1)
	adc	$0, w0			C 1
	add	%rax, w2		C 0
	adc	$0, w1			C 1
	add	w3, w2			C 0
L(lo0):	mov	w2, (rp)		C 0
	adc	$0, w1			C 1
	mulx( v1, %rax, w2)
	add	%rax, w0		C 1
	mov	8(up), %rdx
	adc	$0, w2			C 2
	mulx( v0, %rax, w3)
	add	%rax, w0		C 1
	adc	$0, w3			C 2
	add	w1, w0			C 1
L(lo3):	mov	w0, 8(rp)		C 1
	adc	$0, w3			C 2
	mulx( v1, %rax, w0)
	add	%rax, w2		C 2
	mov	16(up), %rdx
	mulx( v0, %rax, w1)
	adc	$0, w0			C 3
	add	%rax, w2		C 2
	adc	$0, w1			C 3
	add	w3, w2			C 2
L(lo2):	mov	w2, 16(rp)		C 2
	adc	$0, w1			C 3
	mulx( v1, %rax, w2)
	add	%rax, w0		C 3
	mov	24(up), %rdx
	adc	$0, w2			C 4
	mulx( v0, %rax, w3)
	add	%rax, w0		C 3
	adc	$0, w3			C 4
	add	w1, w0			C 3
	lea	32(up), up
L(lo1):	mov	w0, 24(rp)		C 3
	adc	$0, w3			C 4
	dec	n
	lea	32(rp), rp
	jnz	L(top)

C Wind-down.  %rdx still holds the last limb read from up; only its product
C with v1 is outstanding.  Judging by the uses below, the low half lands in
C %rdx and the high half in %rax.  The low half plus the pending sums in w2
C and w3 give the last stored limb; the carries end up in %rax.
L(end):	mulx( v1, %rdx, %rax)
	add	%rdx, w2
	adc	$0, %rax
	add	w3, w2
	mov	w2, (rp)		C rp was already advanced past the loop stores
	adc	$0, %rax		C %rax = most significant limb at ret

	pop	%rbp			C restore in reverse order of the pushes
	pop	%rbx
	FUNC_EXIT()
	ret
EPILOGUE()
