dnl  AMD64 mpn_mul_1 using mulx optimised for Intel Haswell.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  Copyright 2012, 2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C		    cycles/limb
C AMD K8,K9		-
C AMD K10		-
C AMD bull		-
C AMD pile		-
C AMD steam		-
C AMD excavator		-
C AMD bobcat		-
C AMD jaguar		-
C Intel P4		-
C Intel core2		-
C Intel NHM		-
C Intel SBR		-
C Intel IBR		-
C Intel HWL		1.59
C Intel BWL		1.76
C Intel SKL		1.54
C Intel atom		-
C Intel SLM		-
C VIA nano		-

C The loop of this code is the result of running a code generation and
C optimisation tool suite written by David Harvey and Torbjorn Granlund.

C mpn_mul_1 -- multiply the n_param-limb number at up by the single limb
C v0_param, store the n_param low limbs of the product at rp, and leave the
C most significant limb of the product in rax.
C
C n_param must be at least 1: the b00 path below enters the loop without
C testing the iteration count.
C
C Incoming argument registers.  The register named in each trailing comment
C is presumably where the same argument arrives under the DOS64 ABI before
C FUNC_ENTRY has run -- confirm against config.m4.
define(`rp',      `%rdi')   C rcx
define(`up',      `%rsi')   C rdx
define(`n_param', `%rdx')   C r8
define(`v0_param',`%rcx')   C r9

C Working registers.  n counts remaining four-limb loop iterations.  v0 is
C the multiplier and lives in rdx, the same register as n_param, so
C n_param is dead as soon as v0 has been loaded.
define(`n',       `%rbp')
define(`v0',      `%rdx')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

C Conventions used in the code below:
C
C  * mulx(src, lo, hi) is a macro from config.m4.  Assuming it expands to
C    the BMI2 MULX instruction, it multiplies src by rdx (v0) without
C    touching the flags.  The way the results are consumed shows that the
C    second operand receives the low limb and the third the high limb.
C
C  * Products rotate through four (lo, hi) register pairs:
C    (r9, r8), (r11, r10), (rcx, r12), (rbx, rax).
C
C  * Each stored limb is the low half of one product plus the high half of
C    the previous one.  The carry of that addition stays in CF across the
C    whole loop, which is why only lea, mov, mulx and dec appear between
C    the adc instructions.

ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_mul_1)
	FUNC_ENTRY(4)
	C rbx, rbp and r12 are callee-saved and all three are used as scratch.
	push	%rbx
	push	%rbp
	push	%r12

	mov	n_param, n
	shr	$2, n			C n = n_param / 4

	C Dispatch on n_param mod 4.  R8(n_param) is presumably the 8-bit
	C view of n_param -- confirm against config.m4.
	test	$1, R8(n_param)
	jnz	L(bx1)

	C n_param even.  The test must precede the mov because v0 overwrites
	C n_param; mov leaves the flags alone, so jnz still sees the test.
L(bx0):	test	$2, R8(n_param)
	mov	v0_param, v0
	jnz	L(b10)

	C n_param mod 4 = 0: three products are started here and the fourth
	C at lo0.  rp is biased by -32 so that the lea at top or end restores
	C it before the first store.  CF is clear from the test above.
L(b00):	mulx(	(up), %r9, %r8)
	mulx(	8,(up), %r11, %r10)
	mulx(	16,(up), %rcx, %r12)
	lea	-32(rp), rp
	jmp	L(lo0)

	C n_param mod 4 = 2: rp is biased so that the 16(rp) store at lo2 or
	C cj2 lands on the first result limb.
L(b10):	mulx(	(up), %rcx, %r12)
	mulx(	8,(up), %rbx, %rax)
	lea	-16(rp), rp
	test	n, n			C also clears CF for the adc chain
	jz	L(cj2)			C exactly two limbs: skip the loop
	mulx(	16,(up), %r9, %r8)
	lea	16(up), up
	jmp	L(lo2)

	C n_param odd.  Same ordering constraint on test and mov as at bx0.
L(bx1):	test	$2, R8(n_param)
	mov	v0_param, v0
	jnz	L(b11)

	C n_param mod 4 = 1: rp is biased so that the 24(rp) store lands on
	C the first result limb.
L(b01):	mulx(	(up), %rbx, %rax)
	lea	-24(rp), rp
	test	n, n			C also clears CF for the adc chain
	jz	L(cj1)			C exactly one limb: skip the loop
	mulx(	8,(up), %r9, %r8)
	lea	8(up), up
	jmp	L(lo1)

	C n_param mod 4 = 3: rp is biased so that the 8(rp) store at lo3 or
	C cj3 lands on the first result limb.
L(b11):	mulx(	(up), %r11, %r10)
	mulx(	8,(up), %rcx, %r12)
	mulx(	16,(up), %rbx, %rax)
	lea	-8(rp), rp
	test	n, n			C also clears CF for the adc chain
	jz	L(cj3)			C exactly three limbs: skip the loop
	lea	24(up), up
	jmp	L(lo3)

	C Main loop, four limbs per iteration.  The lo3, lo2, lo1 and lo0
	C labels are the entry points used by the four dispatch paths above.
	C Pointer updates use lea, and the counter uses dec, so that CF
	C survives from one adc to the next.
	ALIGN(32)
L(top):	lea	32(rp), rp
	mov	%r9, (rp)
	adc	%r8, %r11
L(lo3):	mulx(	(up), %r9, %r8)
	mov	%r11, 8(rp)
	adc	%r10, %rcx
L(lo2):	mov	%rcx, 16(rp)
	adc	%r12, %rbx
L(lo1):	mulx(	8,(up), %r11, %r10)
	adc	%rax, %r9
	mulx(	16,(up), %rcx, %r12)
	mov	%rbx, 24(rp)
L(lo0):	mulx(	24,(up), %rbx, %rax)
	lea	32(up), up
	dec	n
	jnz	L(top)

	C Wind-down: store the limbs still held in registers.  The cj3, cj2
	C and cj1 labels are entered directly when n_param is 3, 2 or 1.
L(end):	lea	32(rp), rp
	mov	%r9, (rp)
	adc	%r8, %r11
L(cj3):	mov	%r11, 8(rp)
	adc	%r10, %rcx
L(cj2):	mov	%rcx, 16(rp)
	adc	%r12, %rbx
L(cj1):	mov	%rbx, 24(rp)
	adc	$0, %rax		C rax = top limb: last high half plus carry

	pop	%r12
	pop	%rbp
	pop	%rbx
	FUNC_EXIT()
	ret
EPILOGUE()
ASM_END()