dnl  AMD64 mpn_mul_1 optimised for Intel Broadwell.

dnl  Copyright 2015 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C                  cycles/limb
C AMD K8,K9         -
C AMD K10           -
C AMD bull          -
C AMD pile          -
C AMD steam         -
C AMD excavator     -
C AMD bobcat        -
C AMD jaguar        -
C Intel P4          -
C Intel core2       -
C Intel NHM         -
C Intel SBR         -
C Intel IBR         -
C Intel HWL         1.70
C Intel BWL         1.51
C Intel SKL         1.52
C Intel atom        -
C Intel SLM         -
C VIA nano          -

C The loop of this code is the result of running a code generation and
C optimisation tool suite written by David Harvey and Torbjorn Granlund.

C TODO
C  * Put an initial mulx before switching, targeting some free registers.
C  * Tune feed-in code.
C  * Trim nop execution after L(f2).
C  * Port to DOS64, not forgetting nop execution.

C mpn_mul_1 -- multiply the limb vector at up (n_param limbs) by the single
C limb v0_param, store the n_param low result limbs at rp, and return the
C most significant limb in %rax.
C
C Presumed C prototype (not visible in this file; confirm against gmp.h):
C   mp_limb_t mpn_mul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t v0)
C
C ABI: STD64 only (see ABI_SUPPORT below); the DOS64 port is commented out.
C n_param must be at least 1: a zero count dispatches to L(f0), which always
C runs a full 8-limb pass.
C
C Register roles after the prologue:
C   %rdx       v0, the multiplicand shared by every mulx( line.  This assumes
C              the mulx( macro, defined elsewhere, emits the BMI2 mulx
C              instruction -- TODO confirm.
C   n (%rcx)   n_param / 8, the number of loop passes still to run
C   %r9:%rax   low:high limbs of one product
C   %r10:%r8   low:high limbs of the neighbouring product
C   CF         carry chained from one adc to the next
C Only rax, rcx, rdx, rsi, rdi, r8, r9, r10 and the flags are written, and
C the stack is not touched.

C STD64 argument registers.  NOTE(review): the trailing register names look
C like the DOS64 equivalents kept for the unfinished port -- confirm.
define(`rp',      `%rdi')   C rcx
define(`up',      `%rsi')   C rdx
define(`n_param', `%rdx')   C r8
define(`v0_param',`%rcx')   C r9

C n shares %rcx with v0_param, so v0 is copied out before n is written.
define(`n',       `%rcx')

dnl ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

dnl IFDOS(` define(`up', ``%rsi'') ') dnl
dnl IFDOS(` define(`rp', ``%rcx'') ') dnl
dnl IFDOS(` define(`vl', ``%r9'') ') dnl
dnl IFDOS(` define(`r9', ``rdi'') ') dnl
dnl IFDOS(` define(`n', ``%r8'') ') dnl
dnl IFDOS(` define(`r8', ``r11'') ') dnl

ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_mul_1)

C Split the count: n = n_param / 8 loop passes, r8 = n_param mod 8 selects the
C feed-in block.  The and also leaves CF = 0 for the first adc of the chain.
	mov	v0_param, %r10		C save v0; %rcx becomes n on the next line
	mov	n_param, n
	mov	R32(n_param), R32(%r8)
	shr	$3, n			C n = n_param / 8
	and	$7, R32(%r8)		C clear OF, CF as side-effect
	mov	%r10, %rdx		C v0 now lives in %rdx
	lea	L(tab)(%rip), %r10
C Indexed jump through L(tab).  PIC build: each entry is read as a 32-bit
C signed offset and added to the address of L(tab).  Non-PIC build: each
C entry is read as an 8-byte code address and jumped through directly.
ifdef(`PIC',
`	movslq	(%r10,%r8,4), %r8
	lea	(%r8, %r10), %r10
	jmp	*%r10
',`
	jmp	*(%r10,%r8,8)
')
C Dispatch table, one entry per residue n_param mod 8, in residue order.
	JUMPTABSECT
	ALIGN(8)
L(tab):	JMPENT(	L(f0), L(tab))
	JMPENT(	L(f1), L(tab))
	JMPENT(	L(f2), L(tab))
	JMPENT(	L(f3), L(tab))
	JMPENT(	L(f4), L(tab))
	JMPENT(	L(f5), L(tab))
	JMPENT(	L(f6), L(tab))
	JMPENT(	L(f7), L(tab))
	TEXT

C Feed-in blocks.  Each forms the product of the first limb, biases up and rp
C so that the fixed displacements at its loop entry label address the right
C limbs, and enters the unrolled loop.  f3..f7 pass the dec at the loop bottom
C one more time than n_param / 8, hence their inc n; inc leaves CF unchanged.

C n_param mod 8 = 0
L(f0):	mulx(	(up), %r10, %r8)
	lea	56(up), up
	lea	-8(rp), rp
	jmp	L(b0)

C n_param mod 8 = 3
L(f3):	mulx(	(up), %r9, %rax)
	lea	16(up), up
	lea	16(rp), rp
	inc	n
	jmp	L(b3)

C n_param mod 8 = 4
L(f4):	mulx(	(up), %r10, %r8)
	lea	24(up), up
	lea	24(rp), rp
	inc	n
	jmp	L(b4)

C n_param mod 8 = 5
L(f5):	mulx(	(up), %r9, %rax)
	lea	32(up), up
	lea	32(rp), rp
	inc	n
	jmp	L(b5)

C n_param mod 8 = 6
L(f6):	mulx(	(up), %r10, %r8)
	lea	40(up), up
	lea	40(rp), rp
	inc	n
	jmp	L(b6)

C n_param mod 8 = 7
L(f7):	mulx(	(up), %r9, %rax)
	lea	48(up), up
	lea	48(rp), rp
	inc	n
	jmp	L(b7)

C n_param mod 8 = 1.  With no full pass left (n = 0) this is the one-limb
C case: store the low limb and return the high limb already in %rax.
L(f1):	mulx(	(up), %r9, %rax)
	test	n, n			C also clears CF
	jnz	L(b1)
L(1):	mov	%r9, (rp)
	ret

C n_param mod 8 = 2.  Forms two products, then either goes straight to the
C wind-down (n = 0) or falls through into the loop.
L(f2):	mulx(	(up), %r10, %r8)
	lea	8(up), up
	lea	8(rp), rp
	mulx(	(up), %r9, %rax)
	test	n, n			C also clears CF
	jz	L(end)

C Main loop, 8 limbs per pass.  Each mulx( line forms one two-limb product;
C the high limb of a product is added with carry into the low limb of the
C next product, which is then stored.  The mov, lea, mulx, dec and jnz
C instructions between the adc steps do not modify CF, so the carry chain
C survives across the whole pass and from one pass to the next.  Do not
C reorder or insert flag-writing instructions here.
	ALIGN(32)
L(top):	mov	%r10, -8(rp)
	adc	%r8, %r9
L(b1):	mulx(	8,(up), %r10, %r8)
	adc	%rax, %r10
	lea	64(up), up
	mov	%r9, (rp)
L(b0):	mov	%r10, 8(rp)
	mulx(	-48,(up), %r9, %rax)
	lea	64(rp), rp
	adc	%r8, %r9
L(b7):	mulx(	-40,(up), %r10, %r8)
	mov	%r9, -48(rp)
	adc	%rax, %r10
L(b6):	mov	%r10, -40(rp)
	mulx(	-32,(up), %r9, %rax)
	adc	%r8, %r9
L(b5):	mulx(	-24,(up), %r10, %r8)
	mov	%r9, -32(rp)
	adc	%rax, %r10
L(b4):	mulx(	-16,(up), %r9, %rax)
	mov	%r10, -24(rp)
	adc	%r8, %r9
L(b3):	mulx(	-8,(up), %r10, %r8)
	adc	%rax, %r10
	mov	%r9, -16(rp)
	dec	n			C sets ZF for the jnz below, keeps CF
	mulx(	(up), %r9, %rax)
	jnz	L(top)

C Wind-down: store the last two low limbs and fold the final carry into the
C returned high limb.  n (%rcx) is zero on every path that reaches this label,
C so the last adc adds CF only.
L(end):	mov	%r10, -8(rp)
	adc	%r8, %r9
	mov	%r9, (rp)
	adc	%rcx, %rax
	ret
EPILOGUE()
ASM_END()