dnl  AMD64 mpn_mul_1 optimised for AMD Bulldozer.

dnl  Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C AMD K8,K9	 3.65
C AMD K10	 3.30    3.68
C AMD bull	 4.04    4.29
C AMD pile	 4.33
C AMD steam
C AMD excavator
C AMD bobcat	 5.73
C AMD jaguar	 5.87
C Intel P4	12.5
C Intel core2	 4.38
C Intel NHM	 4.28
C Intel SBR	 2.69
C Intel IBR	 2.55
C Intel HWL	 2.41
C Intel BWL	 2.49
C Intel SKL	 2.50
C Intel atom	20.3
C Intel SLM	 7.8
C VIA nano	 4.25

C The loop of this code is the result of running a code generation and
C optimisation tool suite written by David Harvey and Torbjorn Granlund.

C TODO
C  * Move loop code into feed-in blocks, to save insn for zeroing regs.

C INTERFACE
C  This file provides two entry points that share one loop:
C    mpn_mul_1   multiplies the limbs read through up by the single limb v0
C                and stores the low product limbs through rp.
C    mpn_mul_1c  does the same, but first adds a carry-in limb to the product
C                of the least significant limb.  The carry-in is read from r8
C                in the STD64 build and from 64(rsp) in the DOS64 build.
C  Both return in rax the most significant limb of the result, which is the
C  high half of the last product plus the propagated carry.
C
C  The code reads the first source limb unconditionally and never tests for
C  a zero count, so it presumably requires n >= 1 -- confirm against callers.
C
C REGISTER ROLES
C  The trailing comment on each define is the register the argument arrives
C  in for the DOS64 build.

define(`rp',      `%rdi')   C rcx
define(`up',      `%rsi')   C rdx
define(`n_param', `%rdx')   C r8
define(`v0',      `%rcx')   C r9

define(`n',       `%rbx')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

C DOS64 register renaming.  In that build rp stays in rcx, up is copied from
C rdx to rsi by the prologue, and n and v0 stay in their incoming registers
C r8 and r9.  The loop body also names r8 and r9 as scratch, so the last
C defines below redirect those two names to rbx and rdi.  The doubled quotes
C keep the expansions of n and v0 from being renamed in turn.
C NOTE(review): this depends on m4 quoting and rescanning rules; verify by
C inspecting the preprocessed output before touching these defines.

IFDOS(`	define(`up', ``%rsi'')	') dnl
IFDOS(`	define(`rp', ``%rcx'')	') dnl
IFDOS(`	define(`v0', ``%r9'')	') dnl
IFDOS(`	define(`r9', ``rdi'')	') dnl
IFDOS(`	define(`n',  ``%r8'')	') dnl
IFDOS(`	define(`r8', ``rbx'')	') dnl

ASM_START()
	TEXT
	ALIGN(16)
C mpn_mul_1c: entry point with carry-in.  Forms the first product, adds the
C carry-in limb to it, then continues at L(common) inside mpn_mul_1.
PROLOGUE(mpn_mul_1c)
C DOS64 only: rsi and rdi are callee-saved there, and up is moved to rsi.
IFDOS(``push	%rsi		'')
IFDOS(``push	%rdi		'')
IFDOS(``mov	%rdx, %rsi	'')

	mov	(up), %rax		C read first u limb early
	push	%rbx
IFSTD(`	mov	n_param, %r11	')	C move away n from rdx, mul uses it
IFDOS(`	mov	n, %r11		')
	mul	v0

IFSTD(`	add	%r8, %rax	')
IFDOS(`	add	64(%rsp), %rax	')	C 40 + 3*8  (3 push insns)
	adc	$0, %rdx
	jmp	L(common)

EPILOGUE()

	ALIGN(16)
C mpn_mul_1: entry point without carry-in.  Falls through into L(common)
C with the first product in rdx:rax and a copy of the count in r11.
PROLOGUE(mpn_mul_1)
IFDOS(``push	%rsi		'')
IFDOS(``push	%rdi		'')
IFDOS(``mov	%rdx, %rsi	'')

	mov	(up), %rax		C read first u limb early
	push	%rbx
IFSTD(`	mov	n_param, %r11	')	C move away n from rdx, mul uses it
IFDOS(`	mov	n, %r11		')
	mul	v0

C Shared by both entry points.  Dispatch on n mod 4 to the feed-in block that
C enters the four-way unrolled loop at the matching position.  The lea sits
C between the and and the jz because lea leaves the flags untouched.  It
C biases rp so that (rp) and 8(rp) address the last two destination limbs.
L(common):
IFSTD(`	mov	%r11, n		')

	and	$3, R32(%r11)
	lea	-16(rp,n,8), rp
	jz	L(b0)
	cmp	$2, R32(%r11)
	jb	L(b1)
	jz	L(b2)

C Feed-in blocks.  Each one files the first product in the register pair the
C loop expects at its entry label, points up just past the source limbs and
C turns n into a negative index that counts up towards zero.

C n mod 4 = 3.  The second product is started here; not n gives -n-1.
L(b3):	mov	%rax, %r10
	mov	%rdx, %r11
	mov	8(up), %rax
	mul	v0
	lea	(up,n,8), up
	not	n
	jmp	L(L3)

C n mod 4 = 0.
L(b0):	mov	%rax, %r9
	mov	%rdx, %r10
	mov	8(up), %rax
	lea	(up,n,8), up
	neg	n
	jmp	L(L0)

C n mod 4 = 1.  For n = 1 only the final store and the return value remain.
C Otherwise the lowest result limb is stored here before entering the loop.
L(b1):	mov	%rax, %r8
	cmp	$1, n
	jz	L(n1)
	mov	%rdx, %r9
	lea	(up,n,8), up
	neg	n
	mov	%r8, 16(rp,n,8)
	inc	n
	jmp	L(L1)

C n mod 4 = 2.  Falls into the loop at L(top), or skips it when n = 2.
L(b2):	mov	%rax, %r11
	mov	%rdx, %r8
	mov	8(up), %rax
	lea	(up,n,8), up
	neg	n
	add	$2, n
	jns	L(end)

C Main loop: four limbs per pass.  The partial results rotate through r8,
C r9, r10 and r11; every high half picks up its carry with adc.  The pass
C ends by advancing n by four, and js repeats while n is still negative.
	ALIGN(16)
L(top):	mul	v0
	mov	%rdx, %r9
	add	%rax, %r8
	adc	$0, %r9
	mov	%r8, 8(rp,n,8)
	mov	%r11, (rp,n,8)
L(L1):	mov	(up,n,8), %rax
	mul	v0
	add	%rax, %r9
	mov	%rdx, %r10
	mov	8(up,n,8), %rax
	adc	$0, %r10
L(L0):	mul	v0
	add	%rax, %r10
	mov	%rdx, %r11
	mov	16(up,n,8), %rax
	adc	$0, %r11
	mul	v0
	mov	%r9, 16(rp,n,8)
L(L3):	add	%rax, %r11
	mov	%r10, 24(rp,n,8)
	mov	%rdx, %r8
	adc	$0, %r8
	add	$4, n
	mov	-8(up,n,8), %rax
	js	L(top)

C Wind-down: form the last product, store the final two result limbs and
C return the remaining high limb in rax.
L(end):	mul	v0
	add	%rax, %r8
	adc	$0, %rdx
	mov	%r11, (rp)
L(n1):	mov	%r8, 8(rp)
	mov	%rdx, %rax

	pop	%rbx
IFDOS(``pop	%rdi		'')
IFDOS(``pop	%rsi		'')
	ret
EPILOGUE()
ASM_END()