dnl  X86-64 mpn_mul_1 optimised for Intel Sandy Bridge.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  Copyright 2003-2005, 2007, 2008, 2011-2013, 2017 Free Software Foundation,
dnl  Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C AMD K8,K9
C AMD K10
C AMD bull
C AMD pile
C AMD steam
C AMD excavator
C AMD bobcat
C AMD jaguar
C Intel P4
C Intel core2
C Intel NHM
C Intel SBR	 2.49
C Intel IBR	 2.32
C Intel HWL	 2.44
C Intel BWL	 2.43
C Intel SKL	 2.47
C Intel atom
C Intel SLM
C VIA nano

C The loop of this code is the result of running a code generation and
C optimisation tool suite written by David Harvey and Torbjorn Granlund.

C Argument names.  Each define below gives the STD64 register; the trailing
C comment names the DOS64 location that the IFDOS block further down
C substitutes for it.
define(`rp',      `%rdi')   C rcx
define(`up_param',`%rsi')   C rdx
define(`n_param', `%rdx')   C r8
define(`v0',      `%rcx')   C r9
define(`cin',     `%r8')    C stack

C Working registers used after the entry sequence.
define(`up',      `%rsi')   C same as up_param under STD64
define(`n',       `%r9')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

C DOS64 overrides.  cin is the fifth argument and is read from the stack;
C the 48 byte offset is used after the push of %rsi in the entry sequence
C (8 return address + 32 shadow space + 8 saved %rsi).
IFDOS(`	define(`rp',      `%rcx')')
IFDOS(`	define(`up_param',`%rdx')')
IFDOS(`	define(`n_param', `%r8')')
IFDOS(`	define(`v0',      `%r9')')
IFDOS(`	define(`cin',     `48(%rsp)')')

C Under DOS64 n shares %r8 with n_param, so no copy is needed there, and
C up lives in %rsi, which that ABI treats as callee-saved (hence the
C push/pop pairs below).
IFDOS(`	define(`up',      `%rsi')')
IFDOS(`	define(`n',       `%r8')')

ASM_START()
	TEXT
	ALIGN(16)
C mpn_mul_1 (rp, up, n, v0)
C
C Multiplies the n limbs at up by the single limb v0, stores the n low result
C limbs at rp and returns the most significant limb in %rax.
C
C up[0] is loaded unconditionally, so n is presumably required to be at
C least 1 -- confirm against the caller contract.
C
C Register use: %rax/%rdx hold each product, %r10 and %r11 take turns
C holding the limb about to be stored and the high half carried into the
C following limb.  Only volatile registers are written, apart from %rsi
C under DOS64 which is saved and restored.
C
C The labels L(L1), L(L0c), L(L3c) and L(L2c) inside the loop are also
C jump targets of mpn_mul_1c further down.
PROLOGUE(mpn_mul_1)
IFDOS(`	push	%rsi		')
	mov	(up_param), %rax		C rax = up[0]
IFSTD(`	mov	n_param, n	')
	lea	(up_param,n_param,8), up	C up = one past the last source limb
	lea	-8(rp,n_param,8), rp		C rp = address of the last result limb
	neg	n				C n = -n, index counts up towards 0
	mul	v0				C rdx:rax = up[0] * v0

C Dispatch on the two low bits of -n to pick the loop entry point that
C matches n mod 4.
	test	$1, R8(n)
	jz	L(x0)				C n even
L(x1):	mov	%rax, %r11
	mov	%rdx, %r10
	test	$2, R8(n)
	jnz	L(01)				C n mod 4 = 1

L(11):	mov	8(up,n,8), %rax			C n mod 4 = 3
	dec	n
	jmp	L(L3)

L(01):	inc	n
	jnz	L(L1)
C n was 1: the single product is the whole result.
	mov	%rax, (rp)
	mov	%rdx, %rax			C return the high limb
IFDOS(`	pop	%rsi		')
	ret

L(x0):	mov	%rax, %r10
	mov	%rdx, %r11
	mov	8(up,n,8), %rax
	test	$2, R8(n)
	jz	L(L0)				C n mod 4 = 0

L(10):	add	$-2, n				C n mod 4 = 2
	jmp	L(L2)

	ALIGN(8)
C Main loop, four limbs per pass.  Each quarter loads the next source limb,
C folds the pending carry into the high half with adc, multiplies, and
C stores the limb completed by the previous quarter.
L(top):	mov	%rdx, %r10
	add	%rax, %r11
L(L1):	mov	0(up,n,8), %rax
	adc	$0, %r10
	mul	v0
	add	%rax, %r10
	mov	%r11, 0(rp,n,8)
	mov	8(up,n,8), %rax
	mov	%rdx, %r11
L(L0c):	adc	$0, %r11
L(L0):	mul	v0
	mov	%r10, 8(rp,n,8)
	add	%rax, %r11
	mov	%rdx, %r10
L(L3c):	mov	16(up,n,8), %rax
	adc	$0, %r10
L(L3):	mul	v0
	mov	%r11, 16(rp,n,8)
	mov	%rdx, %r11
	add	%rax, %r10
L(L2c):	mov	24(up,n,8), %rax
	adc	$0, %r11
L(L2):	mul	v0
	mov	%r10, 24(rp,n,8)
	add	$4, n				C carry out only when n reaches 0
	jnc	L(top)

C Wind down: finish the last limb and return the final high half plus carry.
L(end):	add	%rax, %r11
	mov	%rdx, %rax
	adc	$0, %rax
	mov	%r11, (rp)

IFDOS(`	pop	%rsi		')
	ret
EPILOGUE()

	ALIGN(16)
C mpn_mul_1c (rp, up, n, v0, cin)
C
C Same operation as mpn_mul_1 with the extra limb cin added into the least
C significant result limb; the return value in %rax is again the most
C significant limb.
C
C Only the entry sequence and the n = 1 exit live here.  Every other path
C adds cin to the first low product and then jumps into the loop of
C mpn_mul_1 at a label placed so that the next flag-consuming instruction is
C the adc that absorbs the carry of that addition.  The inc, dec and mov
C instructions executed in between leave the carry flag untouched.
PROLOGUE(mpn_mul_1c)
IFDOS(`	push	%rsi		')
	mov	(up_param), %rax		C rax = up[0]
IFSTD(`	mov	n_param, n	')
	lea	(up_param,n_param,8), up	C up = one past the last source limb
	lea	-8(rp,n_param,8), rp		C rp = address of the last result limb
	neg	n				C n = -n, index counts up towards 0
	mul	v0				C rdx:rax = up[0] * v0

	test	$1, R8(n)
	jz	L(x0c)				C n even
L(x1c):	mov	%rax, %r11
	mov	%rdx, %r10
	test	$2, R8(n)
	jnz	L(01c)				C n mod 4 = 1

L(11c):	add	cin, %r11			C n mod 4 = 3
	dec	n
	jmp	L(L3c)

L(01c):	add	cin, %r11
	inc	n
	jnz	L(L1)
C n was 1: store the single limb and return the high half plus carry.
	mov	%r11, (rp)
	mov	%rdx, %rax
	adc	$0, %rax
IFDOS(`	pop	%rsi		')
	ret

L(x0c):	mov	%rax, %r10
	mov	%rdx, %r11
	test	$2, R8(n)
	jz	L(00c)				C n mod 4 = 0

C The index adjustment comes before the add of cin here because add $-2
C rewrites the carry flag.
L(10c):	add	$-2, n				C n mod 4 = 2
	add	cin, %r10
	jmp	L(L2c)

L(00c):	add	cin, %r10
	mov	8(up,n,8), %rax
	jmp	L(L0c)
EPILOGUE()