dnl  mul_2.asm revision 1.1.1.1
dnl  AMD64 mpn_mul_2 optimised for Intel Sandy Bridge.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C              cycles/limb    best
C AMD K8,K9
C AMD K10
C AMD bull
C AMD pile
C AMD bobcat
C AMD jaguar
C Intel P4
C Intel core
C Intel NHM
C Intel SBR       2.57        2.52 using 4-way code
C Intel IBR       2.35        2.32 using 4-way code
C Intel HWL       2.02        1.86
C Intel BWL
C Intel atom
C VIA nano

C This code is the result of running a code generation and optimisation tool
C suite written by David Harvey and Torbjorn Granlund.

C When playing with pointers, set this to $2 to fall back to conservative
C indexing in wind-down code.
define(`I',`$1')

C mpn_mul_2 (rp, up, n_param, vp)
C
C Multiply the n_param-limb operand at up by the two limbs vp[0], vp[1].
C The low n_param+1 limbs of the product are stored at rp[0] ... rp[n_param]
C and the most significant limb is returned in %rax.
C
C Syntax is AT&T (source, destination) run through m4.  Arguments arrive in
C the STD64 registers named by the defines below; the register in the
C trailing comment of each define is where DOS64 passes the same argument.
C FUNC_ENTRY(4) and FUNC_EXIT() come from config.m4 and presumably move the
C DOS64 arguments into the STD64 registers and back -- confirm there.
C
C Register use:
C   v0, v1   multiplier limbs vp[0] and vp[1]; they live in the callee-saved
C            %rbx and %rbp, which are pushed on entry and popped on exit
C   n        negative limb index that counts up to 0; it shares %rcx with
C            vp, which is no longer needed once v0 and v1 are loaded
C   up, rp   advanced by n_param limbs on entry, so that (up,n,8) and
C            (rp,n,8) walk both operands from the low end
C   w0-w3    partial sums and carries in the volatile %r8-%r11
C
C The loop handles two limbs of up per pass.  It is entered at L(lo0) when
C n_param is even and at L(lo1) when it is odd.  add $2,n produces a carry
C exactly when n reaches 0, which ends the loop, and L(end) finishes the last
C limb.  Several adc instructions consume the carry of an add that sits a
C few mov instructions earlier (mov leaves the flags alone), so instructions
C must not be reordered without checking the carry chain.
C
C The numeric tags on the loop instructions (C 1 ... C 4) are kept from the
C original; they appear to name the result limb, relative to the current
C pass, that the instruction feeds -- not verified here.
C
C NOTE(review): with n_param = 1 the L(b1) path sets n = 0, L(lo1) then
C loads (up,n,8), which is one limb past the operand, and add $2,n never
C produces the carry that ends the loop.  The routine therefore appears to
C require n_param >= 2 -- confirm against the callers.

define(`rp',      `%rdi')	C DOS64: rcx
define(`up',      `%rsi')	C DOS64: rdx
define(`n_param', `%rdx')	C DOS64: r8
define(`vp',      `%rcx')	C DOS64: r9

define(`n',       `%rcx')
define(`v0',      `%rbx')
define(`v1',      `%rbp')

define(`w0',      `%r8')
define(`w1',      `%r9')
define(`w2',      `%r10')
define(`w3',      `%r11')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_mul_2)
	FUNC_ENTRY(4)
	push	%rbx			C v0 and v1 use callee-saved registers
	push	%rbp

	mov	(vp), v0		C v0 = vp[0]
	mov	8(vp), v1		C v1 = vp[1]; vp (%rcx) is free from here on

	mov	(up), %rax		C up[0], fetched before up is moved
	lea	(up,n_param,8), up	C up and rp now point just past limb
	lea	(rp,n_param,8), rp	C n_param-1; indexing uses negative n

	test	$1, R8(n_param)		C odd limb count?
	jnz	L(b1)

C Even n_param: n = -n_param.  Start the pipeline with up[0]*v0 and join the
C loop at L(lo0), which multiplies the reloaded up[0] by v1.
L(b0):	mov	$0, R32(n)
	sub	n_param, n
	xor	w0, w0			C nothing carried in from a previous limb
	mul	v0
	mov	%rax, w2
	mov	%rdx, w1
	mov	(up,n,8), %rax		C reload up[0]; mul overwrote %rax
	jmp	L(lo0)

C Odd n_param: n = 1 - n_param.  Form up[0]*v0 and up[0]*v1 here and join
C the loop at L(lo1).
L(b1):	mov	$1, R32(n)
	sub	n_param, n
	xor	w2, w2			C nothing carried in from a previous limb
	mul	v0
	mov	%rax, w0
	mov	%rdx, w3
	mov	-8(up,n,8), %rax	C reload up[0]; mul overwrote %rax
	mul	v1
	jmp	L(lo1)

	ALIGN(32)
L(top):	mul	v0
	add	%rax, w0		C 1
	mov	%rdx, w3		C 2
	adc	$0, w3			C 2
	mov	-8(up,n,8), %rax
	mul	v1
	add	w1, w0			C 1
	adc	$0, w3			C 2
L(lo1):	add	%rax, w2		C 2
	mov	w0, -8(rp,n,8)		C 1
	mov	%rdx, w0		C 3
	adc	$0, w0			C 3
	mov	(up,n,8), %rax
	mul	v0
	add	%rax, w2		C 2
	mov	%rdx, w1		C 3
	adc	$0, w1			C 3
	add	w3, w2			C 2
	mov	(up,n,8), %rax
	adc	$0, w1			C 1
L(lo0):	mul	v1
	mov	w2, (rp,n,8)		C 2
	add	%rax, w0		C 3
	mov	%rdx, w2		C 4
	mov	8(up,n,8), %rax
	adc	$0, w2			C 4
	add	$2, n
	jnc	L(top)			C carry out means n reached 0

C Wind-down: the last limb of up times v0, then times v1.  n is 0 here, so
C the unindexed first argument of each I() names the same limb as the
C indexed second one.
L(end):	mul	v0
	add	%rax, w0
	mov	%rdx, w3
	adc	$0, w3
	mov	I(-8(up),-8(up,n,8)), %rax
	mul	v1
	add	w1, w0
	adc	$0, w3
	add	%rax, w2
	mov	w0, I(-8(rp),-8(rp,n,8))
	adc	$0, %rdx		C carry from add %rax, w2
	add	w3, w2
	mov	w2, I((rp),(rp,n,8))
	adc	$0, %rdx		C carry from add w3, w2
	mov	%rdx, %rax		C return the most significant limb

	pop	%rbp			C restore in reverse push order
	pop	%rbx
	FUNC_EXIT()
	ret
EPILOGUE()