dnl  AMD64 mpn_mul_1.

dnl  Copyright 2003-2005, 2007, 2008, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C AMD K8,K9	 2.54
C AMD K10	 2.54
C AMD bull	 4.98
C AMD pile	 4.80
C AMD steam
C AMD excavator
C AMD bobcat	 5.37
C AMD jaguar	 6.16
C Intel P4	12.6
C Intel core2	 4.05
C Intel NHM	 4.0
C Intel SBR	 2.91
C Intel IBR	 2.73
C Intel HWL	 2.44
C Intel BWL	 2.39
C Intel SKL	 2.44
C Intel atom	19.8
C Intel SLM	 9.0
C VIA nano	 4.25

C The loop of this code is the result of running a code generation and
C optimization tool suite written by David Harvey and Torbjorn Granlund.

C TODO
C  * The loop is great, but the prologue and epilogue code was quickly written.
C    Tune it!
C Argument registers for mpn_mul_1 and mpn_mul_1c.  The macro values are the
C STD64 registers; the trailing comment on each line names the DOS64 register
C that carries the same argument on entry.
define(`rp',      `%rdi')   C rcx
define(`up',      `%rsi')   C rdx
define(`n_param', `%rdx')   C r8
define(`vl',      `%rcx')   C r9

C Working copy of the limb count.  Under STD64 the count arrives in rdx, which
C mul overwrites, so it is moved here after the first multiply.
define(`n',       `%r11')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

C DOS64 remapping.  rp, vl and n stay in their incoming registers (rcx, r9 and
C r8) and up is copied from rdx to rsi by the prologues.  Since r8 and r9 are
C then occupied, the last two defines redirect the scratch uses of %r8 and %r9
C in the code below to r11 and rdi.
IFDOS(`	define(`up', ``%rsi'')	') dnl
IFDOS(`	define(`rp', ``%rcx'')	') dnl
IFDOS(`	define(`vl', ``%r9'')	') dnl
IFDOS(`	define(`r9', ``rdi'')	') dnl
IFDOS(`	define(`n',  ``%r8'')	') dnl
IFDOS(`	define(`r8', ``r11'')	') dnl

ASM_START()
	TEXT
	ALIGN(16)

C mpn_mul_1c -- same as mpn_mul_1 below, but with an extra carry-in limb
C passed as the fifth argument.
C
C The carry-in is loaded into r10 (from r8 under STD64, from the stack under
C DOS64) and control then joins mpn_mul_1 at L(common).  The pushes done here
C mirror the ones in the mpn_mul_1 prologue, so the shared exit code at L(ret)
C undoes them for both entry points.
PROLOGUE(mpn_mul_1c)
IFDOS(``push	%rsi		'')
IFDOS(``push	%rdi		'')
IFDOS(``mov	%rdx, %rsi	'')
	push	%rbx
IFSTD(`	mov	%r8, %r10')
IFDOS(`	mov	64(%rsp), %r10')	C 40 + 3*8  (3 push insns)
	jmp	L(common)
EPILOGUE()

C mpn_mul_1 -- multiply the n limbs at up by the single limb vl, store the
C resulting low limbs at rp and return the final high limb in rax.
C
C   rp       destination limb pointer
C   up       source limb pointer
C   n_param  limb count; a count of zero is not special-cased, the first
C            source limb is read and multiplied unconditionally
C   vl       multiplier limb
C
C rbx is saved and restored around the body.  Under DOS64 rsi and rdi are
C saved and restored as well, since the body uses both there.
PROLOGUE(mpn_mul_1)
IFDOS(``push	%rsi		'')
IFDOS(``push	%rdi		'')
IFDOS(``mov	%rdx, %rsi	'')

	push	%rbx
	xor	%r10, %r10		C plain mpn_mul_1 has no carry-in
L(common):
	mov	(up), %rax		C read first u limb early
IFSTD(`	mov	n_param, %rbx   ')	C move away n from rdx, mul uses it
IFDOS(`	mov	n, %rbx         ')
	mul	vl			C rdx:rax = first limb times vl
IFSTD(`	mov	%rbx, n         ')

	add	%r10, %rax		C fold the carry-in into the first product
	adc	$0, %rdx

C Dispatch on n mod 4.  The loop handles four limbs per pass and has one
C entry label for each residue: 0 -> b0, 2 -> b2, 3 -> b3, 1 falls through.
	and	$3, R32(%rbx)
	jz	L(b0)
	cmp	$2, R32(%rbx)
	jz	L(b2)
	jg	L(b3)

C Entry stubs.  Each one biases up and rp so that the indexed accesses at its
C loop entry label address the second source limb and the first result limb,
C negates n so the index counts up towards zero, and places the first product
C (low half in rax, high half in rdx) in the accumulators that the entry
C label expects.

C n mod 4 == 1.  A count of exactly one is finished here: store the low limb
C and return the high limb through L(ret).
L(b1):	dec	n
	jne	L(gt1)
	mov	%rax, (rp)
	jmp	L(ret)
L(gt1):	lea	8(up,n,8), up
	lea	-8(rp,n,8), rp
	neg	n
	xor	%r10, %r10
	xor	R32(%rbx), R32(%rbx)
	mov	%rax, %r9
	mov	(up,n,8), %rax		C next source limb, multiplied at L(L1)
	mov	%rdx, %r8
	jmp	L(L1)

C n mod 4 == 0.
L(b0):	lea	(up,n,8), up
	lea	-16(rp,n,8), rp
	neg	n
	xor	%r10, %r10
	mov	%rax, %r8
	mov	%rdx, %rbx
	jmp	L(L0)

C n mod 4 == 3.
L(b3):	lea	-8(up,n,8), up
	lea	-24(rp,n,8), rp
	neg	n
	mov	%rax, %rbx
	mov	%rdx, %r10
	jmp	L(L3)

C n mod 4 == 2.
L(b2):	lea	-16(up,n,8), up
	lea	-32(rp,n,8), rp
	neg	n
	xor	%r8, %r8
	xor	R32(%rbx), R32(%rbx)
	mov	%rax, %r10
	mov	24(up,n,8), %rax	C next source limb, multiplied at L(L2)
	mov	%rdx, %r9
	jmp	L(L2)

C Main loop, four limbs per pass.  n is a negative limb index that is advanced
C by 4 at the bottom; the loop repeats while it is still negative.  r10, r9,
C r8 and rbx take turns as the pending result limb: each one collects the high
C half of one product (plus carry) and the low half of the next product, and
C is then stored to rp.
	ALIGN(16)
L(top):	mov	%r10, (rp,n,8)
	add	%rax, %r9
	mov	(up,n,8), %rax
	adc	%rdx, %r8
	mov	$0, R32(%r10)
L(L1):	mul	vl
	mov	%r9, 8(rp,n,8)
	add	%rax, %r8
	adc	%rdx, %rbx
L(L0):	mov	8(up,n,8), %rax
	mul	vl
	mov	%r8, 16(rp,n,8)
	add	%rax, %rbx
	adc	%rdx, %r10
L(L3):	mov	16(up,n,8), %rax
	mul	vl
	mov	%rbx, 24(rp,n,8)
	mov	$0, R32(%r8)		C zero
	mov	%r8, %rbx		C zero
	add	%rax, %r10
	mov	24(up,n,8), %rax
	mov	%r8, %r9		C zero, mov keeps the carry for the adc
	adc	%rdx, %r9
L(L2):	mul	vl
	add	$4, n
	js	L(top)

C Wind-down.  Store the last two result limbs and form the return value.
C r8 is zero on every path that reaches this point (cleared at L(b2) or at
C L(L3)), so the adc only adds the carry flag and the final add adds zero.
	mov	%r10, (rp,n,8)
	add	%rax, %r9
	adc	%r8, %rdx
	mov	%r9, 8(rp,n,8)
	add	%r8, %rdx
L(ret):	mov	%rdx, %rax		C return the most significant limb

	pop	%rbx
IFDOS(``pop	%rdi		'')
IFDOS(``pop	%rsi		'')
	ret
EPILOGUE()