1dnl AMD64 mpn_addmul_1 and mpn_submul_1. 2 3dnl Copyright 2003-2005, 2007, 2008, 2011, 2012 Free Software Foundation, Inc. 4 5dnl This file is part of the GNU MP Library. 6dnl 7dnl The GNU MP Library is free software; you can redistribute it and/or modify 8dnl it under the terms of either: 9dnl 10dnl * the GNU Lesser General Public License as published by the Free 11dnl Software Foundation; either version 3 of the License, or (at your 12dnl option) any later version. 13dnl 14dnl or 15dnl 16dnl * the GNU General Public License as published by the Free Software 17dnl Foundation; either version 2 of the License, or (at your option) any 18dnl later version. 19dnl 20dnl or both in parallel, as here. 21dnl 22dnl The GNU MP Library is distributed in the hope that it will be useful, but 23dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 24dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 25dnl for more details. 26dnl 27dnl You should have received copies of the GNU General Public License and the 28dnl GNU Lesser General Public License along with the GNU MP Library. If not, 29dnl see https://www.gnu.org/licenses/. 30 31include(`../config.m4') 32 33C cycles/limb 34C AMD K8,K9 2.52 35C AMD K10 2.51 36C AMD bd1 4.43 37C AMD bd2 5.03 5.63 38C AMD bd3 ? 39C AMD bd4 ? 40C AMD zen ? 41C AMD bobcat 6.20 42C AMD jaguar 5.57 6.56 43C Intel P4 14.9 17.1 44C Intel core2 5.15 45C Intel NHM 4.93 46C Intel SBR 3.95 47C Intel IBR 3.75 48C Intel HWL 3.62 49C Intel BWL 2.53 50C Intel SKL 2.53 51C Intel atom 21.3 52C Intel SLM 9.0 53C VIA nano 5.0 54 55C The loop of this code is the result of running a code generation and 56C optimization tool suite written by David Harvey and Torbjorn Granlund. 57 58C TODO 59C * The loop is great, but the prologue and epilogue code was quickly written. 60C Tune it! 

C ---------------------------------------------------------------------------
C func: mpn_addmul_1 or mpn_submul_1, chosen at build time by
C OPERATION_addmul_1 or OPERATION_submul_1.
C
C Every one of the n limbs at up is multiplied by vl, and the product is
C added to (addmul) or subtracted from (submul) the limbs at rp in place,
C with the carry or borrow propagated to the next limb.  The value returned
C in rax is the last high product limb plus the final carry or borrow.
C
C NOTE(review): the first limb is loaded from up before n is looked at and
C there is no path for n = 0, so callers presumably guarantee n >= 1.
C ---------------------------------------------------------------------------

C Argument registers for STD64.  The trailing comment on each line names the
C register that carries the same argument on entry under DOS64.
define(`rp',      `%rdi')   C DOS64: rcx
define(`up',      `%rsi')   C DOS64: rdx, copied to rsi in the prologue
define(`n_param', `%rdx')   C DOS64: r8
define(`vl',      `%rcx')   C DOS64: r9

C Working copy of the limb count.  It cannot stay in rdx because mul writes
C the high half of every product there.
define(`n',       `%r11')

C Pick the read-modify-write instruction and the exported symbol name.
ifdef(`OPERATION_addmul_1',`
      define(`ADDSUB',        `add')
      define(`func',  `mpn_addmul_1')
')
ifdef(`OPERATION_submul_1',`
      define(`ADDSUB',        `sub')
      define(`func',  `mpn_submul_1')
')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)

C DOS64 register remapping.  rp, vl and n are used straight from their
C incoming registers rcx, r9 and r8.  The body also uses the bare names r9
C and r8 as accumulators, so those two names are redirected to rdi and r11.
C The doubled quotes appear intended to keep the register text inside vl and
C n from being redirected as well.
IFDOS(`	define(`up', ``%rsi'')	') dnl
IFDOS(`	define(`rp', ``%rcx'')	') dnl
IFDOS(`	define(`vl', ``%r9'')	') dnl
IFDOS(`	define(`r9', ``rdi'')	') dnl
IFDOS(`	define(`n',  ``%r8'')	') dnl
IFDOS(`	define(`r8', ``r11'')	') dnl

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(func)

C DOS64 only: rsi and rdi are callee-saved under that ABI and both get
C overwritten below (rsi holds up, rdi stands in for r9), so save them first
C and then copy the incoming up pointer from rdx into rsi.
IFDOS(``push	%rsi		'')
IFDOS(``push	%rdi		'')
IFDOS(``mov	%rdx, %rsi	'')

	mov	(up), %rax		C read first u limb early
	push	%rbx			C callee-saved, used as an accumulator below
IFSTD(`	mov	n_param, %rbx	')	C move away n from rdx, mul uses it
IFDOS(`	mov	n, %rbx		')
	mul	vl			C rdx:rax = first limb times vl
IFSTD(`	mov	%rbx, n		')

C Dispatch on n mod 4 to the matching entry into the four-way unrolled loop.
	and	$3, R32(%rbx)
	jz	L(b0)
	cmp	$2, R32(%rbx)
	jz	L(b2)
	jg	L(b3)

C n mod 4 = 1.  A one-limb operand is finished right here with a single
C ADDSUB; anything longer is set up below and enters the loop at L1.
L(b1):	dec	n
	jne	L(gt1)
	ADDSUB	%rax, (rp)
	jmp	L(ret)
C The lea pair biases up and rp so that the negated count can index both
C from this entry point.  The first product goes to r9 (low) and r8 (high),
C r10 and rbx start at zero, and the next limb is preloaded for the mul at L1.
L(gt1):	lea	8(up,n,8), up
	lea	-8(rp,n,8), rp
	neg	n
	xor	%r10, %r10
	xor	R32(%rbx), R32(%rbx)
	mov	%rax, %r9
	mov	(up,n,8), %rax
	mov	%rdx, %r8
	jmp	L(L1)

C n mod 4 = 0.  First product goes to r8 (low) and rbx (high); enter at L0,
C which loads the next limb itself.
L(b0):	lea	(up,n,8), up
	lea	-16(rp,n,8), rp
	neg	n
	xor	%r10, %r10
	mov	%rax, %r8
	mov	%rdx, %rbx
	jmp	L(L0)

C n mod 4 = 3.  First product goes to rbx (low) and r10 (high); enter at L3,
C which loads the next limb itself and clears r8 and r9.
L(b3):	lea	-8(up,n,8), up
	lea	-24(rp,n,8), rp
	neg	n
	mov	%rax, %rbx
	mov	%rdx, %r10
	jmp	L(L3)

C n mod 4 = 2.  First product goes to r10 (low) and r9 (high), r8 and rbx
C start at zero, and the next limb is preloaded for the mul at L2.
L(b2):	lea	-16(up,n,8), up
	lea	-32(rp,n,8), rp
	neg	n
	xor	%r8, %r8
	xor	R32(%rbx), R32(%rbx)
	mov	%rax, %r10
	mov	24(up,n,8), %rax
	mov	%rdx, %r9
	jmp	L(L2)

C Main loop, four limbs per pass.  n is negative and steps up by 4; the loop
C is left once n is no longer negative.  r10, r9, r8 and rbx take turns as
C accumulators: each stage applies one finished limb to rp with ADDSUB, then
C uses adc to fold the resulting carry or borrow together with the low and
C high halves of the newest product into the next two accumulators.
	ALIGN(16)
L(top):	ADDSUB	%r10, (rp,n,8)
	adc	%rax, %r9
	mov	(up,n,8), %rax
	adc	%rdx, %r8
	mov	$0, R32(%r10)		C r10 is free again, restart it at zero
L(L1):	mul	vl
	ADDSUB	%r9, 8(rp,n,8)
	adc	%rax, %r8
	adc	%rdx, %rbx
L(L0):	mov	8(up,n,8), %rax
	mul	vl
	ADDSUB	%r8, 16(rp,n,8)
	adc	%rax, %rbx
	adc	%rdx, %r10
L(L3):	mov	16(up,n,8), %rax
	mul	vl
	ADDSUB	%rbx, 24(rp,n,8)
C The three clears below sit between the ADDSUB above and the adc that
C consumes its carry, so they use mov, which leaves the flags untouched.
	mov	$0, R32(%r8)		C zero
	mov	%r8, %rbx		C zero
	adc	%rax, %r10
	mov	24(up,n,8), %rax
	mov	%r8, %r9		C zero
	adc	%rdx, %r9
L(L2):	mul	vl
	add	$4, n
	js	L(top)

C Wind-down: apply the two limbs still pending in r10 and r9.  r8 is zero on
C every path that reaches this point, so the adc of r8 into rdx only adds the
C carry flag to the last high product limb.
	ADDSUB	%r10, (rp,n,8)
	adc	%rax, %r9
	adc	%r8, %rdx
	ADDSUB	%r9, 8(rp,n,8)
C Fold the carry or borrow of the last ADDSUB into rdx and return it in rax.
L(ret):	adc	$0, %rdx
	mov	%rdx, %rax

C Restore the saved registers in reverse order of the pushes.
	pop	%rbx
IFDOS(``pop	%rdi		'')
IFDOS(``pop	%rsi		'')
	ret
EPILOGUE()