dnl  AMD64 mpn_addmul_1/mpn_submul_1 optimised for Intel Atom.

dnl  Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C            cycles/limb
C AMD K8,K9      4.5
C AMD K10        4.5
C AMD bull       4.73
C AMD pile       4.60    4.80
C AMD steam
C AMD excavator
C AMD bobcat     5.48
C AMD jaguar     5.61
C Intel P4      16.6
C Intel core2    5.09
C Intel NHM      4.79
C Intel SBR      3.88
C Intel IBR      3.65
C Intel HWL      3.53
C Intel BWL      2.75
C Intel SKL      2.76
C Intel atom    19.4
C Intel SLM      8
C VIA nano

C The loop of this code is the result of running a code generation and
C optimisation tool suite written by David Harvey and Torbjorn Granlund.
C INPUT PARAMETERS
C   rp       destination limb vector, read and updated in place
C   up       source limb vector, read only
C   n_param  number of limbs to process
C   v0       single-limb multiplier
C The register named in the trailing comment of each define is presumably
C the DOS64 argument register for that parameter -- TODO confirm against the
C FUNC_ENTRY macro in the m4 support files.
define(`rp',      `%rdi')   C rcx
define(`up',      `%rsi')   C rdx
define(`n_param', `%rdx')   C r8
define(`v0',      `%rcx')   C r9

C Loop index.  rdx cannot hold it because every mul overwrites rdx.
define(`n',       `%rbx')

ifdef(`OPERATION_addmul_1',`
      define(`ADDSUB',        `add')
      define(`func',  `mpn_addmul_1')
')
ifdef(`OPERATION_submul_1',`
      define(`ADDSUB',        `sub')
      define(`func',  `mpn_submul_1')
')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)

C ---------------------------------------------------------------------------
C func: for each of the n_param limbs, form the two-limb product of the source
C limb and v0, then add it to (addmul) or subtract it from (submul) the
C destination vector, propagating the carry or borrow from limb to limb.
C
C Returns: rax = the limb carried (or borrowed) out of the most significant
C          position.
C Clobbers: rax, rdx, r8, r9, r10, r11, flags; rp and up are rebased in place.
C           rbx is saved on entry and restored before returning.
C Assumes n_param >= 1: the first source limb is loaded unconditionally.
C
C Structure: the two low bits of n_param select one of four feed-in blocks,
C which precompute one or two products and jump into the matching point of a
C four-limb unrolled loop.  n is a negative limb index that counts up by 4;
C it equals 1 whenever control reaches L(end), L(cj2) or L(cj1).
C ---------------------------------------------------------------------------
ASM_START()
        TEXT
        ALIGN(16)
PROLOGUE(func)
        FUNC_ENTRY(4)                   C macro defined outside this file
        push    %rbx                    C callee-saved, used for n

        mov     (up), %rax              C first source limb, before up is rebased
        lea     -8(up,n_param,8), up    C up now addresses the last source limb
        lea     -16(rp,n_param,8), rp   C rp now addresses the second-last dest limb

        test    $1, R8(n_param)         C dispatch on n_param mod 4: bit 0 first
        jnz     L(bx1)

L(bx0): test    $2, R8(n_param)         C then bit 1
        jnz     L(b10)

C n_param mod 4 == 0: two products are formed, loop entered at L(lo0).
L(b00): mov     $1, R32(n)
        sub     n_param, n              C n = 1 - n_param
        mul     v0
        mov     %rax, %r11
        mov     8(up,n,8), %rax
        mov     %rdx, %r10
        mul     v0
        mov     %rax, %r8
        mov     16(up,n,8), %rax
        jmp     L(lo0)

C n_param mod 4 == 2: two products are formed; n_param == 2 skips the loop.
L(b10): mov     $3, R32(n)
        sub     n_param, n              C n = 3 - n_param
        mul     v0
        mov     %rax, %r11
        mov     -8(up,n,8), %rax
        mov     %rdx, %r10
        mul     v0
        test    n, n
        jns     L(cj2)                  C n >= 0 means only two limbs in total
        mov     %rax, %r8
        mov     (up,n,8), %rax
        mov     %rdx, %r9
        jmp     L(lo2)

L(bx1): test    $2, R8(n_param)
        jnz     L(b11)

C n_param mod 4 == 1: n_param == 1 goes straight to the final store.
L(b01): mov     $2, R32(n)
        sub     n_param, n              C n = 2 - n_param
        mul     v0
        test    n, n
        jns     L(cj1)                  C n >= 0 means a single limb in total
        mov     %rax, %r8
        mov     (up,n,8), %rax
        mov     %rdx, %r9
        mul     v0
        mov     %rax, %r11
        mov     8(up,n,8), %rax
        mov     %rdx, %r10
        jmp     L(lo1)

C n_param mod 4 == 3: two products are formed, loop entered at L(lo3).
L(b11): xor     R32(n), R32(n)
        sub     n_param, n              C n = -n_param
        mul     v0
        mov     %rax, %r8
        mov     16(up,n,8), %rax
        mov     %rdx, %r9
        mul     v0
        mov     %rax, %r11
        mov     24(up,n,8), %rax
        jmp     L(lo3)

C Main loop, four limbs per pass.  r8 and r11 alternate as the pending low
C product limb, r9 and r10 as the pending high limb.  Each ADDSUB to memory
C is followed by an adc pair that folds its carry, together with the previous
C high limb, into the next limb to be stored.  Statement order matters: only
C mov and mul sit between an ADDSUB and the adc that consumes its carry flag.
        ALIGN(16)
L(top): mul     v0
        ADDSUB  %r8, -16(rp,n,8)
        mov     %rax, %r8
        mov     (up,n,8), %rax
        adc     %r9, %r11
        mov     %rdx, %r9
        adc     $0, %r10
L(lo2): mul     v0
        ADDSUB  %r11, -8(rp,n,8)
        mov     %rax, %r11
        mov     8(up,n,8), %rax
        adc     %r10, %r8
        mov     %rdx, %r10
        adc     $0, %r9
L(lo1): mul     v0
        ADDSUB  %r8, (rp,n,8)
        mov     %rax, %r8
        adc     %r9, %r11
        mov     16(up,n,8), %rax
        adc     $0, %r10
L(lo0): mov     %rdx, %r9
        mul     v0
        ADDSUB  %r11, 8(rp,n,8)
        mov     %rax, %r11
        adc     %r10, %r8
        mov     24(up,n,8), %rax
        adc     $0, %r9
L(lo3): add     $4, n                   C no carry is pending here, flags are free
        mov     %rdx, %r10
        js      L(top)                  C loop while n is still negative

C Wind-down: the last product, then the three remaining stores.
L(end): mul     v0
        ADDSUB  %r8, -16(rp,n,8)
        adc     %r9, %r11
        adc     $0, %r10
L(cj2): ADDSUB  %r11, -8(rp,n,8)
        adc     %r10, %rax
        adc     $0, %rdx
L(cj1): ADDSUB  %rax, (rp,n,8)
        mov     $0, R32(%rax)           C mov, not xor: the carry flag must survive
        adc     %rdx, %rax              C return value = last high limb + carry
        pop     %rbx
        FUNC_EXIT()
        ret
EPILOGUE()
ASM_END()