1dnl AMD64 mpn_addmul_1 and mpn_submul_1 for CPUs with mulx. 2 3dnl Copyright 2012, 2013, 2017 Free Software Foundation, Inc. 4 5dnl This file is part of the GNU MP Library. 6dnl 7dnl The GNU MP Library is free software; you can redistribute it and/or modify 8dnl it under the terms of either: 9dnl 10dnl * the GNU Lesser General Public License as published by the Free 11dnl Software Foundation; either version 3 of the License, or (at your 12dnl option) any later version. 13dnl 14dnl or 15dnl 16dnl * the GNU General Public License as published by the Free Software 17dnl Foundation; either version 2 of the License, or (at your option) any 18dnl later version. 19dnl 20dnl or both in parallel, as here. 21dnl 22dnl The GNU MP Library is distributed in the hope that it will be useful, but 23dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 24dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 25dnl for more details. 26dnl 27dnl You should have received copies of the GNU General Public License and the 28dnl GNU Lesser General Public License along with the GNU MP Library. If not, 29dnl see https://www.gnu.org/licenses/. 30 31include(`../config.m4') 32 33C cycles/limb 34C AMD K8,K9 - 35C AMD K10 - 36C AMD bd1 - 37C AMD bd2 - 38C AMD bd3 - 39C AMD bd4 4.3 40C AMD zen 2 41C AMD bt1 - 42C AMD bt2 - 43C Intel P4 - 44C Intel PNR - 45C Intel NHM - 46C Intel SBR - 47C Intel IBR - 48C Intel HWL ? 49C Intel BWL ? 50C Intel SKL ? 
C Intel atom	 -
C Intel SLM	 -
C VIA nano	 -

C Incoming argument registers.  The register named in the trailing note on
C each line is presumably the DOS64 register for the same argument, which
C FUNC_ENTRY(4) would remap -- TODO confirm against the FUNC_ENTRY definition.
define(`rp',      `%rdi')   C rcx
define(`up',      `%rsi')   C rdx
define(`n_param', `%rdx')   C r8
define(`v0_param',`%rcx')   C r9

C Working registers for the size and the multiplier.  They are the incoming
C pair swapped; the xchg in the function body performs the swap, which puts
C the multiplier in %rdx, the implicit source operand of mulx.
define(`n',       `%rcx')
define(`v0',      `%rdx')

C Pick the arithmetic flavour and the exported symbol name according to which
C OPERATION_ symbol is defined: add/adc for addmul_1, sub/sbb for submul_1.
ifdef(`OPERATION_addmul_1',`
      define(`ADDSUB',	      `add')
      define(`ADCSBB',	      `adc')
      define(`func',	      `mpn_addmul_1')
')
ifdef(`OPERATION_submul_1',`
      define(`ADDSUB',	      `sub')
      define(`ADCSBB',	      `sbb')
      define(`func',	      `mpn_submul_1')
')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)

C func (rp, up, n_param, v0_param)
C
C Multiplies each of the n_param limbs at up by v0_param and ADDSUBs the
C products into the limbs at rp, propagating carry/borrow from limb to limb.
C The limb that falls off the top is left in %rax at the ret.
C
C Layout of the code:
C   * setup: bias up and rp, negate the count, swap count and multiplier
C   * dispatch on the count mod 4 to one of the feed-in blocks b0..b3
C   * a 4-way unrolled loop (top) entered at lo0..lo3
C   * wind-down (end, wd3..wd1) storing the last four limbs
C
C Register use inside the loop, as established by the mulx operands:
C   %r9,  %r11, %r13, %rbx   low product limbs
C   %r8,  %r10, %r12, %rax   high product limbs (%rax doubles as carry limb)
C %rbx, %r12 and %r13 are saved and restored around the body.
C
C mulx leaves the flags untouched, which is what allows the multiplies to be
C interleaved with the ADDSUB/ADCSBB and adc carry chains below.
C
C NOTE(review): a count of zero takes the b0 path and touches four limbs;
C presumably callers guarantee n_param >= 1 -- confirm.

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(func)
	FUNC_ENTRY(4)
	mov	(up), %r8		C first source limb, read before up is biased

	push	%rbx
	push	%r12
	push	%r13

	lea	(up,n_param,8), up	C up = one past the last source limb
	lea	-32(rp,n_param,8), rp	C rp = four limbs before the end of rp
	mov	R32(n_param), R32(%rax)	C copy of the count for the mod 4 test
	xchg	v0_param, v0		C FIXME: is this insn fast?

	neg	n			C n counts upwards towards zero

	and	$3, R8(%rax)		C al = count mod 4
	jz	L(b0)
	cmp	$2, R8(%rax)
	jz	L(b2)
	jg	L(b3)			C al = 3; otherwise al = 1, fall into b1

C Count mod 4 = 1: one limb of feed-in.
L(b1):	mulx(	%r8, %rbx, %rax)
	sub	$-1, n			C n += 1; CF is clear if the result is zero
	jz	L(wd1)			C count was 1, go straight to the last store
	.byte	0xc4,0x62,0xb3,0xf6,0x04,0xce		C mulx (up,n,8), %r9, %r8
	.byte	0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08	C mulx 8(up,n,8), %r11, %r10
	test	R32(%rax), R32(%rax)	C clear cy
	jmp	L(lo1)

C Count mod 4 = 0: no limb is finished here, three products are preloaded.
L(b0):	mulx(	%r8, %r9, %r8)
	.byte	0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08	C mulx 8(up,n,8), %r11, %r10
	.byte	0xc4,0x62,0x93,0xf6,0x64,0xce,0x10	C mulx 16(up,n,8), %r13, %r12
	xor	R32(%rax), R32(%rax)	C clears CF; rax is rewritten by the mulx at lo0
	jmp	L(lo0)

C Count mod 4 = 3: three limbs of feed-in, high limbs folded into the next
C low limbs before joining the loop.
L(b3):	mulx(	%r8, %r11, %r10)
	.byte	0xc4,0x62,0x93,0xf6,0x64,0xce,0x08	C mulx 8(up,n,8), %r13, %r12
	.byte	0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x10	C mulx 16(up,n,8), %rbx, %rax
	add	%r10, %r13
	adc	%r12, %rbx
	adc	$0, %rax
	sub	$-3, n			C n += 3; CF is clear if the result is zero
	jz	L(wd3)			C count was 3, only the wind-down remains
	test	R32(%rax), R32(%rax)	C clear cy
	jmp	L(lo3)

C Count mod 4 = 2: two limbs of feed-in.
L(b2):	mulx(	%r8, %r13, %r12)
	.byte	0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x08	C mulx 8(up,n,8), %rbx, %rax
	add	%r12, %rbx
	adc	$0, %rax
	sub	$-2, n			C n += 2; CF is clear if the result is zero
	jz	L(wd2)			C count was 2, only the wind-down remains
	.byte	0xc4,0x62,0xb3,0xf6,0x04,0xce		C mulx (up,n,8), %r9, %r8
	test	R32(%rax), R32(%rax)	C clear cy
	jmp	L(lo2)

C Main loop, four limbs per pass.  The first store uses ADDSUB, so it starts a
C fresh carry chain and ignores the CF left by the add to n at the bottom.
C Each low-limb register is stored before the following mulx rewrites it.
L(top):	ADDSUB	%r9, (rp,n,8)
L(lo3):	.byte	0xc4,0x62,0xb3,0xf6,0x04,0xce		C mulx (up,n,8), %r9, %r8
	ADCSBB	%r11, 8(rp,n,8)
L(lo2):	.byte	0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08	C mulx 8(up,n,8), %r11, %r10
	ADCSBB	%r13, 16(rp,n,8)
L(lo1):	.byte	0xc4,0x62,0x93,0xf6,0x64,0xce,0x10	C mulx 16(up,n,8), %r13, %r12
	ADCSBB	%rbx, 24(rp,n,8)
	adc	%rax, %r9		C previous carry limb plus store carry/borrow
L(lo0):	.byte	0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18	C mulx 24(up,n,8), %rbx, %rax
	adc	%r8, %r11		C fold each high limb into the next low limb
	adc	%r10, %r13
	adc	%r12, %rbx
	adc	$0, %rax		C rax = carry limb
	add	$4, n
	js	L(top)			C loop while n is still negative

C Wind-down: store the last four limbs.  rp was biased by -32 at entry, so
C they sit at offsets 0..24.  The feed-in blocks enter at wd3/wd2/wd1 with CF
C clear when the whole operand was shorter than four limbs.
L(end):	ADDSUB	%r9, (rp)
L(wd3):	ADCSBB	%r11, 8(rp)
L(wd2):	ADCSBB	%r13, 16(rp)
L(wd1):	ADCSBB	%rbx, 24(rp)
	adc	n, %rax			C n is zero here for n_param >= 1; adds final CF
	pop	%r13
	pop	%r12
	pop	%rbx
	FUNC_EXIT()
	ret
EPILOGUE()
ASM_END()