dnl  AMD64 mpn_addlshC_n -- rp[] = up[] + (vp[] << C)
dnl  AMD64 mpn_rsblshC_n -- rp[] = (vp[] << C) - up[]

dnl  Copyright 2009-2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.


C	     cycles/limb
C AMD K8,K9	 2.1
C AMD K10	 2.0
C AMD bd1	~2.7
C AMD bd2	~2.7
C AMD bd3	 ?
C AMD bd4	 ?
C AMD zen	 2.0
C AMD bt1	 3.3
C AMD bt2	 3.0
C Intel P4	 ?
C Intel PNR	 3.0
C Intel NHM	 2.75
C Intel SBR	 2.55
C Intel IBR	 2.49
C Intel HWL	 2.25
C Intel BWL	 1.89
C Intel SKL	 1.90
C Intel atom	 8.4
C Intel SLM	 4.0
C VIA nano	 ?
C INPUT PARAMETERS (SysV register names; DOS64 handled via FUNC_ENTRY)
define(`rp', `%rdi')		C destination limb pointer
define(`up', `%rsi')		C first source limb pointer
define(`vp', `%rdx')		C source to be shifted left by LSH
define(`n', `%rcx')		C limb count

C M = 2^LSH, used below as an lea scale factor, so LSH is presumably
C 1, 2 or 3.  NOTE(review): LSH, RSH, ADDSUB, ADCSBB and func are not
C defined in this file -- they are expected from the including file
C (ADDSUB is `add' or something else per the ifelse at the end; RSH is
C presumably 64-LSH).  TODO confirm against the includer.
define(M, eval(m4_lshift(1,LSH)))

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(func)
	FUNC_ENTRY(4)
	push	%r12		C callee-saved regs used as limb scratch
	push	%r13
	push	%r14
	push	%r15

	mov	(vp), %r8	C v[0]
	lea	(,%r8,M), %r12	C r12 = low part of v[0] << LSH
	shr	$RSH, %r8	C r8 = bits shifted out of v[0]

	mov	R32(n), R32(%rax)
	lea	(rp,n,8), rp	C point operands just past their ends,
	lea	(up,n,8), up	C then index with negative n
	lea	(vp,n,8), vp
	neg	n
	and	$3, R8(%rax)	C n mod 4 selects the loop-entry block
	je	L(b00)
	cmp	$2, R8(%rax)
	jc	L(b01)
	je	L(b10)

C n == 3 (mod 4): handle three limbs, then fall into the 4-way loop.
L(b11):	mov	8(vp,n,8), %r10
	lea	(%r8,%r10,M), %r14	C (v[1]<<LSH) | carried-in high bits
	shr	$RSH, %r10
	mov	16(vp,n,8), %r11
	lea	(%r10,%r11,M), %r15
	shr	$RSH, %r11		C r11 = out-bits, live into loop/end
	ADDSUB	(up,n,8), %r12
	ADCSBB	8(up,n,8), %r14
	ADCSBB	16(up,n,8), %r15
	sbb	R32(%rax), R32(%rax)	C save carry for next: rax = -CF
	mov	%r12, (rp,n,8)
	mov	%r14, 8(rp,n,8)
	mov	%r15, 16(rp,n,8)
	add	$3, n
	js	L(top)
	jmp	L(end)

C n == 1 (mod 4): one limb only.
L(b01):	mov	%r8, %r11	C L(end)/L(top) expect out-bits in r11
	ADDSUB	(up,n,8), %r12
	sbb	R32(%rax), R32(%rax)	C save carry for next: rax = -CF
	mov	%r12, (rp,n,8)
	add	$1, n
	js	L(top)
	jmp	L(end)

C n == 2 (mod 4): two limbs.
L(b10):	mov	8(vp,n,8), %r11
	lea	(%r8,%r11,M), %r15
	shr	$RSH, %r11		C r11 = out-bits, live into loop/end
	ADDSUB	(up,n,8), %r12
	ADCSBB	8(up,n,8), %r15
	sbb	R32(%rax), R32(%rax)	C save carry for next: rax = -CF
	mov	%r12, (rp,n,8)
	mov	%r15, 8(rp,n,8)
	add	$2, n
	js	L(top)
	jmp	L(end)

C n == 0 (mod 4): r12/r8 are already set up and rax is 0 (the `and'
C above produced 0), so jump into the loop past the r12 computation;
C the `add rax,rax' at L(e00) then enters with carry clear.
L(b00):	mov	8(vp,n,8), %r9
	mov	16(vp,n,8), %r10
	jmp	L(e00)

C Main loop, 4 limbs/iteration.  Software-pipelined: r11 carries the
C bits shifted out of the previous limb into this iteration, and rax
C holds the saved carry/borrow (0 or -1) across the flag-clobbering
C lea/shr sequence.
	ALIGN(16)
L(top):	mov	16(vp,n,8), %r10
	mov	(vp,n,8), %r8
	mov	8(vp,n,8), %r9
	lea	(%r11,%r8,M), %r12	C (v<<LSH) | previous out-bits
	shr	$RSH, %r8
L(e00):	lea	(%r8,%r9,M), %r13
	shr	$RSH, %r9
	mov	24(vp,n,8), %r11
	lea	(%r9,%r10,M), %r14
	shr	$RSH, %r10
	lea	(%r10,%r11,M), %r15
	shr	$RSH, %r11
	add	R32(%rax), R32(%rax)	C restore carry: -1 + -1 sets CF
	ADCSBB	(up,n,8), %r12
	ADCSBB	8(up,n,8), %r13
	ADCSBB	16(up,n,8), %r14
	ADCSBB	24(up,n,8), %r15
	mov	%r12, (rp,n,8)
	mov	%r13, 8(rp,n,8)
	mov	%r14, 16(rp,n,8)
	sbb	R32(%rax), R32(%rax)	C save carry for next: rax = -CF
	mov	%r15, 24(rp,n,8)
	add	$4, n
	js	L(top)
L(end):

C Return value.  Here rax = -carry (or -borrow) and r11 = bits shifted
C out of the top limb of vp.
C   addlsh variant: return r11 + carry      (rax = -(-carry - r11))
C   rsblsh variant: return r11 - borrow, sign-extended (may be -1)
ifelse(ADDSUB,add,`
	sub	R32(%r11), R32(%rax)
	neg	R32(%rax)
',`
	add	R32(%r11), R32(%rax)
	movslq	R32(%rax), %rax
')
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	FUNC_EXIT()
	ret
EPILOGUE()