dnl  aors_err1_n.asm revision 1.1.1.1
dnl  AMD64 mpn_add_err1_n, mpn_sub_err1_n

dnl  Contributed by David Harvey.

dnl  Copyright 2011 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C AMD K8,K9	 2.75	(most alignments, degenerates to 3 c/l for some alignments)
C AMD K10	 ?
C Intel P4	 ?
C Intel core2	 ?
C Intel corei	 ?
C Intel atom	 ?
C VIA nano	 ?
32 33 34C INPUT PARAMETERS 35define(`rp', `%rdi') 36define(`up', `%rsi') 37define(`vp', `%rdx') 38define(`ep', `%rcx') 39define(`yp', `%r8') 40define(`n', `%r9') 41define(`cy_param', `8(%rsp)') 42 43define(`el', `%rbx') 44define(`eh', `%rbp') 45define(`t0', `%r10') 46define(`t1', `%r11') 47define(`t2', `%r12') 48define(`t3', `%r13') 49define(`w0', `%r14') 50define(`w1', `%r15') 51 52ifdef(`OPERATION_add_err1_n', ` 53 define(ADCSBB, adc) 54 define(func, mpn_add_err1_n)') 55ifdef(`OPERATION_sub_err1_n', ` 56 define(ADCSBB, sbb) 57 define(func, mpn_sub_err1_n)') 58 59MULFUNC_PROLOGUE(mpn_add_err1_n mpn_sub_err1_n) 60 61 62ASM_START() 63 TEXT 64 ALIGN(16) 65PROLOGUE(func) 66 mov cy_param, %rax 67 68 push %rbx 69 push %rbp 70 push %r12 71 push %r13 72 push %r14 73 push %r15 74 75 lea (up,n,8), up 76 lea (vp,n,8), vp 77 lea (rp,n,8), rp 78 79 mov R32(n), R32(%r10) 80 and $3, R32(%r10) 81 jz L(0mod4) 82 cmp $2, R32(%r10) 83 jc L(1mod4) 84 jz L(2mod4) 85L(3mod4): 86 xor R32(el), R32(el) 87 xor R32(eh), R32(eh) 88 xor R32(t0), R32(t0) 89 xor R32(t1), R32(t1) 90 lea -24(yp,n,8), yp 91 neg n 92 93 shr $1, %al C restore carry 94 mov (up,n,8), w0 95 mov 8(up,n,8), w1 96 ADCSBB (vp,n,8), w0 97 mov w0, (rp,n,8) 98 cmovc 16(yp), el 99 ADCSBB 8(vp,n,8), w1 100 mov w1, 8(rp,n,8) 101 cmovc 8(yp), t0 102 mov 16(up,n,8), w0 103 ADCSBB 16(vp,n,8), w0 104 mov w0, 16(rp,n,8) 105 cmovc (yp), t1 106 setc %al C save carry 107 add t0, el 108 adc $0, eh 109 add t1, el 110 adc $0, eh 111 112 add $3, n 113 jnz L(loop) 114 jmp L(end) 115 116 ALIGN(16) 117L(0mod4): 118 xor R32(el), R32(el) 119 xor R32(eh), R32(eh) 120 lea (yp,n,8), yp 121 neg n 122 jmp L(loop) 123 124 ALIGN(16) 125L(1mod4): 126 xor R32(el), R32(el) 127 xor R32(eh), R32(eh) 128 lea -8(yp,n,8), yp 129 neg n 130 131 shr $1, %al C restore carry 132 mov (up,n,8), w0 133 ADCSBB (vp,n,8), w0 134 mov w0, (rp,n,8) 135 cmovc (yp), el 136 setc %al C save carry 137 138 add $1, n 139 jnz L(loop) 140 jmp L(end) 141 142 ALIGN(16) 143L(2mod4): 
144 xor R32(el), R32(el) 145 xor R32(eh), R32(eh) 146 xor R32(t0), R32(t0) 147 lea -16(yp,n,8), yp 148 neg n 149 150 shr $1, %al C restore carry 151 mov (up,n,8), w0 152 mov 8(up,n,8), w1 153 ADCSBB (vp,n,8), w0 154 mov w0, (rp,n,8) 155 cmovc 8(yp), el 156 ADCSBB 8(vp,n,8), w1 157 mov w1, 8(rp,n,8) 158 cmovc (yp), t0 159 setc %al C save carry 160 add t0, el 161 adc $0, eh 162 163 add $2, n 164 jnz L(loop) 165 jmp L(end) 166 167 ALIGN(32) 168L(loop): 169 shr $1, %al C restore carry 170 mov -8(yp), t0 171 mov $0, R32(t3) 172 mov (up,n,8), w0 173 mov 8(up,n,8), w1 174 ADCSBB (vp,n,8), w0 175 cmovnc t3, t0 176 ADCSBB 8(vp,n,8), w1 177 mov -16(yp), t1 178 mov w0, (rp,n,8) 179 mov 16(up,n,8), w0 180 mov w1, 8(rp,n,8) 181 cmovnc t3, t1 182 mov -24(yp), t2 183 ADCSBB 16(vp,n,8), w0 184 cmovnc t3, t2 185 mov 24(up,n,8), w1 186 ADCSBB 24(vp,n,8), w1 187 cmovc -32(yp), t3 188 setc %al C save carry 189 add t0, el 190 adc $0, eh 191 add t1, el 192 adc $0, eh 193 add t2, el 194 adc $0, eh 195 mov w0, 16(rp,n,8) 196 add t3, el 197 lea -32(yp), yp 198 adc $0, eh 199 mov w1, 24(rp,n,8) 200 add $4, n 201 jnz L(loop) 202 203L(end): 204 mov el, (ep) 205 mov eh, 8(ep) 206 207 pop %r15 208 pop %r14 209 pop %r13 210 pop %r12 211 pop %rbp 212 pop %rbx 213 ret 214EPILOGUE() 215