dnl  aors_err1_n.asm revision 1.1.1.2
dnl  AMD64 mpn_add_err1_n, mpn_sub_err1_n

dnl  Contributed by David Harvey.

dnl  Copyright 2011 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C AMD K8,K9	 2.75 (degenerates to 3 c/l for some alignments)
C AMD K10	 ?
C Intel P4	 ?
C Intel core2	 ?
C Intel corei	 ?
C Intel atom	 ?
C VIA nano	 ?


C INPUT PARAMETERS
C
C The six register arguments follow the System V AMD64 integer argument
C order (rdi, rsi, rdx, rcx, r8, r9); the seventh argument is read from
C the stack slot just above the return address.
C
C   rp        destination, receives n result limbs
C   up, vp    source operands, n limbs each; the operation is up ADCSBB vp
C   ep        receives the two-limb accumulator: el at 0(ep), eh at 8(ep)
C   yp        n limbs, consumed from the top down: whenever limb i of the
C             add/subtract produces a carry/borrow out, limb yp[n-1-i] is
C             added into the el:eh accumulator
C   n         limb count; n = 0 is not handled (the 0mod4 path would enter
C             the loop with a zero counter)
C   cy_param  incoming carry/borrow; only bit 0 of its low byte is used
C
C RETURN
C   %al holds the carry/borrow out of the top limb (written by setc).  The
C   upper bits of %rax are whatever was loaded from cy_param -- presumably
C   callers pass 0 or 1 so the result is a clean 0/1; confirm against callers.
define(`rp',	`%rdi')
define(`up',	`%rsi')
define(`vp',	`%rdx')
define(`ep',	`%rcx')
define(`yp',	`%r8')
define(`n',	`%r9')
define(`cy_param',	`8(%rsp)')

C WORKING REGISTERS
C   el, eh    low and high limb of the accumulator stored to ep at the end
C   t0..t3    yp limbs selected (or zeroed) by the carry of each step
C   w0, w1    limb currently being added/subtracted
C All of rbx, rbp, r12-r15 are pushed on entry and popped before ret.
define(`el',	`%rbx')
define(`eh',	`%rbp')
define(`t0',	`%r10')
define(`t1',	`%r11')
define(`t2',	`%r12')
define(`t3',	`%r13')
define(`w0',	`%r14')
define(`w1',	`%r15')

ifdef(`OPERATION_add_err1_n', `
	define(ADCSBB,	      adc)
	define(func,	      mpn_add_err1_n)')
ifdef(`OPERATION_sub_err1_n', `
	define(ADCSBB,	      sbb)
	define(func,	      mpn_sub_err1_n)')

MULFUNC_PROLOGUE(mpn_add_err1_n mpn_sub_err1_n)


ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(func)
	C Fetch cy now: the 8(%rsp) offset is only valid before the pushes
	C below move %rsp.
	mov	cy_param, %rax

	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	C Point up/vp/rp one past their last limb; after n is negated below,
	C (ptr,n,8) addresses limb 0 and n counts up towards zero.
	lea	(up,n,8), up
	lea	(vp,n,8), vp
	lea	(rp,n,8), rp

	C Dispatch on n mod 4.  The main loop handles four limbs per pass, so
	C the 1, 2 or 3 leftover limbs are handled first by a dedicated path.
	mov	R32(n), R32(%r10)
	and	$3, R32(%r10)
	jz	L(0mod4)
	cmp	$2, R32(%r10)
	jc	L(1mod4)		C below 2, so remainder is 1
	jz	L(2mod4)		C equal to 2; fall through for 3
L(3mod4):
	xor	R32(el), R32(el)
	xor	R32(eh), R32(eh)
	xor	R32(t0), R32(t0)
	xor	R32(t1), R32(t1)
	lea	-24(yp,n,8), yp		C yp now addresses original yp[n-3]
	neg	n

	shr	$1, %al		   C restore carry
	mov	(up,n,8), w0
	mov	8(up,n,8), w1
	ADCSBB	(vp,n,8), w0
	mov	w0, (rp,n,8)
	cmovc	16(yp), el
	ADCSBB	8(vp,n,8), w1
	mov	w1, 8(rp,n,8)
	cmovc	8(yp), t0
	mov	16(up,n,8), w0
	ADCSBB	16(vp,n,8), w0
	mov	w0, 16(rp,n,8)
	cmovc	(yp), t1
	setc	%al		   C save carry
	add	t0, el
	adc	$0, eh
	add	t1, el
	adc	$0, eh

	add	$3, n
	jnz	L(loop)
	jmp	L(end)

	ALIGN(16)
L(0mod4):
	xor	R32(el), R32(el)
	xor	R32(eh), R32(eh)
	lea	(yp,n,8), yp		C yp now one past original yp[n-1]
	neg	n
	jmp	L(loop)

	ALIGN(16)
L(1mod4):
	xor	R32(el), R32(el)
	xor	R32(eh), R32(eh)
	lea	-8(yp,n,8), yp		C yp now addresses original yp[n-1]
	neg	n

	shr	$1, %al		   C restore carry
	mov	(up,n,8), w0
	ADCSBB	(vp,n,8), w0
	mov	w0, (rp,n,8)
	cmovc	(yp), el
	setc	%al		   C save carry

	add	$1, n
	jnz	L(loop)
	jmp	L(end)

	ALIGN(16)
L(2mod4):
	xor	R32(el), R32(el)
	xor	R32(eh), R32(eh)
	xor	R32(t0), R32(t0)
	lea	-16(yp,n,8), yp		C yp now addresses original yp[n-2]
	neg	n

	shr	$1, %al		   C restore carry
	mov	(up,n,8), w0
	mov	8(up,n,8), w1
	ADCSBB	(vp,n,8), w0
	mov	w0, (rp,n,8)
	cmovc	8(yp), el
	ADCSBB	8(vp,n,8), w1
	mov	w1, 8(rp,n,8)
	cmovc	(yp), t0
	setc	%al		   C save carry
	add	t0, el
	adc	$0, eh

	add	$2, n
	jnz	L(loop)
	jmp	L(end)

	C Main loop: four limbs per pass.  On entry yp points one past the next
	C yp limb to consume and %al bit 0 holds the carry between passes.  Only
	C mov, lea, cmov and setc sit between the ADCSBB steps, none of which
	C write the flags, so CF chains through all four limbs.
	ALIGN(32)
L(loop):
	shr	$1, %al		   C restore carry
	mov	-8(yp), t0
	mov	$0, R32(t3)	   C mov, not xor: CF must survive
	mov	(up,n,8), w0
	mov	8(up,n,8), w1
	ADCSBB	(vp,n,8), w0
	cmovnc	t3, t0		   C no carry out: contribute zero
	ADCSBB	8(vp,n,8), w1
	mov	-16(yp), t1
	mov	w0, (rp,n,8)
	mov	16(up,n,8), w0
	mov	w1, 8(rp,n,8)
	cmovnc	t3, t1
	mov	-24(yp), t2
	ADCSBB	16(vp,n,8), w0
	cmovnc	t3, t2
	mov	24(up,n,8), w1
	ADCSBB	24(vp,n,8), w1
	cmovc	-32(yp), t3	   C t3 was zero, so it stays zero on no carry
	setc	%al		   C save carry
	add	t0, el
	adc	$0, eh
	add	t1, el
	adc	$0, eh
	add	t2, el
	adc	$0, eh
	mov	w0, 16(rp,n,8)
	add	t3, el
	lea	-32(yp), yp	   C lea leaves CF from the add intact
	adc	$0, eh
	mov	w1, 24(rp,n,8)
	add	$4, n
	jnz	L(loop)

L(end):
	C Store the two-limb accumulator; %al already holds the carry out.
	mov	el, (ep)
	mov	eh, 8(ep)

	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	ret
EPILOGUE()