dnl  AMD64 mpn_redc_1 -- Montgomery reduction with a one-limb modular inverse.

dnl  Copyright 2004, 2008 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C	     cycles/limb
C K8,K9:	 2.5
C K10:		 2.5
C P4:		 ?
C P6-15 (Core2): 5.3
C P6-28 (Atom):	 ?

C TODO
C  * Handle certain sizes, e.g., 1, 2, 3, 4, 8, with single-loop code.
C    The code for 1, 2, 3, 4 should perhaps be completely register based.
C  * Perhaps align outer loops.
C  * The sub_n at the end leaks side-channel data.  How do we fix that?
C  * Write mpn_add_n_sub_n computing R = A + B - C.  It should run at 2 c/l.
C  * We could software pipeline the IMUL stuff, by putting it before the
C    outer loops and before the end of the outer loops.  The last outer
C    loop iteration would then compute an unneeded product, but it is at
C    least not a stray read from up[], since it is at up[n].
C  * Can we combine both the add_n and sub_n into the loops, somehow?

C INPUT PARAMETERS
define(`rp',	  `%rdi')
define(`up',	  `%rsi')
define(`param_mp',`%rdx')
define(`n',	  `%rcx')
define(`invm',	  `%r8')

define(`mp',	  `%r13')
define(`i',	  `%r11')
define(`nneg',	  `%r12')
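C For orientation, the loops below are the assembly counterpart of this C
C sketch (an illustrative rendering of Montgomery's REDC, modelled on GMP's
C generic mpn/generic/redc_1.c; B denotes the limb base 2^64, invm is
C -1/mp[0] mod B, up must hold 2n limbs and is clobbered):
C
C   void
C   mpn_redc_1 (mp_ptr rp, mp_ptr up, mp_srcptr mp, mp_size_t n,
C               mp_limb_t invm)
C   {
C     mp_size_t j;
C     mp_limb_t cy, q;
C
C     for (j = n; j > 0; j--)
C       {
C         q = up[0] * invm;                 /* (up[0] + q*mp[0]) % B == 0 */
C         cy = mpn_addmul_1 (up, mp, n, q); /* up[0] becomes exactly 0 */
C         up[0] = cy;                       /* stash the carry limb */
C         up++;
C       }
C     cy = mpn_add_n (rp, up, up - n, n);   /* add the n stashed carries */
C     if (cy != 0)
C       mpn_sub_n (rp, rp, mp, n);          /* final conditional reduction */
C   }
C
C The unrolled code fuses the addmul_1 and the carry stashing, dispatching
C on n mod 4 into four variants of the same loop.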
ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_redc_1)
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	n
	sub	$8, %rsp		C maintain ABI required rsp alignment

	lea	(param_mp,n,8), mp	C mp += n
	lea	(up,n,8), up		C up += n

	mov	n, nneg
	neg	nneg

	mov	R32(n), R32(%rax)
	and	$3, R32(%rax)
	jz	L(b0)
	cmp	$2, R32(%rax)
	jz	L(b2)
	jg	L(b3)

L(b1):	C lea	(mp), mp
	lea	-16(up), up
L(o1):	mov	nneg, i
	mov	16(up,nneg,8), %rbp	C up[0]
	imul	invm, %rbp

	mov	(mp,i,8), %rax
	xor	%ebx, %ebx
	mul	%rbp
	add	$1, i
	jnz	1f
	add	%rax, 8(up,i,8)
	adc	$0, %rdx
	mov	%rdx, %r14
	jmp	L(n1)

1:	mov	%rax, %r9
	mov	(mp,i,8), %rax
	mov	%rdx, %r14
	jmp	L(mi1)

	ALIGN(16)
L(lo1):	add	%r10, (up,i,8)
	adc	%rax, %r9
	mov	(mp,i,8), %rax
	adc	%rdx, %r14
L(mi1):	xor	%r10d, %r10d
	mul	%rbp
	add	%r9, 8(up,i,8)
	adc	%rax, %r14
	adc	%rdx, %rbx
	mov	8(mp,i,8), %rax
	mul	%rbp
	add	%r14, 16(up,i,8)
	adc	%rax, %rbx
	adc	%rdx, %r10
	mov	16(mp,i,8), %rax
	mul	%rbp
	xor	%r9d, %r9d
	xor	%r14d, %r14d
	add	%rbx, 24(up,i,8)
	adc	%rax, %r10
	mov	24(mp,i,8), %rax
	adc	%rdx, %r9
	xor	%ebx, %ebx
	mul	%rbp
	add	$4, i
	js	L(lo1)
L(ed1):	add	%r10, (up)
	adc	%rax, %r9
	adc	%rdx, %r14
	xor	%r10d, %r10d
	add	%r9, 8(up)
	adc	$0, %r14
L(n1):	mov	%r14, 16(up,nneg,8)	C up[0]
	add	$8, up
	dec	n
	jnz	L(o1)
C	lea	(mp), mp
	lea	16(up), up
	jmp	L(common)

L(b0):	C lea	(mp), mp
	lea	-16(up), up
L(o0):	mov	nneg, i
	mov	16(up,nneg,8), %rbp	C up[0]
	imul	invm, %rbp

	mov	(mp,i,8), %rax
	xor	%r10d, %r10d
	mul	%rbp
	mov	%rax, %r14
	mov	%rdx, %rbx
	jmp	L(mi0)

	ALIGN(16)
L(lo0):	add	%r10, (up,i,8)
	adc	%rax, %r9
	mov	(mp,i,8), %rax
	adc	%rdx, %r14
	xor	%r10d, %r10d
	mul	%rbp
	add	%r9, 8(up,i,8)
	adc	%rax, %r14
	adc	%rdx, %rbx
L(mi0):	mov	8(mp,i,8), %rax
	mul	%rbp
	add	%r14, 16(up,i,8)
	adc	%rax, %rbx
	adc	%rdx, %r10
	mov	16(mp,i,8), %rax
	mul	%rbp
	xor	%r9d, %r9d
	xor	%r14d, %r14d
	add	%rbx, 24(up,i,8)
	adc	%rax, %r10
	mov	24(mp,i,8), %rax
	adc	%rdx, %r9
	xor	%ebx, %ebx
	mul	%rbp
	add	$4, i
	js	L(lo0)
L(ed0):	add	%r10, (up)
	adc	%rax, %r9
	adc	%rdx, %r14
	xor	%r10d, %r10d
	add	%r9, 8(up)
	adc	$0, %r14
	mov	%r14, 16(up,nneg,8)	C up[0]
	add	$8, up
	dec	n
	jnz	L(o0)
C	lea	(mp), mp
	lea	16(up), up
	jmp	L(common)


L(b3):	lea	-8(mp), mp
	lea	-24(up), up
L(o3):	mov	nneg, i
	mov	24(up,nneg,8), %rbp	C up[0]
	imul	invm, %rbp

	mov	8(mp,i,8), %rax
	mul	%rbp
	mov	%rax, %rbx
	mov	%rdx, %r10
	jmp	L(mi3)

	ALIGN(16)
L(lo3):	add	%r10, (up,i,8)
	adc	%rax, %r9
	mov	(mp,i,8), %rax
	adc	%rdx, %r14
	xor	%r10d, %r10d
	mul	%rbp
	add	%r9, 8(up,i,8)
	adc	%rax, %r14
	adc	%rdx, %rbx
	mov	8(mp,i,8), %rax
	mul	%rbp
	add	%r14, 16(up,i,8)
	adc	%rax, %rbx
	adc	%rdx, %r10
L(mi3):	mov	16(mp,i,8), %rax
	mul	%rbp
	xor	%r9d, %r9d
	xor	%r14d, %r14d
	add	%rbx, 24(up,i,8)
	adc	%rax, %r10
	mov	24(mp,i,8), %rax
	adc	%rdx, %r9
	xor	%ebx, %ebx
	mul	%rbp
	add	$4, i
	js	L(lo3)
L(ed3):	add	%r10, 8(up)
	adc	%rax, %r9
	adc	%rdx, %r14
	xor	%r10d, %r10d
	add	%r9, 16(up)
	adc	$0, %r14
	mov	%r14, 24(up,nneg,8)	C up[0]
	add	$8, up
	dec	n
	jnz	L(o3)
	lea	8(mp), mp
	lea	24(up), up
	jmp	L(common)

L(b2):	lea	-16(mp), mp
	lea	-32(up), up
L(o2):	mov	nneg, i
	mov	32(up,nneg,8), %rbp	C up[0]
	imul	invm, %rbp

	mov	16(mp,i,8), %rax
	mul	%rbp
	xor	%r14d, %r14d
	mov	%rax, %r10
	mov	24(mp,i,8), %rax
	mov	%rdx, %r9
	jmp	L(mi2)

	ALIGN(16)
L(lo2):	add	%r10, (up,i,8)
	adc	%rax, %r9
	mov	(mp,i,8), %rax
	adc	%rdx, %r14
	xor	%r10d, %r10d
	mul	%rbp
	add	%r9, 8(up,i,8)
	adc	%rax, %r14
	adc	%rdx, %rbx
	mov	8(mp,i,8), %rax
	mul	%rbp
	add	%r14, 16(up,i,8)
	adc	%rax, %rbx
	adc	%rdx, %r10
	mov	16(mp,i,8), %rax
	mul	%rbp
	xor	%r9d, %r9d
	xor	%r14d, %r14d
	add	%rbx, 24(up,i,8)
	adc	%rax, %r10
	mov	24(mp,i,8), %rax
	adc	%rdx, %r9
L(mi2):	xor	%ebx, %ebx
	mul	%rbp
	add	$4, i
	js	L(lo2)
L(ed2):	add	%r10, 16(up)
	adc	%rax, %r9
	adc	%rdx, %r14
	xor	%r10d, %r10d
	add	%r9, 24(up)
	adc	$0, %r14
	mov	%r14, 32(up,nneg,8)	C up[0]
	add	$8, up
	dec	n
	jnz	L(o2)
	lea	16(mp), mp
	lea	32(up), up


L(common):
	lea	(mp,nneg,8), mp		C restore entry mp

C	cy = mpn_add_n (rp, up, up - n, n);
C	                rdi rsi  rdx    rcx
	lea	(up,nneg,8), up		C up -= n
	lea	(up,nneg,8), %rdx	C rdx = up - n [up entry value]
	mov	rp, nneg		C preserve rp over first call
	mov	8(%rsp), %rcx		C pass entry n
C	mov	rp, %rdi
	CALL(	mpn_add_n)
	test	R32(%rax), R32(%rax)
	jz	L(ret)

C	mpn_sub_n (rp, rp, mp, n);
C	           rdi rsi rdx rcx
	mov	nneg, %rdi
	mov	nneg, %rsi
	mov	mp, %rdx
	mov	8(%rsp), %rcx		C pass entry n
	CALL(	mpn_sub_n)

L(ret):
	add	$8, %rsp
	pop	n			C just increment rsp
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbx
	pop	%rbp
	ret
EPILOGUE()
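C Illustrative caller (hypothetical code, not part of this file; mpn_redc_1
C is a GMP-internal routine): Montgomery multiplication of n-limb residues
C a and b modulo m, where tp is a 2n-limb scratch area that gets clobbered
C and invm was precomputed as -1/m[0] mod B, e.g., with the binvert_limb
C macro from gmp-impl.h:
C
C   mpn_mul_n (tp, a, b, n);          /* tp[0..2n-1] = a * b */
C   mpn_redc_1 (rp, tp, m, n, invm);  /* rp = a * b * B^-n mod m */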