dnl  AMD64 mpn_mod_1_1p

dnl  Contributed to the GNU project by Torbjörn Granlund and Niels Möller.

dnl  Copyright 2009-2012, 2014 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C AMD K8,K9	 6
C AMD K10	 6
C Intel P4	26
C Intel core2	12.5
C Intel NHM	11.3
C Intel SBR	 8.4	(slowdown, old code took 8.0)
C Intel atom	26
C VIA nano	13

define(`B2mb',   `%r10')
define(`B2modb', `%r11')
define(`ap',     `%rdi')
define(`n',      `%rsi')
define(`pre',    `%r8')
define(`b',      `%rbx')

define(`r0', `%rbp') C r1 kept in %rax
define(`r2', `%rcx') C kept negated. Also used as shift count
define(`t0', `%r9')

C mp_limb_t
C mpn_mod_1_1p (mp_srcptr ap, mp_size_t n, mp_limb_t b, mp_limb_t bmodb[4])
C               %rdi         %rsi         %rdx         %rcx
C The pre array contains bi, cnt, B1modb, B2modb
C Note: This implementation needs B1modb only when cnt > 0

C The iteration is almost as follows,
C
C	r_2 B^3 + r_1 B^2 + r_0 B + u = r_1 B2modb + (r_0 + r_2 B2modb) B + u
C
C where r_2 is a single bit represented as a mask.  But to make sure that the
C result fits in two limbs and a bit, carry from the addition
C
C	r_0 + r_2 B2modb
C
C is handled specially.  On carry, we subtract b to cancel the carry,
C and we use instead the value
C
C	r_0 + B2mb (mod B)
C
C This addition can be issued early since it doesn't depend on r2, and it is
C the source of the cmov in the loop.
C
C We have the invariant that r_2 B^2 + r_1 B + r_0 < B^2 + B b
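C
C As an illustration only, a rough C equivalent of one loop iteration
C (assuming 64-bit limbs and a double-limb type; u is the next limb
C read from ap, r2 is shown as a 0/1 flag rather than a mask, and the
C variable names are ours, chosen to mirror the registers above):
C
C	uint64_t t = r0 + (r2 ? B2modb : 0);
C	if (t < r0)			/* carry out of the addition */
C	  t = r0 + B2modb - b;		/* i.e., r_0 + B2mb (mod B)  */
C	unsigned __int128 p = (unsigned __int128) r1 * B2modb;
C	r0 = u + (uint64_t) p;		/* new low limb              */
C	unsigned c = r0 < (uint64_t) p;	/* carry from the low add    */
C	r1 = t + (uint64_t) (p >> 64);
C	r2 = r1 < t;			/* carry out of the high add */
C	r1 += c;
C	r2 |= r1 < c;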

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_mod_1_1p)
	FUNC_ENTRY(4)
	push	%rbp
	push	%rbx
	mov	%rdx, b
	mov	%rcx, pre

	mov	-8(ap, n, 8), %rax
	cmp	$3, n
	jnc	L(first)
	mov	-16(ap, n, 8), r0
	jmp	L(reduce_two)

L(first):
	C First iteration, no r2
	mov	24(pre), B2modb
	mul	B2modb
	mov	-24(ap, n, 8), r0
	add	%rax, r0
	mov	-16(ap, n, 8), %rax
	adc	%rdx, %rax
	sbb	r2, r2
	sub	$4, n
	jc	L(reduce_three)

	mov	B2modb, B2mb
	sub	b, B2mb

	ALIGN(16)
L(top):	and	B2modb, r2
	lea	(B2mb, r0), t0
	mul	B2modb
	add	r0, r2
	mov	(ap, n, 8), r0
	cmovc	t0, r2
	add	%rax, r0
	mov	r2, %rax
	adc	%rdx, %rax
	sbb	r2, r2
	sub	$1, n
	jnc	L(top)

L(reduce_three):
	C Eliminate r2
	and	b, r2
	sub	r2, %rax

L(reduce_two):
	mov	8(pre), R32(%rcx)
	test	R32(%rcx), R32(%rcx)
	jz	L(normalized)

	C Unnormalized, use B1modb to reduce to size < B (b+1)
	mulq	16(pre)
	xor	t0, t0
	add	%rax, r0
	adc	%rdx, t0
	mov	t0, %rax

	C Left-shift to normalize
ifdef(`SHLD_SLOW',`
	shl	R8(%rcx), %rax
	mov	r0, t0
	neg	R32(%rcx)
	shr	R8(%rcx), t0
	or	t0, %rax
	neg	R32(%rcx)
',`
	shld	R8(%rcx), r0, %rax
')
	shl	R8(%rcx), r0
	jmp	L(udiv)

L(normalized):
	mov	%rax, t0
	sub	b, t0
	cmovnc	t0, %rax

L(udiv):
	lea	1(%rax), t0
	mulq	(pre)
	add	r0, %rax
	adc	t0, %rdx
	imul	b, %rdx
	sub	%rdx, r0
	cmp	r0, %rax
	lea	(b, r0), %rax
	cmovnc	r0, %rax
	cmp	b, %rax
	jnc	L(fix)
L(ok):	shr	R8(%rcx), %rax

	pop	%rbx
	pop	%rbp
	FUNC_EXIT()
	ret
L(fix):	sub	b, %rax
	jmp	L(ok)
EPILOGUE()

	ALIGN(16)
PROLOGUE(mpn_mod_1_1p_cps)
	FUNC_ENTRY(2)
	push	%rbp
	bsr	%rsi, %rcx
	push	%rbx
	mov	%rdi, %rbx
	push	%r12
	xor	$63, R32(%rcx)
	mov	%rsi, %r12
	mov	R32(%rcx), R32(%rbp)
	sal	R8(%rcx), %r12
IFSTD(`	mov	%r12, %rdi	')	C pass parameter
IFDOS(`	mov	%r12, %rcx	')	C pass parameter
IFDOS(`	sub	$32, %rsp	')
	ASSERT(nz, `test $15, %rsp')
	CALL(	mpn_invert_limb)
IFDOS(`	add	$32, %rsp	')
	neg	%r12
	mov	%r12, %r8
	mov	%rax, (%rbx)		C store bi
	mov	%rbp, 8(%rbx)		C store cnt
	imul	%rax, %r12
	mov	%r12, 24(%rbx)		C store B2modb
	mov	R32(%rbp), R32(%rcx)
	test	R32(%rcx), R32(%rcx)
	jz	L(z)

	mov	$1, R32(%rdx)
ifdef(`SHLD_SLOW',`
	C Destroys %rax, unlike shld.  Otherwise, we could do B1modb
	C before B2modb, and get rid of the move %r12, %r8 above.

	shl	R8(%rcx), %rdx
	neg	R32(%rcx)
	shr	R8(%rcx), %rax
	or	%rax, %rdx
	neg	R32(%rcx)
',`
	shld	R8(%rcx), %rax, %rdx
')
	imul	%rdx, %r8
	shr	R8(%rcx), %r8
	mov	%r8, 16(%rbx)		C store B1modb
L(z):
	pop	%r12
	pop	%rbx
	pop	%rbp
	FUNC_EXIT()
	ret
EPILOGUE()
ASM_END()
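
C Usage sketch (illustrative; the prototypes correspond to the
C mpn_mod_1_1p declarations in gmp-impl.h, and pre[] is the bmodb[4]
C array described near the top of this file).  Note that n must be at
C least 2, since the code above always reads two limbs up front:
C
C	mp_limb_t pre[4];		  /* bi, cnt, B1modb, B2modb */
C	mpn_mod_1_1p_cps (pre, b);	  /* precompute for divisor b */
C	r = mpn_mod_1_1p (ap, n, b, pre); /* r = {ap,n} mod b        */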