dnl  x86-64 mpn_divrem_2 -- Divide an mpn number by a normalized 2-limb number.

dnl  Copyright 2007, 2008, 2010 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C               norm    frac
C K8             20      20
C P4             73      73
C P6 core2       37      37
C P6 corei7      33      33

C TODO
C  * Perhaps compute the inverse without relying on divq?  Could either use
C    Newton's method and mulq, or perhaps the faster fdiv.
C  * The loop has not been carefully tuned, nor analysed for critical path
C    length.  It seems that 20 c/l is a bit long, compared to the 13 c/l for
C    mpn_divrem_1.
C  * Clean up.  This code is really crude.
C INPUT PARAMETERS
C Symbolic names for the argument registers.  The code below uses the raw
C register names rather than these macros.
C NOTE(review): dinv is given as r9, which matches the NEW loop variant only;
C the default loop keeps the inverse in rdi.
define(`qp', `%rdi')
define(`fn', `%rsi')
define(`up_param', `%rdx')
define(`un_param', `%rcx')
define(`dp', `%r8')

define(`dinv', `%r9')


C Register map retained from the original source.  The column alignment that
C tied each role to a register is not recoverable here, so read it together
C with the per-instruction comments below.
C   rax rbx rcx rdx rsi rdi rbp r8 r9 r10 r11 r12 r13 r14 r15
C   cnt qp d dinv

C ---------------------------------------------------------------------------
C mpn_divrem_2 -- divide the limb vector at up by the 2-limb divisor at dp.
C
C ABI:    arguments are read from rdi, rsi, rdx, rcx, r8, which is the
C         System V AMD64 integer argument order.  rbx, rbp and r12-r15 are
C         pushed on entry and popped before returning.
C In:     rdi = qp   quotient destination
C         rsi = fn   number of extra low quotient limbs developed with zero
C                    dividend limbs fed in
C         rdx = up   dividend, overwritten in its two low limbs
C         rcx = un   number of dividend limbs
C         r8  = dp   divisor, dp[0] low limb d0 and dp[1] high limb d1
C Out:    rax = most significant quotient limb, always 0 or 1
C         qp[0 .. un+fn-3] receive the remaining quotient limbs
C         up[0], up[1] receive the remainder, low limb first
C Clobb:  rcx, rdx, rsi, rdi, r8-r11, flags
C Pre:    the top bit of dp[1] must be set.  The inverse is formed with a
C         div whose high dividend limb is the complement of d1, and that div
C         overflows unless the complement is smaller than d1.
C         un is presumably at least 2, since up[un-1] and up[un-2] are read
C         unconditionally -- confirm against the callers.
C
C The trailing numbers in the loop comments are the cycle estimates carried
C over from the original source; ncp presumably stands for not on the
C critical path.
C ---------------------------------------------------------------------------

ASM_START()
        TEXT
        ALIGN(16)
PROLOGUE(mpn_divrem_2)

        push    %r15
        lea     (%rdx,%rcx,8), %rax     C rax = up + 8*un, one past the top limb
        push    %r14
        push    %r13
        mov     %rsi, %r13              C r13 = fn
        push    %r12
        lea     -24(%rax), %r12         C r12 = address of up[un-3], next limb to fetch
        push    %rbp
        mov     %rdi, %rbp              C rbp = qp
        push    %rbx
        mov     8(%r8), %r11            C r11 = d1
        mov     -8(%rax), %r9           C r9  = up[un-1]
        mov     (%r8), %r8              C r8  = d0
        mov     -16(%rax), %r10         C r10 = up[un-2]
        xor     R32(%r15), R32(%r15)    C r15 = 0, the return value so far
        cmp     %r9, %r11               C d1 against the top dividend limb
        ja      L(2)                    C d1 is larger: no initial subtraction
        setb    %dl                     C dl = d1 below top limb, flags still from the cmp
        cmp     %r10, %r8
        setbe   %al                     C al = d0 not above up[un-2]
        orb     %al, %dl
        jne     L(23)                   C top two limbs >= divisor: subtract it once
L(2):
        lea     -3(%rcx,%r13), %rbx     C un + fn - 3
        test    %rbx, %rbx
        js      L(6)                    C negative: no further quotient limbs
        C Form the inverse di.  Start from the one-limb quotient of
        C (complement of d1):(all ones) by d1, then lower it to account for d0.
        mov     %r11, %rdx
        mov     $-1, %rax
        not     %rdx
        div     %r11
        mov     %r11, %rdx
        mov     %rax, %rdi              C rdi = di candidate
        imul    %rax, %rdx              C low limb of d1 * di
        mov     %rdx, %r14
        mul     %r8                     C rdx:rax = di * d0, rax still holds di
        mov     %rdx, %rcx              C keep the high limb; un is dead from here
        mov     $-1, %rdx               C rdx:r14 is a two-limb accumulator
        add     %r8, %r14
        adc     $0, %rdx
        add     %rcx, %r14
        adc     $0, %rdx
        js      L(8)                    C accumulator already negative: di is final
L(18):
        dec     %rdi                    C di--
        sub     %r11, %r14
        sbb     $0, %rdx
        jns     L(18)                   C repeat until the accumulator turns negative
L(8):

C Second register map retained from the original source, column alignment
C likewise lost.
C   rax rbx rcx rdx rsi rdi rbp r8 r9 r10 r11 r12 r13 r14 r15
C   n2 un n1 dinv qp d0 d1 up fn msl
C   n2 un -d1 n1 dinv XX XX
C
C State on arrival at this point, as set up by the code above:
C   rbx = un+fn-3   rdi = di   rbp = qp   r8 = d0   r11 = d1
C   r9:r10 = current two-limb partial remainder, high limb in r9
C   r12 = next dividend limb to fetch   r13 = fn   r15 = return value

ifdef(`NEW',`
        C Variant selected when NEW is defined.
        C Loop registers: rbx = high remainder limb, r14 = low remainder limb,
        C r9 = di, rsi = -d1, rcx = quotient limb index, rdi = quotient limb.
        lea     (%rbp,%rbx,8), %rbp     C rbp = address of qp[un+fn-3]
        mov     %rbx, %rcx              C un
        mov     %r9, %rbx
        mov     %rdi, %r9               C di
        mov     %r10, %r14
        mov     %r11, %rsi
        neg     %rsi                    C -d1
        ALIGN(16)
L(loop):
        mov     %r9, %rax               C di            ncp
        mul     %rbx                    C               0, 18
        add     %r14, %rax              C               4
        mov     %rax, %r10              C q0            5
        adc     %rbx, %rdx              C               5
        mov     %rdx, %rdi              C q             6
        imul    %rsi, %rdx              C               6
        mov     %r8, %rax               C               ncp
        lea     (%rdx, %r14), %rbx      C n1 -= ...     7
        mul     %rdi                    C               7
        xor     R32(%r14), R32(%r14)    C next limb defaults to zero
        cmp     %rcx, %r13              C fn against the limb index
        jg      L(19)                   C index below fn: no dividend limb to load
        mov     (%r12), %r14            C fetch the next dividend limb
        sub     $8, %r12                C step down one limb
L(19):  sub     %r8, %r14               C               ncp
        sbb     %r11, %rbx              C               9
        sub     %rax, %r14              C               11
        sbb     %rdx, %rbx              C               12
        inc     %rdi                    C               7
        xor     R32(%rdx), R32(%rdx)    C zeroed before the cmp so its carry survives
        cmp     %r10, %rbx              C               13
        mov     %r8, %rax               C d0            ncp
        adc     $-1, %rdx               C mask          14
        add     %rdx, %rdi              C q--           15
        and     %rdx, %rax              C d0 or 0       15
        and     %r11, %rdx              C d1 or 0       15
        add     %rax, %r14              C               16
        adc     %rdx, %rbx              C               16
        cmp     %r11, %rbx              C               17
        jae     L(fix)                  C high remainder limb not below d1
L(bck): mov     %rdi, (%rbp)            C store the quotient limb
        sub     $8, %rbp                C next lower quotient position
        dec     %rcx
        jns     L(loop)

        mov     %r14, %r10              C hand the remainder to the exit code
        mov     %rbx, %r9
',`
        C Default variant, used when NEW is not defined.
        C Loop registers: rax = high remainder limb at the loop head,
        C rsi = low remainder limb, rdi = di, rcx = quotient limb index,
        C r10 = quotient limb being formed.
        lea     (%rbp,%rbx,8), %rbp     C rbp = address of qp[un+fn-3]
        mov     %rbx, %rcx              C rcx = quotient limb index
        mov     %r9, %rax
        mov     %r10, %rsi
        ALIGN(16)
L(loop):
        mov     %rax, %r14              C               0, 19
        mul     %rdi                    C               0
        mov     %r11, %r9               C               1
        add     %rsi, %rax              C               4
        mov     %rax, %rbx              C q0            5
        adc     %r14, %rdx              C q             5
        lea     1(%rdx), %r10           C               6
        mov     %rdx, %rax              C               6
        imul    %rdx, %r9               C               6
        sub     %r9, %rsi               C               10
        xor     R32(%r9), R32(%r9)      C next limb defaults to zero
        mul     %r8                     C               7
        cmp     %rcx, %r13              C fn against the limb index
        jg      L(13)                   C index below fn: no dividend limb to load
        mov     (%r12), %r9             C fetch the next dividend limb
        sub     $8, %r12                C step down one limb
L(13):  sub     %r8, %r9                C               ncp
        sbb     %r11, %rsi              C               11
        sub     %rax, %r9               C               11
        sbb     %rdx, %rsi              C               12
        cmp     %rbx, %rsi              C               13
        sbb     %rax, %rax              C               14
        not     %rax                    C               15
        add     %rax, %r10              C               16
        mov     %r8, %rbx               C               ncp
        and     %rax, %rbx              C               16
        and     %r11, %rax              C               16
        add     %rbx, %r9               C               17
        adc     %rsi, %rax              C               18
        cmp     %rax, %r11              C               19
        jbe     L(fix)                  C high remainder limb not below d1
L(bck): mov     %r10, (%rbp)            C store the quotient limb
        sub     $8, %rbp                C next lower quotient position
        mov     %r9, %rsi               C               18
        dec     %rcx
        jns     L(loop)

        mov     %rsi, %r10              C hand the remainder to the exit code
        mov     %rax, %r9
')
L(6):
        C r12 has stepped down once per dividend limb consumed, so with
        C un >= 2 these two stores land on up[0] and up[1].
        mov     %r10, 8(%r12)           C low remainder limb
        mov     %r9, 16(%r12)           C high remainder limb
        pop     %rbx
        pop     %rbp
        pop     %r12
        pop     %r13
        pop     %r14
        mov     %r15, %rax              C return the most significant quotient limb
        pop     %r15
        ret

C Top two dividend limbs are not below the divisor: the most significant
C quotient limb is 1, and the divisor is subtracted once before the loop.
L(23):  inc     R32(%r15)
        sub     %r8, %r10
        sbb     %r11, %r9
        jmp     L(2)

C Out-of-line correction, entered when the high remainder limb is not below
C d1.  If the two-limb remainder is still at least the divisor, the quotient
C limb is bumped and the divisor subtracted once more.
ifdef(`NEW',`
L(fix): seta    %dl                     C dl = high limb above d1, flags from the loop cmp
        cmp     %r8, %r14
        setae   %al                     C al = low limb not below d0
        orb     %dl, %al
        je      L(bck)                  C neither holds: remainder is already reduced
        inc     %rdi
        sub     %r8, %r14
        sbb     %r11, %rbx
        jmp     L(bck)
',`
L(fix): jb      L(88)                   C d1 below high limb: must correct
        cmp     %r8, %r9
        jb      L(bck)                  C high limbs equal, low limb below d0: done
L(88):  inc     %r10
        sub     %r8, %r9
        sbb     %r11, %rax
        jmp     L(bck)
')
EPILOGUE()