dnl  Alpha mpn_bdiv_dbm1c.

dnl  Copyright 2008 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C      cycles/limb
C EV4:     42
C EV5:     18
C EV6:      3

C TODO
C  * Try less unrolling, 2-way should give the same performance.
C  * Optimize feed-in and wind-down code, for speed, and perhaps further for
C    code size.
C  * This runs optimally given the algorithm, r8 is on a 3 operation recurrency
C    path.  We have not tried very hard to find a better algorithm.  Perhaps
C    it would be a good task for the GNU superoptimizer.
C INPUT PARAMETERS
C   rp   r16   destination limb pointer (target of every stq)
C   up   r17   source limb pointer (base of every ldq)
C   n    r18   limb count
C   bd   r19   multiplier applied to every source limb
C   cy   r20   initial accumulator value, copied into r8 on entry
C
C The instructions below name the registers directly, so these defines act
C purely as documentation of the argument registers.
define(`rp', `r16')
define(`up', `r17')
define(`n',  `r18')
define(`bd', `r19')
define(`cy', `r20')

C OPERATION
C With the accumulator h kept in r8 and p1:p0 being the high:low halves of
C up[i] * bd (umulh / mulq), each limb is processed as
C	borrow = h < p0			(cmpult)
C	h      = h - p0			(subq)
C	rp[i]  = h			(stq)
C	h      = h - (p1 + borrow)	(addq, subq)
C and the final h is returned in r0.
C
C REGISTER USE
C   r24		first source limb
C   r0-r3	source limbs in flight
C   r4-r7	low product halves
C   r20-r23	high product halves (r20 is free once it has been copied to r8)
C   r8		accumulator h
C   r28		n mod 4 during dispatch, afterwards p1 + borrow
C   r18		remaining limb count, biased by -4
C
C STRUCTURE
C The loop is unrolled four ways and software pipelined.  One of four feed-in
C sequences (b0..b3, chosen by n mod 4) primes the pipeline and either enters
C the loop at L0..L3 or, for short operands, jumps into the wind-down code
C cj1..cj7, which drains the products still pending without loading more.
C Feed-in code biases r16 so that the fixed store offsets used by the entry
C point it jumps to land on rp[0].
C
C NOTE(review): the first limb is loaded before n is examined, so this code
C presumably requires n >= 1 -- confirm against the callers.


ASM_START()
PROLOGUE(mpn_bdiv_dbm1c)
	mov	r20, r8

	ldq	r24, 0(r17)
	and	r18, 3, r28
	lda	r18, -4(r18)
	beq	r28, L(b0)
	cmpeq	r28, 1, r21
	bne	r21, L(b1)
	cmpeq	r28, 2, r21
	bne	r21, L(b2)


C n mod 4 == 3.  Falling through the bgt means n - 4 <= 0, i.e. n = 3.
L(b3):	ldq	r2, 8(r17)
	ldq	r3, 16(r17)
	bgt	r18, L(gt3)

	mulq	r24, r19, r5	C U1
	umulh	r24, r19, r21	C U1
	mulq	r2, r19, r6	C U1
	umulh	r2, r19, r22	C U1
	mulq	r3, r19, r7	C U1
	umulh	r3, r19, r23	C U1
	lda	r16, -32(r16)
	br	L(cj3)

C n >= 7: seven limbs are loaded here; enter the loop only if more remain.
L(gt3):	ldq	r0, 24(r17)
	mulq	r24, r19, r5	C U1
	umulh	r24, r19, r21	C U1
	ldq	r1, 32(r17)
	mulq	r2, r19, r6	C U1
	umulh	r2, r19, r22	C U1
	ldq	r2, 40(r17)
	mulq	r3, r19, r7	C U1
	umulh	r3, r19, r23	C U1
	ldq	r3, 48(r17)
	lda	r18, -4(r18)
	lda	r17, 56(r17)
	mulq	r0, r19, r4	C U1
	bgt	r18, L(L3)

	br	L(cj7)


C n mod 4 == 2.  Falling through the bgt means n = 2.
L(b2):	ldq	r3, 8(r17)
	bgt	r18, L(gt2)

	mulq	r24, r19, r6	C U1
	umulh	r24, r19, r22	C U1
	mulq	r3, r19, r7	C U1
	umulh	r3, r19, r23	C U1
	lda	r16, -40(r16)
	br	L(cj2)

C n >= 6: six limbs are loaded here; n = 6 finishes through cj6.
L(gt2):	ldq	r0, 16(r17)
	ldq	r1, 24(r17)
	mulq	r24, r19, r6	C U1
	umulh	r24, r19, r22	C U1
	ldq	r2, 32(r17)
	mulq	r3, r19, r7	C U1
	umulh	r3, r19, r23	C U1
	ldq	r3, 40(r17)
	lda	r18, -4(r18)
	lda	r17, 48(r17)
	mulq	r0, r19, r4	C U1
	umulh	r0, r19, r20	C U1
	lda	r16, -8(r16)
	bgt	r18, L(gt6)

	mulq	r1, r19, r5	C U1
	br	L(cj6)

L(gt6):	ldq	r0, 0(r17)
	mulq	r1, r19, r5	C U1
	br	L(L2)


C n mod 4 == 1.  Falling through the bgt means n = 1.
L(b1):	bgt	r18, L(gt1)

	mulq	r24, r19, r7	C U1
	umulh	r24, r19, r23	C U1
	lda	r16, -48(r16)
	br	L(cj1)

C n >= 5: five limbs are loaded here; n = 5 finishes through cj5.
L(gt1):	ldq	r0, 8(r17)
	ldq	r1, 16(r17)
	ldq	r2, 24(r17)
	mulq	r24, r19, r7	C U1
	umulh	r24, r19, r23	C U1
	ldq	r3, 32(r17)
	lda	r18, -4(r18)
	lda	r17, 40(r17)
	mulq	r0, r19, r4	C U1
	umulh	r0, r19, r20	C U1
	lda	r16, -16(r16)
	bgt	r18, L(gt5)

	mulq	r1, r19, r5	C U1
	umulh	r1, r19, r21	C U1
	mulq	r2, r19, r6	C U1
	br	L(cj5)

L(gt5):	ldq	r0, 0(r17)
	mulq	r1, r19, r5	C U1
	umulh	r1, r19, r21	C U1
	ldq	r1, 8(r17)
	mulq	r2, r19, r6	C U1
	br	L(L1)


C n mod 4 == 0.  Falling through the bgt means n = 4, finished through cj4.
L(b0):	ldq	r1, 8(r17)
	ldq	r2, 16(r17)
	ldq	r3, 24(r17)
	lda	r17, 32(r17)
	lda	r16, -24(r16)
	mulq	r24, r19, r4	C U1
	umulh	r24, r19, r20	C U1
	bgt	r18, L(gt4)

	mulq	r1, r19, r5	C U1
	umulh	r1, r19, r21	C U1
	mulq	r2, r19, r6	C U1
	umulh	r2, r19, r22	C U1
	mulq	r3, r19, r7	C U1
	br	L(cj4)

L(gt4):	ldq	r0, 0(r17)
	mulq	r1, r19, r5	C U1
	umulh	r1, r19, r21	C U1
	ldq	r1, 8(r17)
	mulq	r2, r19, r6	C U1
	umulh	r2, r19, r22	C U1
	ldq	r2, 16(r17)
	mulq	r3, r19, r7	C U1
	br	L(L0)

C Each quarter of the loop body retires one limb (cmpult/subq/addq/stq on r8)
C while issuing the multiplies and the load for limbs further ahead.  The
C subq that precedes each L0..L3 label applies the previous limb's
C p1 + borrow, so it is skipped when the loop is entered from feed-in code.
C *** MAIN LOOP START ***
	ALIGN(16)
L(top):	mulq	r0, r19, r4	C U1
	subq	r8, r28, r8
L(L3):	umulh	r0, r19, r20	C U1
	cmpult	r8, r5, r28
	ldq	r0, 0(r17)
	subq	r8, r5, r8
	addq	r21, r28, r28
	stq	r8, 0(r16)

	mulq	r1, r19, r5	C U1
	subq	r8, r28, r8
L(L2):	umulh	r1, r19, r21	C U1
	cmpult	r8, r6, r28
	ldq	r1, 8(r17)
	subq	r8, r6, r8
	addq	r22, r28, r28
	stq	r8, 8(r16)

	mulq	r2, r19, r6	C U1
	subq	r8, r28, r8
L(L1):	umulh	r2, r19, r22	C U1
	cmpult	r8, r7, r28
	ldq	r2, 16(r17)
	subq	r8, r7, r8
	addq	r23, r28, r28
	stq	r8, 16(r16)

	mulq	r3, r19, r7	C U1
	subq	r8, r28, r8
L(L0):	umulh	r3, r19, r23	C U1
	cmpult	r8, r4, r28
	ldq	r3, 24(r17)
	subq	r8, r4, r8
	addq	r20, r28, r28
	stq	r8, 24(r16)

	lda	r18, -4(r18)
	lda	r17, 32(r17)
	lda	r16, 32(r16)
	bgt	r18, L(top)
C *** MAIN LOOP END ***

C Wind-down: same per-limb steps as the loop body but without further loads.
C The suffix of each cjN label is the number of limbs still to be stored.
	mulq	r0, r19, r4	C U1
	subq	r8, r28, r8
L(cj7):	umulh	r0, r19, r20	C U1
	cmpult	r8, r5, r28
	subq	r8, r5, r8
	addq	r21, r28, r28
	stq	r8, 0(r16)
	mulq	r1, r19, r5	C U1
	subq	r8, r28, r8
L(cj6):	umulh	r1, r19, r21	C U1
	cmpult	r8, r6, r28
	subq	r8, r6, r8
	addq	r22, r28, r28
	stq	r8, 8(r16)
	mulq	r2, r19, r6	C U1
	subq	r8, r28, r8
L(cj5):	umulh	r2, r19, r22	C U1
	cmpult	r8, r7, r28
	subq	r8, r7, r8
	addq	r23, r28, r28
	stq	r8, 16(r16)
	mulq	r3, r19, r7	C U1
	subq	r8, r28, r8
L(cj4):	umulh	r3, r19, r23	C U1
	cmpult	r8, r4, r28
	subq	r8, r4, r8
	addq	r20, r28, r28
	stq	r8, 24(r16)
	subq	r8, r28, r8
L(cj3):	cmpult	r8, r5, r28
	subq	r8, r5, r8
	addq	r21, r28, r28
	stq	r8, 32(r16)
	subq	r8, r28, r8
L(cj2):	cmpult	r8, r6, r28
	subq	r8, r6, r8
	addq	r22, r28, r28
	stq	r8, 40(r16)
	subq	r8, r28, r8
L(cj1):	cmpult	r8, r7, r28
	subq	r8, r7, r8
	addq	r23, r28, r28
	stq	r8, 48(r16)
C Apply the last p1 + borrow and return the final accumulator in r0.
	subq	r8, r28, r0
	ret	r31, (r26), 1

EPILOGUE()
ASM_END()