dnl  diveby3.asm revision 1.1.1.1
dnl  Alpha mpn_divexact_by3c -- mpn division by 3, expecting no remainder.

dnl  Copyright 2004, 2005, 2009 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C      cycles/limb
C EV4:    22
C EV5:    11.5
C EV6:     6.3          Note that mpn_bdiv_dbm1c is faster

C TODO
C  * Remove the unops, they benefit just ev6, which no longer uses this file.
C  * Try prefetch for destination, using lds.
C  * Improve feed-in code, by moving initial mulq earlier; make initial load
C    to u0/u0 to save some copying.
C  * Combine u0 and u2, u1 and u3.

C OVERVIEW
C  Every limb u is turned into the quotient limb
C       q = u * inv3 + l0   (mod 2^64)
C  where inv3 = 0xAAAAAAAAAAAAAAAB (3 * inv3 = 1 mod 2^64) and l0 holds
C  cy copies of 0x5555555555555555.  That constant equals -inv3 mod 2^64, so
C  the sum is the same as (u - cy) * inv3.  The carry for the following limb
C  is the borrow of u - cy, plus one for each of the two thresholds
C  0x5555555555555555 and 0xAAAAAAAAAAAAAAAA that q exceeds.
C
C  The main loop handles four limbs per pass.  The multiply for a limb is
C  issued one stage ahead of the stage that finishes and stores that limb.

C INPUT PARAMETERS
C  rp   quotient limbs are stored through this pointer
C  up   source limbs are loaded through this pointer
C  n    limb count; n mod 4 picks the loop entry point
C  cy   carry-in; the values 1 and 2 select a nonzero initial l0
C The final carry is left in r0 by the tail code at $Lcj1.
C NOTE(review): the first limb is loaded unconditionally and no n = 0 path is
C visible, so callers presumably guarantee n >= 1 -- confirm.
define(`rp',       `r16')
define(`up',       `r17')
define(`n',        `r18')
define(`cy',       `r19')

ASM_START()

C Constant table, loaded into inv3 / one3rd / two3rds by the prologue.
DATASTART(L(LC))
        .quad   0xAAAAAAAAAAAAAAAB
        .quad   0x5555555555555555
        .quad   0xAAAAAAAAAAAAAAAA
DATAEND()

C Register aliases
C  inv3      multiplier 0xAAAAAAAAAAAAAAAB
C  one3rd    threshold / addend 0x5555555555555555
C  two3rds   threshold 0xAAAAAAAAAAAAAAAA
C  u0..u3    source limbs in flight
C  q0, q1    products u * inv3, used alternately
C  l0        carry-dependent addend for the next quotient limb
C  qlimb     quotient limb about to be stored
C  flag1/2   0-or-1 compare results
C  term1/2   flag1/2 turned into 0 or one3rd
C  cymask    0 or all ones, derived from the borrow
C Note that qlimb shares r8 and cymask shares r28 with raw register uses in
C the feed-in code; those raw values are consumed before the aliases are
C first written.
define(`inv3',     `r20')
define(`one3rd',   `r21')
define(`two3rds',  `r22')
define(`u0',       `r0')
define(`u1',       `r1')
define(`u2',       `r2')
define(`u3',       `r3')
define(`l0',       `r25')
define(`qlimb',    `r8')
define(`q0',       `r4')
define(`q1',       `r5')
define(`flag1',    `r6')
define(`flag2',    `r7')
define(`term1',    `r23')
define(`term2',    `r24')
define(`cymask',   `r28')


PROLOGUE(mpn_divexact_by3c,gp)

        ldq     r28, 0(up)                      C first limb, fetched early

C Bring the three constants into registers
        lda     r0, L(LC)
        ldq     inv3, 0(r0)
        ldq     one3rd, 8(r0)
        ldq     two3rds, 16(r0)

C Initial l0: one3rd when cy is 1, two3rds when cy is 2, otherwise 0
        cmpeq   cy, 1, flag1
        cmpeq   cy, 2, flag2
        negq    flag1, flag1
        and     flag1, one3rd, l0
        cmovne  flag2, two3rds, l0

C Dispatch on n mod 4.  n is biased by -3 so the loop can test its sign.
        and     n, 3, r8
        lda     n, -3(n)
        cmpeq   r8, 1, r4
        cmpeq   r8, 2, r5
        bne     r4, $Lb01
        bne     r5, $Lb10
        beq     r8, $Lb00

C n mod 4 = 3: enter at the last stage; pointers are biased by -24 so that
C the fixed offsets used by that stage address the first limbs.
$Lb11:
        ldq     u3, 8(up)
        lda     up, -24(up)
        lda     rp, -24(rp)
        mulq    r28, inv3, q0
        mov     r28, u2
        br      r31, $L11

C n mod 4 = 0: enter at the third stage, pointers biased by -16.
$Lb00:
        ldq     u2, 8(up)
        lda     up, -16(up)
        lda     rp, -16(rp)
        mulq    r28, inv3, q1
        mov     r28, u1
        br      r31, $L00

C n mod 4 = 1: enter at the second stage, pointers biased by -8.  When the
C biased n is negative only one limb exists, so go straight to the last
C limb code.
$Lb01:
        lda     rp, -8(rp)
        mulq    r28, inv3, q0
        mov     r28, u0
        blt     n, $Lcj1
        ldq     u1, 8(up)
        lda     up, -8(up)
        br      r31, $L01

C n mod 4 = 2: fall into the top of the loop, or go to the two limb tail
C when the biased n is negative.
$Lb10:
        ldq     u0, 8(up)
        mulq    r28, inv3, q1
        mov     r28, u3
        blt     n, $Lend

C *** MAIN LOOP START ***
C Each of the four stages below does the same work on rotated registers:
C   cy     = borrow of (limb - cy)
C   start the multiply for the following limb and load the one after it
C   qlimb  = product + l0
C   cymask = -cy, flag1 = qlimb > one3rd, flag2 = qlimb > two3rds
C   l0     = (cymask & one3rd) + term1 + term2
C   cy     = cy + flag1 + flag2
C   store qlimb
        ALIGN(16)
$Ltop:
C 0
        cmpult  u3, cy, cy                      C L0
        mulq    u0, inv3, q0                    C U1
        ldq     u1, 16(up)                      C L1
        addq    q1, l0, qlimb                   C U0
C 1
        negq    cy, cymask                      C L0
        unop                                    C U1
        unop                                    C L1
        cmpult  one3rd, qlimb, flag1            C U0
C 2
        cmpult  two3rds, qlimb, flag2           C U1
        unop
        unop
        negq    flag1, term1                    C L0
C 3
        negq    flag2, term2                    C L0
        and     cymask, one3rd, l0              C U1
        addq    flag1, cy, cy
        and     term1, one3rd, term1
C 4
        and     term2, one3rd, term2
        addq    flag2, cy, cy
        unop
        addq    term1, l0, l0
C 5
        addq    term2, l0, l0
        unop
        stq     qlimb, 0(rp)                    C L1
        unop
$L01:
C 0
        cmpult  u0, cy, cy                      C L0
        mulq    u1, inv3, q1                    C U1
        ldq     u2, 24(up)                      C L1
        addq    q0, l0, qlimb                   C U0
C 1
        negq    cy, cymask                      C L0
        unop                                    C U1
        unop                                    C L1
        cmpult  one3rd, qlimb, flag1            C U0
C 2
        cmpult  two3rds, qlimb, flag2           C U1
        unop
        unop
        negq    flag1, term1                    C L0
C 3
        negq    flag2, term2                    C L0
        and     cymask, one3rd, l0              C U1
        addq    flag1, cy, cy
        and     term1, one3rd, term1
C 4
        and     term2, one3rd, term2
        addq    flag2, cy, cy
        unop
        addq    term1, l0, l0
C 5
        addq    term2, l0, l0
        unop
        stq     qlimb, 8(rp)                    C L1
        unop
$L00:
C 0
        cmpult  u1, cy, cy                      C L0
        mulq    u2, inv3, q0                    C U1
        ldq     u3, 32(up)                      C L1
        addq    q1, l0, qlimb                   C U0
C 1
        negq    cy, cymask                      C L0
        unop                                    C U1
        unop                                    C L1
        cmpult  one3rd, qlimb, flag1            C U0
C 2
        cmpult  two3rds, qlimb, flag2           C U1
        unop
        unop
        negq    flag1, term1                    C L0
C 3
        negq    flag2, term2                    C L0
        and     cymask, one3rd, l0              C U1
        addq    flag1, cy, cy
        and     term1, one3rd, term1
C 4
        and     term2, one3rd, term2
        addq    flag2, cy, cy
        unop
        addq    term1, l0, l0
C 5
        addq    term2, l0, l0
        unop
        stq     qlimb, 16(rp)                   C L1
        unop
$L11:
C 0
        cmpult  u2, cy, cy                      C L0
        mulq    u3, inv3, q1                    C U1
        ldq     u0, 40(up)                      C L1
        addq    q0, l0, qlimb                   C U0
C 1
        negq    cy, cymask                      C L0
        unop                                    C U1
        unop                                    C L1
        cmpult  one3rd, qlimb, flag1            C U0
C 2
        cmpult  two3rds, qlimb, flag2           C U1
        lda     n, -4(n)                        C L1 bookkeeping
        unop
        negq    flag1, term1                    C L0
C 3
        negq    flag2, term2                    C L0
        and     cymask, one3rd, l0              C U1
        addq    flag1, cy, cy
        and     term1, one3rd, term1
C 4
        and     term2, one3rd, term2
        addq    flag2, cy, cy
        unop
        addq    term1, l0, l0
C 5
        addq    term2, l0, l0
        unop
        stq     qlimb, 24(rp)                   C L1
        lda     up, 32(up)
C
        ldl     r31, 256(up)                    C prefetch
        unop
        lda     rp, 32(rp)
        bge     n, $Ltop                        C U1
C *** MAIN LOOP END ***

C Tail: two limbs remain.  Same stage as $Ltop, minus the load.
$Lend:

        cmpult  u3, cy, cy                      C L0
        mulq    u0, inv3, q0                    C U1
        unop
        addq    q1, l0, qlimb                   C U0
C 1
        negq    cy, cymask                      C L0
        unop                                    C U1
        unop                                    C L1
        cmpult  one3rd, qlimb, flag1            C U0
C 2
        cmpult  two3rds, qlimb, flag2           C U1
        unop
        unop
        negq    flag1, term1                    C L0
C 3
        negq    flag2, term2                    C L0
        and     cymask, one3rd, l0              C U1
        addq    flag1, cy, cy
        and     term1, one3rd, term1
C 4
        and     term2, one3rd, term2
        addq    flag2, cy, cy
        unop
        addq    term1, l0, l0
C 5
        addq    term2, l0, l0
        unop
        stq     qlimb, 0(rp)                    C L1
        unop

C Last limb: no further l0 is needed, the carry sum goes directly to r0.
$Lcj1:
        cmpult  u0, cy, cy                      C L0
        addq    q0, l0, qlimb                   C U0
        cmpult  one3rd, qlimb, flag1            C U0
        cmpult  two3rds, qlimb, flag2           C U1
        addq    flag1, cy, cy
        addq    flag2, cy, r0
        stq     qlimb, 8(rp)                    C L1

        ret     r31,(r26),1
EPILOGUE()
ASM_END()

C This is useful for playing with various schedules.
C Expand as: one(0)one(1)one(2)one(3)
C The argument selects the register rotation and the load/store offsets of
C one loop stage.  Nothing in this file expands the macro.
define(`one',`
C 0
        cmpult  `$'eval(($1+3)%4), cy, cy               C L0
        mulq    `$'$1, inv3, `$'eval(4+$1%2)            C U1
        ldq     `$'eval(($1+1)%4), eval($1*8+16)(up)    C L1
        addq    `$'eval(4+($1+1)%2), l0, qlimb          C U0
C 1
        negq    cy, cymask                      C L0
        unop                                    C U1
        unop                                    C L1
        cmpult  one3rd, qlimb, flag1            C U0
C 2
        cmpult  two3rds, qlimb, flag2           C U1
        unop
        unop
        negq    flag1, term1                    C L0
C 3
        negq    flag2, term2                    C L0
        and     cymask, one3rd, l0              C U1
        addq    flag1, cy, cy
        and     term1, one3rd, term1
C 4
        and     term2, one3rd, term2
        addq    flag2, cy, cy
        unop
        addq    term1, l0, l0
C 5
        addq    term2, l0, l0
        unop
        stq     qlimb, eval($1*8)(rp)           C L1
        unop
')