dnl  PowerPC-32 mpn_mod_34lsub1 -- mpn remainder mod 2^24-1.

dnl  Copyright 2002, 2003, 2005, 2006, 2007, 2012 Free Software Foundation,
dnl  Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.


include(`../config.m4')


C                   cycles/limb
C 603e:             -
C 604e:             -
C 75x (G3):         -
C 7400,7410 (G4):   1           simple load-use scheduling results in 0.75
C 744x,745x (G4+):  0.75
C ppc970:           0.75
C power4:           -
C power5:           -

C TODO
C  * Either start using the low-end masking constants, or remove them.
C  * Merge multiple feed-in cases into a parameterized code block.
C  * Reduce register usage.  It should be possible to almost halve it.

define(`up', `r3')
define(`n', `r4')

define(`a0', `v3')
define(`a1', `v4')
define(`a2', `v5')
define(`c0', `v6')
define(`c1', `v7')
define(`c2', `v8')
define(`z', `v9')
define(`x0', `v10')
define(`x1', `v11')
define(`x2', `v12')
define(`x3', `v13')
define(`pv', `v14')
define(`y0', `v0')
define(`y1', `v1')
define(`y2', `v2')
define(`y3', `v15')

ASM_START()
PROLOGUE(mpn_mod_34lsub1)
	cmpwi	cr0, n, 20		C tuned cutoff point
	bge	L(large)

	li	r9, 0			C result accumulator
	mulli	r10, n, 0xb		C 0xb = ceil(32/3)
	srwi.	r10, r10, 5		C r10 = floor(n/3), n < 32
	beq	L(small_tail)
	mtctr	r10
	lwz	r6, 0(up)
	lwz	r7, 4(up)
	lwzu	r8, 8(up)
	subf	n, r10, n
	subf	n, r10, n
	subf	n, r10, n
	bdz	L(small_end)

	ALIGN(16)
L(los):	rlwinm	r0, r6, 0,8,31
	add	r9, r9, r0		C add 24b from u0
	srwi	r0, r6, 24
	lwz	r6, 4(up)
	rlwimi	r0, r7, 8, 0x00ffff00	C --111100
	add	r9, r9, r0		C add 8b from u0 and 16b from u1
	srwi	r0, r7, 16
	lwz	r7, 8(up)
	rlwimi	r0, r8, 16, 0x00ff0000	C --221111
	add	r9, r9, r0		C add 16b from u1 and 8b from u2
	srwi	r0, r8, 8		C --222222
	lwzu	r8, 12(up)
	add	r9, r9, r0		C add 24b from u2
	bdnz	L(los)
L(small_end):
	rlwinm	r0, r6, 0,8,31
	add	r9, r9, r0		C add 24b from u0
	srwi	r0, r6, 24
	rlwimi	r0, r7, 8, 0x00ffff00	C --111100
	add	r9, r9, r0		C add 8b from u0 and 16b from u1
	srwi	r0, r7, 16
	rlwimi	r0, r8, 16, 0x00ff0000	C --221111
	add	r9, r9, r0		C add 16b from u1 and 8b from u2
	srwi	r0, r8, 8		C --222222
	add	r9, r9, r0		C add 24b from u2

	addi	up, up, 4
	rlwinm	r0, r9, 0,8,31
	srwi	r9, r9, 24
	add	r9, r9, r0

L(small_tail):
	cmpi	cr0, n, 1
	blt	L(ret)

	lwz	r6, 0(up)
	rlwinm	r0, r6, 0,8,31
	srwi	r6, r6, 24
	add	r9, r9, r0
	add	r9, r9, r6

	beq	L(ret)

	lwz	r6, 4(up)
	rlwinm	r0, r6, 8,8,23
	srwi	r6, r6, 16
	add	r9, r9, r0
	add	r9, r9, r6

L(ret):	mr	r3, r9
	blr
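
C The scalar path above folds each group of three 32-bit limbs into four
C 24-bit fields and sums them into r9.  In C the loop body is roughly this
C (a minimal sketch for documentation only, not part of the GMP build):
C
C	acc += up[i] & 0xffffff;                               /* 24b from u0 */
C	acc += (up[i] >> 24) | ((up[i+1] << 8) & 0xffffff);    /* 8b u0, 16b u1 */
C	acc += (up[i+1] >> 16) | ((up[i+2] << 16) & 0xffffff); /* 16b u1, 8b u2 */
C	acc += up[i+2] >> 8;                                   /* 24b from u2 */
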
L(large):
	mfspr	r10, 256
	oris	r0, r10, 0xffff		C Set VRSAVE bits 0-15
	mtspr	256, r0

	andi.	r7, up, 15
	vxor	a0, v0, v0
	lis	r9, 0xaaaa
	vxor	a1, v0, v0
	ori	r9, r9, 0xaaab
	vxor	a2, v0, v0
	li	r5, 16
	vxor	c0, v0, v0
	li	r6, 32
	vxor	c1, v0, v0
	LEAL(	r11, cnsts)		C CAUTION clobbers r0 for elf, darwin
	vxor	c2, v0, v0
	vxor	z, v0, v0

	beq	L(aligned16)

	cmpwi	cr7, r7, 8
	bge	cr7, L(na4)

	lvx	a2, 0, up
	addi	up, up, 16
	vsldoi	a2, a2, z, 4
	vsldoi	a2, z, a2, 12

	addi	n, n, 9
	mulhwu	r0, n, r9
	srwi	r0, r0, 3		C r0 = floor(n/12)
	mtctr	r0

	mulli	r8, r0, 12
	subf	n, r8, n
	b	L(2)

L(na4):	bne	cr7, L(na8)

	lvx	a1, 0, up
	addi	up, up, -16
	vsldoi	a1, a1, z, 8
	vsldoi	a1, z, a1, 8

	addi	n, n, 6
	mulhwu	r0, n, r9
	srwi	r0, r0, 3		C r0 = floor(n/12)
	mtctr	r0

	mulli	r8, r0, 12
	subf	n, r8, n
	b	L(1)

L(na8):
	lvx	a0, 0, up
	vsldoi	a0, a0, z, 12
	vsldoi	a0, z, a0, 4

	addi	n, n, 3
	mulhwu	r0, n, r9
	srwi	r0, r0, 3		C r0 = floor(n/12)
	mtctr	r0

	mulli	r8, r0, 12
	subf	n, r8, n
	b	L(0)

L(aligned16):
	mulhwu	r0, n, r9
	srwi	r0, r0, 3		C r0 = floor(n/12)
	mtctr	r0

	mulli	r8, r0, 12
	subf	n, r8, n

	lvx	a0, 0, up
L(0):	lvx	a1, r5, up
L(1):	lvx	a2, r6, up
	addi	up, up, 48
L(2):	bdz	L(end)
	li	r12, 256
	li	r9, 288
	ALIGN(32)
L(top):
	lvx	v0, 0, up
	vaddcuw	v10, a0, v0
	vadduwm	a0, a0, v0
	vadduwm	c0, c0, v10

	lvx	v1, r5, up
	vaddcuw	v10, a1, v1
	vadduwm	a1, a1, v1
	vadduwm	c1, c1, v10

	lvx	v2, r6, up
	dcbt	up, r12
	dcbt	up, r9
	addi	up, up, 48
	vaddcuw	v10, a2, v2
	vadduwm	a2, a2, v2
	vadduwm	c2, c2, v10
	bdnz	L(top)

L(end):
C n = 0...11
	cmpwi	cr0, n, 0
	beq	L(sum)
	cmpwi	cr0, n, 4
	ble	L(tail.1..4)
	cmpwi	cr0, n, 8
	ble	L(tail.5..8)

L(tail.9..11):
	lvx	v0, 0, up
	vaddcuw	v10, a0, v0
	vadduwm	a0, a0, v0
	vadduwm	c0, c0, v10

	lvx	v1, r5, up
	vaddcuw	v10, a1, v1
	vadduwm	a1, a1, v1
	vadduwm	c1, c1, v10

	lvx	v2, r6, up

	addi	r8, r11, 96
	rlwinm	r3, n, 4,26,27
	lvx	v11, r3, r8
	vand	v2, v2, v11

	vaddcuw	v10, a2, v2
	vadduwm	a2, a2, v2
	vadduwm	c2, c2, v10
	b	L(sum)

L(tail.5..8):
	lvx	v0, 0, up
	vaddcuw	v10, a0, v0
	vadduwm	a0, a0, v0
	vadduwm	c0, c0, v10

	lvx	v1, r5, up

	addi	r8, r11, 96
	rlwinm	r3, n, 4,26,27
	lvx	v11, r3, r8
	vand	v1, v1, v11

	vaddcuw	v10, a1, v1
	vadduwm	a1, a1, v1
	vadduwm	c1, c1, v10
	b	L(sum)

L(tail.1..4):
	lvx	v0, 0, up

	addi	r8, r11, 96
	rlwinm	r3, n, 4,26,27
	lvx	v11, r3, r8
	vand	v0, v0, v11

	vaddcuw	v10, a0, v0
	vadduwm	a0, a0, v0
	vadduwm	c0, c0, v10

L(sum):	lvx	pv, 0, r11
	vperm	x0, a0, z, pv		C extract 4 24-bit fields from a0
	vperm	y0, c2, z, pv		C extract 4 24-bit fields from c2
	lvx	pv, r5, r11
	vperm	x1, a1, z, pv		C extract 4 24-bit fields from a1
	vperm	y1, c0, z, pv		C extract 4 24-bit fields from c0
	lvx	pv, r6, r11
	vperm	x2, a2, z, pv		C extract 4 24-bit fields from a2
	vperm	y2, c1, z, pv		C extract 4 24-bit fields from c1
	li	r10, 48
	lvx	pv, r10, r11
	vperm	x3, a0, z, pv		C extract remaining/partial a0 fields
	vperm	y3, c2, z, pv		C extract remaining/partial c2 fields
	li	r10, 64
	lvx	pv, r10, r11
	vperm	x3, a1, x3, pv		C insert remaining/partial a1 fields
	vperm	y3, c0, y3, pv		C insert remaining/partial c0 fields
	li	r10, 80
	lvx	pv, r10, r11
	vperm	x3, a2, x3, pv		C insert remaining/partial a2 fields
	vperm	y3, c1, y3, pv		C insert remaining/partial c1 fields

C We now have 4 128-bit accumulators to sum
	vadduwm	x0, x0, x1
	vadduwm	x2, x2, x3
	vadduwm	x0, x0, x2

	vadduwm	y0, y0, y1
	vadduwm	y2, y2, y3
	vadduwm	y0, y0, y2

	vadduwm	x0, x0, y0

C Reduce 32-bit fields
	vsumsws	x0, x0, z

	li	r7, -16			C FIXME: do all ppc32 ABIs...
	stvx	x0, r7, r1		C FIXME: ...support storing below sp?
	lwz	r3, -4(r1)

	mtspr	256, r10
	blr
EPILOGUE()
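
C Carry tracking in the main loop, per 32-bit lane: vaddcuw produces the
C carry-out of exactly the lane addition that vadduwm then performs, so
C each accumulation step amounts to (a C sketch for documentation only,
C not GMP code):
C
C	c += (a + v) < v;	/* 1 iff the 32-bit add wrapped */
C	a += v;
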

C load	| v0       | v1       | v2        |
C acc	| a0       | a1       | a2        |
C carry	| c0       | c1       | c2        |
C	| 0 1 2 3  | 4 5 6 7  | 8 9 10 11 |	128
C	|---|---|---|---|---|---|---|---|---|---|---|---|	32
C	| | | | | | | | | | | | | | | | |	24
C	| | | | | | | | |	48

C $---------------$---------------$---------------$---------------$
C | . . . . . . . . . . . . . . . |
C |_______________________________________________________________|
C | | | | | | |
C <-hi16-> <--- 24 --> <--- 24 --> <--- 24 --> <--- 24 --> <-lo16->


DEF_OBJECT(cnsts,16)
C Permutation vectors in the order they are used above
C #  00 01 02 03  04 05 06 07  08 09 0a 0b  0c 0d 0e 0f
	.byte	0x10,0x01,0x02,0x03, 0x10,0x06,0x07,0x00, 0x10,0x0b,0x04,0x05, 0x10,0x08,0x09,0x0a	C a0
	.byte	0x10,0x07,0x00,0x01, 0x10,0x04,0x05,0x06, 0x10,0x09,0x0a,0x0b, 0x10,0x0e,0x0f,0x08	C a1
	.byte	0x10,0x00,0x01,0x02, 0x10,0x05,0x06,0x07, 0x10,0x0a,0x0b,0x04, 0x10,0x0f,0x08,0x09	C a2
	.byte	0x10,0x0d,0x0e,0x0f, 0x10,0x10,0x10,0x0c, 0x10,0x10,0x10,0x10, 0x10,0x10,0x10,0x10	C part a0
	.byte	0x10,0x11,0x12,0x13, 0x10,0x02,0x03,0x17, 0x10,0x10,0x0c,0x0d, 0x10,0x10,0x10,0x10	C part a1
	.byte	0x10,0x11,0x12,0x13, 0x10,0x15,0x16,0x17, 0x10,0x03,0x1a,0x1b, 0x10,0x0c,0x0d,0x0e	C part a2
C Masks for high end of number
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
	.byte	0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
C Masks for low end of number
C	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
C	.byte	0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
C	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
C	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
END_OBJECT(cnsts)
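
C For reference, the routine returns a value congruent to {up,n} mod
C 2^24-1, using 2^32 == 2^8 (mod 2^24-1).  A minimal C sketch of the same
C operation (for documentation only; the helper name is ours, and a
C 64-bit temporary is assumed):
C
C	mp_limb_t
C	mod_34lsub1_ref (mp_srcptr up, mp_size_t n)
C	{
C	  uint64_t s = 0;
C	  mp_size_t i;
C	  for (i = 0; i < n; i++)
C	    {
C	      /* 2^(32*i) == 2^(8*(i%3)) mod 2^24-1 */
C	      s += (uint64_t) up[i] << (8 * (i % 3));
C	      s = (s & 0xffffff) + (s >> 24);	/* partial fold keeps s small */
C	    }
C	  return (mp_limb_t) s;
C	}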