dnl PowerPC-32 mpn_mod_34lsub1 -- mpn remainder mod 2^24-1.

dnl Copyright 2002, 2003, 2005, 2006, 2007 Free Software Foundation, Inc.

dnl This file is part of the GNU MP Library.

dnl The GNU MP Library is free software; you can redistribute it and/or modify
dnl it under the terms of the GNU Lesser General Public License as published
dnl by the Free Software Foundation; either version 3 of the License, or (at
dnl your option) any later version.

dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl License for more details.

dnl You should have received a copy of the GNU Lesser General Public License
dnl along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.


include(`../config.m4')


C                   cycles/limb
C 603e:                 -
C 604e:                 -
C 75x (G3):             -
C 7400,7410 (G4):       1	simple load-use scheduling results in 0.75
C 744x,745x (G4+):      0.75
C ppc970:               0.75
C power4:               -
C power5:               -

C TODO
C  * Either start using the low-end masking constants, or remove them.
C  * Merge multiple feed-in cases into a parameterized code block.
C  * Reduce register usage.  It should be possible to almost halve it.

define(`up', `r3')
define(`n', `r4')

define(`a0', `v3')
define(`a1', `v4')
define(`a2', `v5')
define(`c0', `v6')
define(`c1', `v7')
define(`c2', `v8')
define(`z',  `v9')
define(`x0', `v10')
define(`x1', `v11')
define(`x2', `v12')
define(`x3', `v13')
define(`pv', `v14')
define(`y0', `v0')
define(`y1', `v1')
define(`y2', `v2')
define(`y3', `v15')

ASM_START()
PROLOGUE(mpn_mod_34lsub1)
	cmpwi	cr0, n, 20		C tuned cutoff point
	bge	L(large)

	li	r9, 0			C result accumulator
	mulli	r10, n, 0xb		C 0xb = ceil(32/3)
	srwi.	r10, r10, 5		C r10 = floor(n/3), n < 32
	beq	L(small_tail)
	mtctr	r10
	lwz	r6, 0(up)
	lwz	r7, 4(up)
	lwzu	r8, 8(up)
	subf	n, r10, n
	subf	n, r10, n
	subf	n, r10, n
	bdz	L(small_end)

	ALIGN(16)
L(los):	rlwinm	r0, r6, 0,8,31
	add	r9, r9, r0		C add 24b from u0
	srwi	r0, r6, 24
	lwz	r6, 4(up)
	rlwimi	r0, r7, 8, 0x00ffff00	C --111100
	add	r9, r9, r0		C add 8b from u0 and 16b from u1
	srwi	r0, r7, 16
	lwz	r7, 8(up)
	rlwimi	r0, r8, 16, 0x00ff0000	C --221111
	add	r9, r9, r0		C add 16b from u1 and 8b from u2
	srwi	r0, r8, 8		C --222222
	lwzu	r8, 12(up)
	add	r9, r9, r0		C add 24b from u2
	bdnz	L(los)
L(small_end):
	rlwinm	r0, r6, 0,8,31
	add	r9, r9, r0		C add 24b from u0
	srwi	r0, r6, 24
	rlwimi	r0, r7, 8, 0x00ffff00	C --111100
	add	r9, r9, r0		C add 8b from u0 and 16b from u1
	srwi	r0, r7, 16
	rlwimi	r0, r8, 16, 0x00ff0000	C --221111
	add	r9, r9, r0		C add 16b from u1 and 8b from u2
	srwi	r0, r8, 8		C --222222
	add	r9, r9, r0		C add 24b from u2

	addi	up, up, 4
	rlwinm	r0, r9, 0,8,31
	srwi	r9, r9, 24
	add	r9, r9, r0

L(small_tail):
	cmpi	cr0, n, 1
	blt	L(ret)

	lwz	r6, 0(up)
	rlwinm	r0, r6, 0,8,31
	srwi	r6, r6, 24
	add	r9, r9, r0
	add	r9, r9, r6

	beq	L(ret)

	lwz	r6, 4(up)
	rlwinm	r0, r6, 8,8,23
	srwi	r6, r6, 16
	add	r9, r9, r0
	add	r9, r9, r6

L(ret):	mr	r3, r9
	blr

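C The small-n code above folds each group of three 32-bit limbs into
C four 24-bit fields and adds them into r9, using 2^24 = 1 and
C 2^32 = 2^8 (mod 2^24-1).  A rough C equivalent of that scalar path
C follows as a sketch only; the function name and the 64-bit
C accumulator are illustrative, not part of this file or of GMP:
C
C	/* Return a value congruent to {up,n} mod 2^24-1, not fully
C	   reduced (mpn_mod_34lsub1 likewise returns only a congruent
C	   value).  Assumes 32-bit limbs and n small enough that the
C	   64-bit accumulator cannot overflow.  */
C	unsigned long long
C	mod_34lsub1_sketch (const unsigned *up, long n)
C	{
C	  unsigned long long s = 0;
C	  for (long i = 0; i < n; i++)
C	    /* limb i has weight 2^(32*i) = 2^(8*(i%3)) mod 2^24-1 */
C	    s += (unsigned long long) up[i] << (8 * (i % 3));
C	  return s;
C	}
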
L(large):
	mfspr	r10, 256		C save VRSAVE
	oris	r0, r10, 0xffff		C set VRSAVE bits 0-15
	mtspr	256, r0

	andi.	r7, up, 15
	vxor	a0, v0, v0		C zero accumulators and carries
	lis	r0, 0xaaaa
	vxor	a1, v0, v0
	ori	r0, r0, 0xaaab		C r0 = 0xaaaaaaab, multiplier for floor(n/12)
	vxor	a2, v0, v0
	li	r5, 16
	vxor	c0, v0, v0
	li	r6, 32
	vxor	c1, v0, v0
	LEAL(	r11, cnsts)
	vxor	c2, v0, v0
	vxor	z, v0, v0

	beq	L(aligned16)

	cmpwi	cr7, r7, 8
	bge	cr7, L(na4)

	lvx	a2, 0, up
	addi	up, up, 16
	vsldoi	a2, a2, z, 4
	vsldoi	a2, z, a2, 12

	addi	n, n, 9
	mulhwu	r0, n, r0
	srwi	r0, r0, 3		C r0 = floor(n/12)
	mtctr	r0

	mulli	r8, r0, 12
	subf	n, r8, n
	b	L(2)

L(na4):	bne	cr7, L(na8)

	lvx	a1, 0, up
	addi	up, up, -16
	vsldoi	a1, a1, z, 8
	vsldoi	a1, z, a1, 8

	addi	n, n, 6
	mulhwu	r0, n, r0
	srwi	r0, r0, 3		C r0 = floor(n/12)
	mtctr	r0

	mulli	r8, r0, 12
	subf	n, r8, n
	b	L(1)

L(na8):
	lvx	a0, 0, up
	vsldoi	a0, a0, z, 12
	vsldoi	a0, z, a0, 4

	addi	n, n, 3
	mulhwu	r0, n, r0
	srwi	r0, r0, 3		C r0 = floor(n/12)
	mtctr	r0

	mulli	r8, r0, 12
	subf	n, r8, n
	b	L(0)

L(aligned16):
	mulhwu	r0, n, r0
	srwi	r0, r0, 3		C r0 = floor(n/12)
	mtctr	r0

	mulli	r8, r0, 12
	subf	n, r8, n

	lvx	a0, 0, up
L(0):	lvx	a1, r5, up
L(1):	lvx	a2, r6, up
	addi	up, up, 48
L(2):	bdz	L(end)
	li	r12, 256		C prefetch distances for dcbt
	li	r9, 288
	ALIGN(32)
L(top):
	lvx	v0, 0, up
	vaddcuw	v10, a0, v0
	vadduwm	a0, a0, v0
	vadduwm	c0, c0, v10

	lvx	v1, r5, up
	vaddcuw	v10, a1, v1
	vadduwm	a1, a1, v1
	vadduwm	c1, c1, v10

	lvx	v2, r6, up
	dcbt	up, r12
	dcbt	up, r9
	addi	up, up, 48
	vaddcuw	v10, a2, v2
	vadduwm	a2, a2, v2
	vadduwm	c2, c2, v10
	bdnz	L(top)
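
C In the loop above, vadduwm does four independent 32-bit modular adds
C while vaddcuw records the carry-out of the same four adds, so lane
C overflow is banked in c0/c1/c2 instead of being propagated.  Per
C 32-bit lane this is equivalent to the following C model (a sketch;
C names are illustrative, needs <stdint.h>):
C
C	/* one 32-bit lane of the vaddcuw/vadduwm pair */
C	static inline void
C	lane_add (uint32_t *a, uint32_t *c, uint32_t v)
C	{
C	  uint32_t sum = *a + v;	/* vadduwm: wrapping add      */
C	  *c += sum < *a;		/* vaddcuw: carry-out, 0 or 1 */
C	  *a = sum;
C	}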

L(end):
C n = 0...11
	cmpwi	cr0, n, 0
	beq	L(sum)
	cmpwi	cr0, n, 4
	ble	L(tail.1..4)
	cmpwi	cr0, n, 8
	ble	L(tail.5..8)

L(tail.9..11):
	lvx	v0, 0, up
	vaddcuw	v10, a0, v0
	vadduwm	a0, a0, v0
	vadduwm	c0, c0, v10

	lvx	v1, r5, up
	vaddcuw	v10, a1, v1
	vadduwm	a1, a1, v1
	vadduwm	c1, c1, v10

	lvx	v2, r6, up

	addi	r8, r11, 96		C address of high-end masks
	rlwinm	r3, n, 4,26,27		C 16*(n mod 4)
	lvx	v11, r3, r8
	vand	v2, v2, v11

	vaddcuw	v10, a2, v2
	vadduwm	a2, a2, v2
	vadduwm	c2, c2, v10
	b	L(sum)

L(tail.5..8):
	lvx	v0, 0, up
	vaddcuw	v10, a0, v0
	vadduwm	a0, a0, v0
	vadduwm	c0, c0, v10

	lvx	v1, r5, up

	addi	r8, r11, 96		C address of high-end masks
	rlwinm	r3, n, 4,26,27		C 16*(n mod 4)
	lvx	v11, r3, r8
	vand	v1, v1, v11

	vaddcuw	v10, a1, v1
	vadduwm	a1, a1, v1
	vadduwm	c1, c1, v10
	b	L(sum)

L(tail.1..4):
	lvx	v0, 0, up

	addi	r8, r11, 96		C address of high-end masks
	rlwinm	r3, n, 4,26,27		C 16*(n mod 4)
	lvx	v11, r3, r8
	vand	v0, v0, v11

	vaddcuw	v10, a0, v0
	vadduwm	a0, a0, v0
	vadduwm	c0, c0, v10

L(sum):	lvx	pv, 0, r11
	vperm	x0, a0, z, pv		C extract 4 24-bit fields from a0
	vperm	y0, c2, z, pv		C extract 4 24-bit fields from c2
	lvx	pv, r5, r11
	vperm	x1, a1, z, pv		C extract 4 24-bit fields from a1
	vperm	y1, c0, z, pv		C extract 4 24-bit fields from c0
	lvx	pv, r6, r11
	vperm	x2, a2, z, pv		C extract 4 24-bit fields from a2
	vperm	y2, c1, z, pv		C extract 4 24-bit fields from c1
	li	r12, 48			C r12 as scratch; r10 holds saved VRSAVE
	lvx	pv, r12, r11
	vperm	x3, a0, z, pv		C extract remaining/partial a0 fields
	vperm	y3, c2, z, pv		C extract remaining/partial c2 fields
	li	r12, 64
	lvx	pv, r12, r11
	vperm	x3, a1, x3, pv		C insert remaining/partial a1 fields
	vperm	y3, c0, y3, pv		C insert remaining/partial c0 fields
	li	r12, 80
	lvx	pv, r12, r11
	vperm	x3, a2, x3, pv		C insert remaining/partial a2 fields
	vperm	y3, c1, y3, pv		C insert remaining/partial c1 fields

C We now have 8 128-bit accumulators (x0-x3 and y0-y3) to sum
	vadduwm	x0, x0, x1
	vadduwm	x2, x2, x3
	vadduwm	x0, x0, x2

	vadduwm	y0, y0, y1
	vadduwm	y2, y2, y3
	vadduwm	y0, y0, y2

	vadduwm	x0, x0, y0

C Reduce 32-bit fields
	vsumsws	x0, x0, z

	li	r7, -16			C FIXME: do all ppc32 ABIs...
	stvx	x0, r7, r1		C FIXME: ...support storing below sp?
	lwz	r3, -4(r1)

	mtspr	256, r10		C restore VRSAVE
	blr
EPILOGUE()

C load	|  v0  |  v1  |  v2  |
C acc	|  a0  |  a1  |  a2  |
C carry	|  c0  |  c1  |  c2  |
C	| 0 1 2 3       | 4 5 6 7       | 8 9 10 11     |  128
C	|---|---|---|---|---|---|---|---|---|---|---|---|  32
C	|  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  24
C	|     |     |     |     |     |     |     |     |  48

C $---------------$---------------$---------------$---------------$
C |   .   .   .   .   .   .   .   .   .   .   .   .   .   .   .   |
C |_______________________________________________________________|
C |       |           |           |           |           |       |
C <-hi16-> <--- 24 --> <--- 24 --> <--- 24 --> <--- 24 --> <-lo16->


DEF_OBJECT(cnsts,16)
C Permutation vectors in the order they are used above
C #  00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f
	.byte	0x10,0x01,0x02,0x03, 0x10,0x06,0x07,0x00, 0x10,0x0b,0x04,0x05, 0x10,0x08,0x09,0x0a	C a0
	.byte	0x10,0x07,0x00,0x01, 0x10,0x04,0x05,0x06, 0x10,0x09,0x0a,0x0b, 0x10,0x0e,0x0f,0x08	C a1
	.byte	0x10,0x00,0x01,0x02, 0x10,0x05,0x06,0x07, 0x10,0x0a,0x0b,0x04, 0x10,0x0f,0x08,0x09	C a2
	.byte	0x10,0x0d,0x0e,0x0f, 0x10,0x10,0x10,0x0c, 0x10,0x10,0x10,0x10, 0x10,0x10,0x10,0x10	C part a0
	.byte	0x10,0x11,0x12,0x13, 0x10,0x02,0x03,0x17, 0x10,0x10,0x0c,0x0d, 0x10,0x10,0x10,0x10	C part a1
	.byte	0x10,0x11,0x12,0x13, 0x10,0x15,0x16,0x17, 0x10,0x03,0x1a,0x1b, 0x10,0x0c,0x0d,0x0e	C part a2
C Masks for high end of number
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
	.byte	0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
C Masks for low end of number
C	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
C	.byte	0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
C	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
C	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
END_OBJECT(cnsts)
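
C The cnsts table drives vperm, which builds each byte of its result by
C indexing the 32-byte concatenation of its two source vectors; selector
C 0x10 picks byte 0 of the second source, which above is always the zero
C vector z, so the 0x10 entries zero-extend each extracted field to a
C full word.  A byte-level C model of vperm follows as a sketch (names
C are illustrative, big-endian element order, needs <stdint.h>):
C
C	void
C	vperm_model (uint8_t d[16], const uint8_t va[16],
C	             const uint8_t vb[16], const uint8_t pv[16])
C	{
C	  for (int i = 0; i < 16; i++)
C	    {
C	      int k = pv[i] & 0x1f;	/* 5-bit byte selector */
C	      d[i] = k < 16 ? va[k] : vb[k - 16];
C	    }
C	}
C
C For example, the first word of the a0 row, 0x10,0x01,0x02,0x03, packs
C a zero byte ahead of bytes 1-3 of a0, i.e. the low 24 bits of the
C first 32-bit lane of a0, zero-extended to 32 bits.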