mod_34lsub1.asm revision 1.1.1.1
dnl  IA-64 mpn_mod_34lsub1

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2003, 2004, 2005, 2010 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C           cycles/limb
C Itanium:      ?
C Itanium 2:    1


C INPUT PARAMETERS
define(`up', `r32')		C limb pointer; every ld8 post-increments it by 8
define(`n', `r33')		C limb count; decremented as limbs are consumed

C REGISTER ALIASES
C   u0,u1,u2   the limbs most recently loaded from memory
C   a0,a1,a2   accumulators; the limbs are subtracted from them in rotation
C   c0,c1,c2   counters, incremented under the predicates that the unsigned
C              compares set just before each subtraction
define(`u0', `r14')
define(`u1', `r15')
define(`u2', `r16')
define(`a0', `r17')
define(`a1', `r18')
define(`a2', `r19')
define(`c0', `r20')
define(`c1', `r21')
define(`c2', `r22')

C DESIGN NOTES
C
C This is a fairly simple-minded implementation.  A more sophisticated one
C could approach 0.67 c/l.  Going really far, one could super-unroll, keep
C the carries only in predicate registers, then copy those to a general
C register and population count them from there.  That would bring the cost
C close to 3 insn/limb, for nearly 0.5 c/l.
C
C Computing n/3 needs 16 cycles, which is a lot of startup overhead.  The
C code therefore uses a plain while-style loop:
C	add		n = -3, n
C	cmp.le		p9, p0 = 3, n
C  (p9)	br.cond		.Loop
C Alternatively, n/3 could be taken from a table for, say, n < 256, with the
C 16-cycle code predicated.
C The summing-up code at the end of the routine was written quickly, and
C could surely be vastly improved.

C mpn_mod_34lsub1 (up, n)
C
C In:   up (r32) = address of the first limb, n (r33) = number of limbs.
C Out:  r8.
C
C n == 1:  r8 = (limb >> 48) + (low 48 bits of limb).
C n >= 2:  the limbs are subtracted, in rotation, from the three 64-bit
C          accumulators a0, a1, a2, which start at zero.  Borrows out of
C          a0, a1, a2 are counted in c1, c2, c0 respectively.  The closing
C          code cuts all six registers into pieces of at most 48 bits and
C          returns  r8 = 0xffffffffffff0 - (pieces of a) + (pieces of c).
C          With 2^48 == 1 (mod 2^48-1) and 0xffffffffffff0 = 16*(2^48-1),
C          r8 is congruent to the operand modulo 2^48-1.  No final
C          reduction is performed, so r8 may exceed 2^48-1.
C
C NOTE(review): the first limb is loaded before n is looked at, so n >= 1
C appears to be assumed; confirm against the callers.
C NOTE(review): the .save directive below describes ar.lc as saved in r2,
C but the only write to ar.lc sits inside the disabled ifelse(0,1,...) block
C and r2 is never written in this routine; confirm the directive is wanted.

ASM_START()
PROLOGUE(mpn_mod_34lsub1)
	.prologue
	.save	ar.lc, r2
	.body

C Only when HAVE_ABI_32 is defined: addp4 on the pointer, zxt4 on the count.
ifdef(`HAVE_ABI_32',`
	addp4	up = 0, up		C M I
	zxt4	n = n			C I
	;;
')

C Never expanded (the ifelse condition is 0 versus 1).  It multiplies n by
C 0xAAAAAAAAAAAAAAAB, shifts the high product right by one and puts the
C result in ar.lc as a loop count.
ifelse(0,1,`
	movl	r14 = 0xAAAAAAAAAAAAAAAB
	;;
	setf.sig	f6 = r14
	setf.sig	f7 = r33
	;;
	xmpy.hu	f6 = f6, f7
	;;
	getf.sig	r8 = f6
	;;
	shr.u	r8 = r8, 1		C Loop count
	;;
	mov.i	ar.lc = r8
')

	ld8	u0 = [up], 8		C first limb
	cmp.ne	p9, p0 = 1, n		C p9 = (n != 1)
  (p9)	br	L(setup)
	;;
C Single limb: add its top 16 bits to its low 48 bits and return.
	shr.u	r8 = u0, 48
	dep.z	r27 = u0, 0, 48
	;;
	add	r8 = r8, r27
	br.ret.sptk.many b0


C Two or more limbs.  Zero the accumulators and counters, load the second
C limb and take 2 off n for the two limbs now held in u0 and u1.  The loop is
C skipped when fewer than 3 limbs are left in memory.
L(setup):
.mmi;	nop.m	0
	mov	a0 = 0
	add	n = -2, n
.mmi;	mov	c0 = 0
	mov	c1 = 0
	mov	c2 = 0
	;;
.mmi;	ld8	u1 = [up], 8
	mov	a1 = 0
	cmp.ltu	p6, p0 = r0, r0		C clear p6
.mmb;	cmp.gt	p9, p0 = 3, n
	mov	a2 = 0
  (p9)	br.cond.dptk	L(tail)
	;;

C Main loop: three limbs loaded and three subtracted per pass.  Each cmp.ltu
C comes before its sub and so tests the accumulator value prior to the
C subtraction; its predicate is the borrow.  The borrow is added to a counter
C one step later: p7 (from a0) to c1, p8 (from a1) to c2, p6 (from a2) to c0.
	ALIGN(32)
L(triple):
.mmi;	ld8	u2 = [up], 8
  (p6)	add	c0 = 1, c0
	cmp.ltu	p7, p0 = a0, u0
.mmb;	sub	a0 = a0, u0
	add	n = -3, n
	nop.b	0
	;;
.mmi;	ld8	u0 = [up], 8
  (p7)	add	c1 = 1, c1
	cmp.ltu	p8, p0 = a1, u1
.mmb;	sub	a1 = a1, u1
	cmp.le	p9, p0 = 3, n		C another full pass possible?
	nop.b	0
	;;
.mmi;	ld8	u1 = [up], 8
  (p8)	add	c2 = 1, c2
	cmp.ltu	p6, p0 = a2, u2
.mmb;	sub	a2 = a2, u2
	nop.m	0
dnl	br.cloop.dptk	L(triple)
  (p9)	br.cond.dptk	L(triple)
	;;

C Wind-down.  u0 and u1 hold limbs not yet subtracted, p6 may hold a pending
C borrow, and n is the number of limbs still in memory (0, 1 or 2 provided
C the routine was entered with n >= 2).
L(tail):
	cmp.eq	p10, p0 = 0, n
	cmp.eq	p11, p0 = 1, n
  (p10)	br	L(tail0)

C n is 1 or 2 here: load one more limb into u2 and subtract u0.  For n == 1
C the rest is done at tail1.  Otherwise a last limb is loaded into u0, and
C u1, u2 and that limb are subtracted in turn.
L(tail2):
.mmi;	ld8	u2 = [up], 8
  (p6)	add	c0 = 1, c0
	cmp.ltu	p7, p0 = a0, u0
.mmb;	sub	a0 = a0, u0
	nop.m	0
  (p11)	br	L(tail1)
	;;
	ld8	u0 = [up], 8
  (p7)	add	c1 = 1, c1
	cmp.ltu	p8, p0 = a1, u1
	sub	a1 = a1, u1
	;;
  (p8)	add	c2 = 1, c2
	cmp.ltu	p6, p0 = a2, u2
	sub	a2 = a2, u2
	;;
  (p6)	add	c0 = 1, c0
	cmp.ltu	p7, p0 = a0, u0
	sub	a0 = a0, u0
	;;
  (p7)	add	c1 = 1, c1
	br	L(fold)


C Reached from tail2 with u0 already subtracted: finish u1 and u2.
L(tail1):
  (p7)	add	c1 = 1, c1
	cmp.ltu	p8, p0 = a1, u1
	sub	a1 = a1, u1
	;;
  (p8)	add	c2 = 1, c2
	cmp.ltu	p6, p0 = a2, u2
	sub	a2 = a2, u2
	;;
  (p6)	add	c0 = 1, c0
	br	L(fold)


C Nothing left in memory: subtract the two limbs held in u0 and u1.
L(tail0):
  (p6)	add	c0 = 1, c0
	cmp.ltu	p7, p0 = a0, u0
	sub	a0 = a0, u0
	;;
  (p7)	add	c1 = 1, c1
	cmp.ltu	p8, p0 = a1, u1
	sub	a1 = a1, u1
	;;
  (p8)	add	c2 = 1, c2

C Summing up.  Seen as one 192-bit value  | a2 | a1 | a0 |  (and likewise
C c2, c1, c0), each register is cut in two where a multiple of 48 bits falls:
C   a0, c0:  high 16 bits  /  low 48 bits kept at bit 0
C   a1, c1:  high 32 bits  /  low 32 bits moved to bits 16..47
C   a2, c2:  high 48 bits  /  low 16 bits moved to bits 32..47
L(fold):
	shr.u	r24 = a0, 48		C 16 bits
	shr.u	r25 = a1, 32		C 32 bits
	shr.u	r26 = a2, 16		C 48 bits
	;;
	shr.u	r10 = c0, 48		C 16 bits, always zero
	shr.u	r11 = c1, 32		C 32 bits
	shr.u	r30 = c2, 16		C 48 bits
	;;
	dep.z	r27 = a0, 0, 48		C 48 bits
	dep.z	r28 = a1, 16, 32	C 48 bits
	dep.z	r29 = a2, 32, 16	C 48 bits
	dep.z	r31 = c0, 0, 48		C 48 bits
	dep.z	r14 = c1, 16, 32	C 48 bits
	dep.z	r15 = c2, 32, 16	C 48 bits
	;;
C r24 collects the six accumulator pieces, r10 the six counter pieces.
.mmi;	add	r24 = r24, r25
	add	r26 = r26, r27
	add	r28 = r28, r29
.mmi;	add	r10 = r10, r11
	add	r30 = r30, r31
	add	r14 = r14, r15
	;;
	movl	r8 = 0xffffffffffff0	C 16 * (2^48-1), exceeds any sum in r24
	add	r24 = r24, r26
	add	r10 = r10, r30
	;;
	add	r24 = r24, r28
	add	r10 = r10, r14
	;;
	sub	r8 = r8, r24		C accumulators hold negated sums
	;;
	add	r8 = r8, r10		C add back the counted borrows
	br.ret.sptk.many b0
EPILOGUE()
ASM_END()