dnl  lorrshift.asm revision 1.1.1.2

dnl  IA-64 mpn_lshift/mpn_rshift.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2000, 2001, 2002, 2003, 2004, 2005 Free Software Foundation,
dnl  Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C           cycles/limb
C Itanium:      2
C Itanium 2:    1

C This code is scheduled deeply since the plain shift instructions shr and shl
C have a latency of 4 (on Itanium) or 3 (on Itanium 2).  Poor scheduling of
C these instructions causes a 10 cycle replay trap on Itanium.

C The ld8 scheduling should probably be decreased to make the function smaller.
C Good lfetch will make sure we never stall anyway.

C We should actually issue the first ld8 at cycle 0, and the first BSH/FSH pair
C at cycle 2.  Judicious use of predicates could allow us to issue more ld8's
C in the prologue.

C OVERVIEW
C One source serves both entry points.  OPERATION_lshift or OPERATION_rshift
C selects one of the two macro sets below and thereby the name bound to func.
C   FSH   shift applied with cnt to the limb supplying the main part of an
C         output limb
C   BSH   the opposite shift, applied with tnc = 64 - cnt to the next limb
C   UPD   post-increment applied to up and rp by every ld8 and st8
C   POFF  initial offset of the lfetch pointer r11 from up
C   PUPD  post-increment of r11, applied once per main loop iteration
C Limbs are consumed in the order the pointers walk.  Every stored limb except
C the last is  FSH(u[i], cnt) | BSH(u[i+1], tnc).  The last stored limb is the
C FSH of the final limb alone.  r8 receives BSH(first limb, tnc), which the
C code marks as the function return value.
C
C Register roles
C   r2        saved copy of ar.lc, restored before the return
C   r8        return value
C   r10       first limb loaded
C   r11       lfetch pointer
C   r14, r15  combined limbs waiting to be stored, used alternately
C   r16-r19   loaded limbs
C   r20-r27   shifted halves, paired as r21:r20, r23:r22, r25:r24, r27:r26
C
C NOTE(review): for n = 0 the code takes the n mod 4 = 0 path and loads four
C limbs, so n >= 1 is presumably a caller obligation.  cnt is presumably in
C the range 1..63.  Confirm both against the library documentation.

C INPUT PARAMETERS
define(`rp', `r32')
define(`up', `r33')
define(`n', `r34')
define(`cnt',`r35')

C tnc holds 64 - cnt, the count used by BSH
define(`tnc',`r9')

ifdef(`OPERATION_lshift',`
	define(`FSH',`shl')
	define(`BSH',`shr.u')
	define(`UPD',`-8')
	define(`POFF',`-512')
	define(`PUPD',`-32')
	define(`func',`mpn_lshift')
')
ifdef(`OPERATION_rshift',`
	define(`FSH',`shr.u')
	define(`BSH',`shl')
	define(`UPD',`8')
	define(`POFF',`512')
	define(`PUPD',`32')
	define(`func',`mpn_rshift')
')

MULFUNC_PROLOGUE(mpn_lshift mpn_rshift)

ASM_START()
PROLOGUE(func)
	.prologue
	.save	ar.lc, r2
	.body
C With HAVE_ABI_32 the pointers are widened with addp4, n is sign extended
C and cnt is zero extended before use.
ifdef(`HAVE_ABI_32',
`	addp4	rp = 0, rp		C			M I
	addp4	up = 0, up		C			M I
	sxt4	n = n			C			M I
	zxt4	cnt = cnt		C			I
	;;
')

C Setup.
C   p14 = (4 < n), p15 = its complement
C   r14 = n mod 4, tested into p6 (1), p7 (2), p8 (3)
C   r15 = n - 1; for lshift only, up and rp are moved forward by 8 * r15
C         bytes so that both walk downwards with UPD = -8
C   ar.lc = (n - 5) >> 2, logical shift.  For n < 5 this value is huge but no
C         br.cloop is executed on those paths.
C   r10 = first limb; r11 = lfetch pointer
C The .bbb bundle dispatches on n mod 4 and falls into .Lb00 for 0.
 {.mmi;	cmp.lt	p14, p15 = 4, n		C			M I
	and	r14 = 3, n		C			M I
	mov.i	r2 = ar.lc		C			I0
}{.mmi;	add	r15 = -1, n		C			M I
	sub	tnc = 64, cnt		C			M I
	add	r16 = -5, n
	;;
}{.mmi;	cmp.eq	p6, p0 = 1, r14		C			M I
	cmp.eq	p7, p0 = 2, r14		C			M I
	shr.u	n = r16, 2		C			I0
}{.mmi;	cmp.eq	p8, p0 = 3, r14		C			M I
ifdef(`OPERATION_lshift',
`	shladd	up = r15, 3, up		C			M I
	shladd	rp = r15, 3, rp')	C			M I
	;;
}{.mmi;	add	r11 = POFF, up		C			M I
	ld8	r10 = [up], UPD		C			M01
	mov.i	ar.lc = n		C			I0
}{.bbb;
   (p6)	br.dptk	.Lb01
   (p7)	br.dptk	.Lb10
   (p8)	br.dptk	.Lb11
	;; }

C n mod 4 = 0.  Without p14 this is n = 4 and finishes through .Lr4.
.Lb00:	ld8	r19 = [up], UPD
	;;
	ld8	r16 = [up], UPD
	;;
	ld8	r17 = [up], UPD
	BSH	r8 = r10, tnc		C function return value
	;;
	FSH	r24 = r10, cnt
	BSH	r25 = r19, tnc
  (p14)	br.cond.dptk	.grt4
	;;
	FSH	r26 = r19, cnt
	BSH	r27 = r16, tnc
	;;
	FSH	r20 = r16, cnt
	BSH	r21 = r17, tnc
	;;
	or	r14 = r25, r24
	FSH	r22 = r17, cnt
C NOTE(review): r23 is not read anywhere on the .Lr4 path.
	BSH	r23 = r10, tnc
	br	.Lr4

C n mod 4 = 0 and n > 4.  Enters the loop at .Ltop, or goes straight to .Lbot
C when ar.lc is zero.
.grt4:	ld8	r18 = [up], UPD
	FSH	r26 = r19, cnt
	BSH	r27 = r16, tnc
	;;
	ld8	r19 = [up], UPD
	FSH	r20 = r16, cnt
	BSH	r21 = r17, tnc
	;;
	ld8	r16 = [up], UPD
	FSH	r22 = r17, cnt
	BSH	r23 = r18, tnc
	;;
	or	r14 = r25, r24
	ld8	r17 = [up], UPD
	br.cloop.dpnt	.Ltop
	br	.Lbot

C n mod 4 = 1.  With p15 this is n = 1: a single limb, stored at .Lr1.
.Lb01:
  (p15)	BSH	r8 = r10, tnc		C function return value	I
  (p15)	FSH	r22 = r10, cnt		C		I
  (p15)	br.cond.dptk	.Lr1		C return	B

C n mod 4 = 1 and n > 4.  Falls through to .Lr5 when ar.lc is zero.
.grt1:	ld8	r18 = [up], UPD
	;;
	ld8	r19 = [up], UPD
	BSH	r8 = r10, tnc		C function return value
	;;
	ld8	r16 = [up], UPD
	FSH	r22 = r10, cnt
	BSH	r23 = r18, tnc
	;;
	ld8	r17 = [up], UPD
	FSH	r24 = r18, cnt
	BSH	r25 = r19, tnc
	br.cloop.dpnt	.grt5
	;;
	or	r15 = r23, r22
	FSH	r26 = r19, cnt
	BSH	r27 = r16, tnc
	;;
	FSH	r20 = r16, cnt
	BSH	r21 = r17, tnc
	br	.Lr5

C Loop feed-in for n mod 4 = 1, joining the loop at .LL01.
.grt5:	ld8	r18 = [up], UPD
	FSH	r26 = r19, cnt
	BSH	r27 = r16, tnc
	;;
	ld8	r19 = [up], UPD
	FSH	r20 = r16, cnt
	BSH	r21 = r17, tnc
	;;
	or	r15 = r23, r22
	ld8	r16 = [up], UPD
	br	.LL01


C n mod 4 = 2.  Without p14 this is n = 2 and finishes through .Lr2.
.Lb10:	ld8	r17 = [up], UPD
  (p14)	br.cond.dptk	.grt2

	BSH	r8 = r10, tnc		C function return value
	;;
	FSH	r20 = r10, cnt
	BSH	r21 = r17, tnc
	;;
	or	r14 = r21, r20
	FSH	r22 = r17, cnt
	br	.Lr2			C return

C n mod 4 = 2 and n > 4.  Falls through to .Lr6 when ar.lc is zero.
.grt2:	ld8	r18 = [up], UPD
	BSH	r8 = r10, tnc		C function return value
	;;
	ld8	r19 = [up], UPD
	FSH	r20 = r10, cnt
	BSH	r21 = r17, tnc
	;;
	ld8	r16 = [up], UPD
	FSH	r22 = r17, cnt
	BSH	r23 = r18, tnc
	;;
 {.mmi;	ld8	r17 = [up], UPD
	or	r14 = r21, r20
	FSH	r24 = r18, cnt
}{.mib;	nop	0
	BSH	r25 = r19, tnc
	br.cloop.dpnt	.grt6
	;; }

	FSH	r26 = r19, cnt
	BSH	r27 = r16, tnc
	br	.Lr6

C Loop feed-in for n mod 4 = 2, joining the loop at .LL10.
.grt6:	ld8	r18 = [up], UPD
	FSH	r26 = r19, cnt
	BSH	r27 = r16, tnc
	;;
	ld8	r19 = [up], UPD
	br	.LL10


C n mod 4 = 3.  Without p14 this is n = 3 and finishes through .Lr3.
.Lb11:	ld8	r16 = [up], UPD
	;;
	ld8	r17 = [up], UPD
	BSH	r8 = r10, tnc		C function return value
  (p14)	br.cond.dptk	.grt3
	;;

	FSH	r26 = r10, cnt
	BSH	r27 = r16, tnc
	;;
	FSH	r20 = r16, cnt
	BSH	r21 = r17, tnc
	;;
	or	r15 = r27, r26
	FSH	r22 = r17, cnt
	br	.Lr3			C return

C n mod 4 = 3 and n > 4.  Falls through to .Lr7 when ar.lc is zero.
.grt3:	ld8	r18 = [up], UPD
	FSH	r26 = r10, cnt
	BSH	r27 = r16, tnc
	;;
	ld8	r19 = [up], UPD
	FSH	r20 = r16, cnt
	BSH	r21 = r17, tnc
	;;
	ld8	r16 = [up], UPD
	FSH	r22 = r17, cnt
	BSH	r23 = r18, tnc
	;;
	ld8	r17 = [up], UPD
	br.cloop.dpnt	.grt7

	or	r15 = r27, r26
	FSH	r24 = r18, cnt
	BSH	r25 = r19, tnc
	br	.Lr7

C Loop feed-in for n mod 4 = 3, joining the loop at .LL11.
.grt7:	or	r15 = r27, r26
	FSH	r24 = r18, cnt
	BSH	r25 = r19, tnc
	ld8	r18 = [up], UPD
	br	.LL11

C *** MAIN LOOP START ***
C Four limbs per iteration.  Each pair of bundles stores one combined limb,
C combines the next pair of halves, shifts two limbs and loads one limb.
C .Ltop, .LL11, .LL10 and .LL01 are the entry points used by the feed-in
C blocks above.  The lfetch through r11 is issued once per iteration.
	ALIGN(32)
.Ltop:
 {.mmi;	st8	[rp] = r14, UPD		C M2
	or	r15 = r27, r26		C M3
	FSH	r24 = r18, cnt		C I0
}{.mmi;	ld8	r18 = [up], UPD		C M1
	lfetch	[r11], PUPD
	BSH	r25 = r19, tnc		C I1
	;; }
.LL11:
 {.mmi;	st8	[rp] = r15, UPD
	or	r14 = r21, r20
	FSH	r26 = r19, cnt
}{.mmi;	ld8	r19 = [up], UPD
	nop.m	0
	BSH	r27 = r16, tnc
	;; }
.LL10:
 {.mmi;	st8	[rp] = r14, UPD
	or	r15 = r23, r22
	FSH	r20 = r16, cnt
}{.mmi;	ld8	r16 = [up], UPD
	nop.m	0
	BSH	r21 = r17, tnc
	;; }
.LL01:
 {.mmi;	st8	[rp] = r15, UPD
	or	r14 = r25, r24
	FSH	r22 = r17, cnt
}{.mib;	ld8	r17 = [up], UPD
	BSH	r23 = r18, tnc
	br.cloop.dptk	.Ltop
	;; }
C *** MAIN LOOP END ***

C Wind-down.  No further loads; the limbs already in registers are shifted,
C combined and stored.  .Lbot is reached from the loop and from .grt4.  The
C .Lr7 to .Lr1 labels are entered directly by the paths that skip the loop,
C with 7 down to 1 limbs left to store respectively.
.Lbot:
 {.mmi;	st8	[rp] = r14, UPD
	or	r15 = r27, r26
	FSH	r24 = r18, cnt
}{.mib;	nop	0
	BSH	r25 = r19, tnc
	nop	0
	;; }
.Lr7:
 {.mmi;	st8	[rp] = r15, UPD
	or	r14 = r21, r20
	FSH	r26 = r19, cnt
}{.mib;	nop	0
	BSH	r27 = r16, tnc
	nop	0
	;; }
.Lr6:
 {.mmi;	st8	[rp] = r14, UPD
	or	r15 = r23, r22
	FSH	r20 = r16, cnt
}{.mib;	nop	0
	BSH	r21 = r17, tnc
	nop	0
	;; }
.Lr5:	st8	[rp] = r15, UPD
	or	r14 = r25, r24
	FSH	r22 = r17, cnt
	;;
.Lr4:	st8	[rp] = r14, UPD
	or	r15 = r27, r26
	;;
.Lr3:	st8	[rp] = r15, UPD
	or	r14 = r21, r20
	;;
.Lr2:	st8	[rp] = r14, UPD
	;;
C Final limb is the FSH half alone.  ar.lc is restored from r2 before return.
.Lr1:	st8	[rp] = r22, UPD		C			M23
	mov	ar.lc = r2		C			I0
	br.ret.sptk.many b0		C			B
EPILOGUE(func)
ASM_END()