dnl  aors_n.asm revision 1.1.1.2
dnl  IA-64 mpn_add_n/mpn_sub_n -- mpn addition and subtraction.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2003, 2004, 2005, 2010, 2011 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C           cycles/limb
C Itanium:      2.67
C Itanium 2:    1.25

C TODO
C  * Consider using special code for small n, using something like
C    "switch (8 * (n >= 8) + (n mod 8))" to enter it and feed-in code.
C  * The non-nc code was trimmed cycle for cycle to its current state.  It is
C    probably hard to save more than an odd cycle there.  The nc code is much
C    rawer (since tune/speed doesn't have any applicable direct measurements).
C  * Without the nc entry points, this becomes around 1800 bytes of object
C    code; the nc code adds over 1000 bytes.  We should perhaps sacrifice a
C    few cycles for the non-nc code and let it fall into the nc code.

C OVERVIEW
C  Two entry points are generated per operation: func_nc, which takes a
C  carry-in argument, and func, which starts with no carry.  Both dispatch
C  on (n mod 8) to a feed-in sequence that preloads limbs and computes the
C  first partial results, then join a shared loop (L(top) .. L(m67)) that
C  handles eight limbs per iteration, and finally drain through the tail
C  L(end) .. L(cj1).  Short operands (p14 set, i.e. n <= 8) with n mod 8 in
C  1..5 jump straight into the tail.
C
C  Carry handling: w = u ADDSUB v is formed without carry, cmp.CND w,u
C  detects carry/borrow out of that raw operation, and when the previous
C  limb produced a carry the result is adjusted by INCR; cmpeqor against LIM
C  (tested on the value before adjustment) catches the case where that
C  adjustment itself wraps.  Predicates p6..p9 carry the chain from limb to
C  limb.  The final carry/borrow is returned in r8 as 0 or 1.

C INPUT PARAMETERS
define(`rp', `r32')
define(`up', `r33')
define(`vp', `r34')
define(`n',  `r35')
define(`cy', `r36')

C Per-operation parameters.  LIM2 is the wrap test value used when the
C comparison is made after the INCR adjustment instead of before it.
ifdef(`OPERATION_add_n',`
  define(ADDSUB,	add)
  define(CND,		ltu)
  define(INCR,		1)
  define(LIM,		-1)
  define(LIM2,		0)
  define(func,    mpn_add_n)
  define(func_nc, mpn_add_nc)
')
ifdef(`OPERATION_sub_n',`
  define(ADDSUB,	sub)
  define(CND,		gtu)
  define(INCR,		-1)
  define(LIM,		0)
  define(LIM2,		-1)
  define(func,    mpn_sub_n)
  define(func_nc, mpn_sub_nc)
')

define(cmpeqor, `cmp.eq.or')
C Distance in bytes added to the load pointers to form the lfetch pointers.
define(PFDIST, 500)

C Some useful aliases for registers we use
define(`u0',`r14') define(`u1',`r15') define(`u2',`r16') define(`u3',`r17')
define(`v0',`r24') define(`v1',`r25') define(`v2',`r26') define(`v3',`r27')
define(`w0',`r28') define(`w1',`r29') define(`w2',`r30') define(`w3',`r31')
define(`rpx',`r3')
define(`upadv',`r20') define(`vpadv',`r21')

MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)

C ---------------------------------------------------------------------------
C func_nc (rp, up, vp, n, cy)
C   In:   r32 = rp, r33 = up, r34 = vp, r35 = n, r36 = cy (carry-in)
C   Out:  r8 = carry/borrow out (0 or 1)
C   Uses: r2 (saved ar.lc), r3, r10, r11, r14-r21, r23-r31, p6-p15, ar.lc
C   The carry-in is kept in r23 (and in p13 for the .Lc000/.Lc111 paths).
C ---------------------------------------------------------------------------
ASM_START()
PROLOGUE(func_nc)
	.prologue
	.save	ar.lc, r2
	.body
ifdef(`HAVE_ABI_32',`
	addp4	rp = 0, rp		C M I
	addp4	up = 0, up		C M I
	addp4	vp = 0, vp		C M I
	zxt4	n = n			C I
	;;
')

C Load the first limb pair, save ar.lc, and classify n:
C r14 = n mod 8, p15 = (n > 8), p14 = (n <= 8), n = n - 6.
 {.mmi;	ld8	r11 = [vp], 8		C M01
	ld8	r10 = [up], 8		C M01
	mov	r2 = ar.lc		C I0
}{.mmi;	and	r14 = 7, n		C M I
	cmp.lt	p15, p14 = 8, n		C M I
	add	n = -6, n		C M I
	;;
}
.mmi;	add	upadv = PFDIST, up	C Merging these lines into the feed-in
	add	vpadv = PFDIST, vp	C code could save a cycle per call at
	mov	r23 = cy		C the expense of code size.
	;;
{.mmi;	cmp.eq	p6, p0 = 1, r14		C M I
	cmp.eq	p7, p0 = 2, r14		C M I
	cmp.eq	p8, p0 = 3, r14		C M I
}{.bbb
  (p6)	br.dptk	.Lc001			C B
  (p7)	br.dptk	.Lc010			C B
  (p8)	br.dptk	.Lc011			C B
	;;
}
{.mmi;	cmp.eq	p9, p0 = 4, r14		C M I
	cmp.eq	p10, p0 = 5, r14	C M I
	cmp.eq	p11, p0 = 6, r14	C M I
}{.bbb
  (p9)	br.dptk	.Lc100			C B
  (p10)	br.dptk	.Lc101			C B
  (p11)	br.dptk	.Lc110			C B
	;;
}{.mmi;	ld8	r19 = [vp], 8		C M01
	ld8	r18 = [up], 8		C M01
	cmp.ne	p13, p0 = 0, cy		C copy cy to p13	M I
}{.mmb;	cmp.eq	p12, p0 = 7, r14	C M I
	nop	0
  (p12)	br.dptk	.Lc111			C B
	;;
}

C n mod 8 = 0: two limb pairs are already in r10/r11 and r18/r19.
.Lc000:
.mmi;	ld8	v3 = [vp], 8		C M01
	ld8	u3 = [up], 8		C M01
	shr.u	n = n, 3		C I0
	;;
.mmi;	add	vpadv = PFDIST, vp	C M I
	ld8	v0 = [vp], 8		C M01
	mov	ar.lc = n		C I0
.mmi;	ld8	u0 = [up], 8		C M01
	ADDSUB	w1 = r10, r11		C M I
	nop	0
	;;
.mmi;	add	upadv = PFDIST, up	C M I
	ld8	v1 = [vp], 8		C M01
	cmp.CND	p7, p0 = w1, r10	C M I
.mmi;	ld8	u1 = [up], 8		C M01
	ADDSUB	w2 = r18, r19		C M I
	add	rpx = 8, rp		C M I
	;;
.mmi;	ld8	v2 = [vp], 8		C M01
	cmp.CND	p8, p0 = w2, r18	C M I
  (p13)	cmpeqor	p7, p0 = LIM, w1	C M I
.mmi;	ld8	u2 = [up], 8		C M01
  (p13)	add	w1 = INCR, w1		C M I
	ADDSUB	w3 = u3, v3		C M I
	;;
.mmi;	ld8	v3 = [vp], 8		C M01
	cmp.CND	p9, p0 = w3, u3		C M I
  (p7)	cmpeqor	p8, p0 = LIM, w2	C M I
.mmb;	ld8	u3 = [up], 8		C M01
  (p7)	add	w2 = INCR, w2		C M I
	br	L(m0)


C n mod 8 = 1: the single-limb case finishes via L(cj1); otherwise (p15)
C continue at 1: and join the loop at L(c1).
.Lc001:
.mmi;
  (p15)	ld8	v1 = [vp], 8		C M01
  (p15)	ld8	u1 = [up], 8		C M01
	ADDSUB	w0 = r10, r11		C M I
.mmb;	nop	0
	nop	0
  (p15)	br	1f
	;;
.mmi;	cmp.ne	p9, p0 = 0, r23		C M I
	mov	r8 = 0
	cmp.CND	p6, p0 = w0, r10	C M I
	;;
.mmb;
  (p9)	cmpeqor	p6, p0 = LIM, w0	C M I
  (p9)	add	w0 = INCR, w0		C M I
	br	L(cj1)			C B
1:
.mmi;	ld8	v2 = [vp], 8		C M01
	ld8	u2 = [up], 8		C M01
	shr.u	n = n, 3		C I0
	;;
.mmi;	ld8	v3 = [vp], 8		C M01
	ld8	u3 = [up], 8		C M01
	mov	ar.lc = n		C I0
.mmi;	nop	0
	cmp.ne	p9, p0 = 0, r23		C M I
	nop	0
	;;
.mmi;	ld8	v0 = [vp], 8		C M01
	cmp.CND	p6, p0 = w0, r10	C M I
	add	rpx = 16, rp		C M I
.mmb;	ld8	u0 = [up], 8		C M01
	ADDSUB	w1 = u1, v1		C M I
	br	L(c1)			C B


C n mod 8 = 2: the two-limb case applies the carry-in first and tests the
C adjusted value against LIM2, then finishes via L(cj2).
.Lc010:
.mmi;	ld8	v0 = [vp], 8		C M01
	ld8	u0 = [up], 8		C M01
	mov	r8 = 0			C M I
.mmb;	ADDSUB	w3 = r10, r11		C M I
	cmp.ne	p8, p0 = 0, r23		C M I
  (p15)	br	1f			C B
	;;
.mmi;	cmp.CND	p9, p0 = w3, r10	C M I
	ADDSUB	w0 = u0, v0		C M I
  (p8)	add	w3 = INCR, w3		C M I
	;;
.mmb;	cmp.CND	p6, p0 = w0, u0		C M I
  (p8)	cmpeqor	p9, p0 = LIM2, w3	C M I
	br	L(cj2)			C B
1:
.mmi;	ld8	v1 = [vp], 8		C M01
	ld8	u1 = [up], 8		C M01
	shr.u	n = n, 3		C I0
	;;
.mmi;	ld8	v2 = [vp], 8		C M01
	ld8	u2 = [up], 8		C M01
	mov	ar.lc = n		C I0
	;;
.mmi;	ld8	v3 = [vp], 8		C M01
	ld8	u3 = [up], 8		C M01
	cmp.CND	p9, p0 = w3, r10	C M I
	;;
.mmi;
  (p8)	cmpeqor	p9, p0 = LIM, w3	C M I
  (p8)	add	w3 = INCR, w3		C M I
	ADDSUB	w0 = u0, v0		C M I
.mmb;	add	rpx = 24, rp		C M I
	nop	0
	br	L(m23)			C B


C n mod 8 = 3: the three-limb case finishes via L(cj3).
.Lc011:
.mmi;	ld8	v3 = [vp], 8		C M01
	ld8	u3 = [up], 8		C M01
	shr.u	n = n, 3		C I0
.mmi;	ADDSUB	w2 = r10, r11		C M I
	cmp.ne	p7, p0 = 0, r23		C M I
	nop	0
	;;
.mmb;	ld8	v0 = [vp], 8		C M01
	ld8	u0 = [up], 8		C M01
  (p15)	br	1f			C B
.mmi;	cmp.CND	p8, p0 = w2, r10	C M I
	ADDSUB	w3 = u3, v3		C M I
	nop	0
	;;
.mmb;
  (p7)	cmpeqor	p8, p0 = LIM, w2	C M I
  (p7)	add	w2 = INCR, w2		C M I
	br	L(cj3)			C B
1:
.mmi;	ld8	v1 = [vp], 8		C M01
	ld8	u1 = [up], 8		C M01
	ADDSUB	w3 = u3, v3		C M I
	;;
.mmi;	ld8	v2 = [vp], 8		C M01
	ld8	u2 = [up], 8		C M01
	cmp.CND	p8, p0 = w2, r10	C M I
	;;
.mmi;	ld8	v3 = [vp], 8		C M01
	cmp.CND	p9, p0 = w3, u3		C M I
	mov	ar.lc = n		C I0
.mmi;	ld8	u3 = [up], 8		C M01
  (p7)	cmpeqor	p8, p0 = LIM, w2	C M I
  (p7)	add	w2 = INCR, w2		C M I
	;;
.mmi;	add	rpx = 32, rp		C M I
	st8	[rp] = w2, 8		C M23
  (p8)	cmpeqor	p9, p0 = LIM, w3	C M I
.mmb;
  (p8)	add	w3 = INCR, w3		C M I
	ADDSUB	w0 = u0, v0		C M I
	br	L(m23)


C n mod 8 = 4: the four-limb case (p14) finishes via L(cj4).
.Lc100:
.mmi;	ld8	v2 = [vp], 8		C M01
	ld8	u2 = [up], 8		C M01
	shr.u	n = n, 3		C I0
.mmi;	ADDSUB	w1 = r10, r11		C M I
	nop	0
	nop	0
	;;
.mmi;	ld8	v3 = [vp], 8		C M01
	ld8	u3 = [up], 8		C M01
	add	rpx = 8, rp		C M I
.mmi;	cmp.ne	p6, p0 = 0, r23		C M I
	cmp.CND	p7, p0 = w1, r10	C M I
	nop	0
	;;
.mmi;	ld8	v0 = [vp], 8		C M01
	ld8	u0 = [up], 8		C M01
	ADDSUB	w2 = u2, v2		C M I
.mmb;
  (p6)	cmpeqor	p7, p0 = LIM, w1	C M I
  (p6)	add	w1 = INCR, w1		C M I
  (p14)	br	L(cj4)
	;;
.mmi;	ld8	v1 = [vp], 8		C M01
	ld8	u1 = [up], 8		C M01
	mov	ar.lc = n		C I0
	;;
.mmi;	ld8	v2 = [vp], 8		C M01
	cmp.CND	p8, p0 = w2, u2		C M I
	nop	0
.mmi;	ld8	u2 = [up], 8		C M01
	nop	0
	ADDSUB	w3 = u3, v3		C M I
	;;
.mmi;	ld8	v3 = [vp], 8		C M01
	cmp.CND	p9, p0 = w3, u3		C M I
  (p7)	cmpeqor	p8, p0 = LIM, w2	C M I
.mmb;	ld8	u3 = [up], 8		C M01
  (p7)	add	w2 = INCR, w2		C M I
	br	L(m4)


C n mod 8 = 5: ar.lc is written before the size test here, so the
C five-limb case goes to L(end), which restores ar.lc from r2.
.Lc101:
.mmi;	ld8	v1 = [vp], 8		C M01
	ld8	u1 = [up], 8		C M01
	shr.u	n = n, 3		C I0
	;;
.mmi;	ld8	v2 = [vp], 8		C M01
	ld8	u2 = [up], 8		C M01
	mov	ar.lc = n		C I0
	;;
.mmi;	ld8	v3 = [vp], 8		C M01
	ld8	u3 = [up], 8		C M01
	ADDSUB	w0 = r10, r11		C M I
.mmi;	cmp.ne	p9, p0 = 0, r23		C M I
	add	rpx = 16, rp		C M I
	nop	0
	;;
C Both loads go in the M slots; the compare takes the I slot.
.mmi;	ld8	v0 = [vp], 8		C M01
	ld8	u0 = [up], 8		C M01
	cmp.CND	p6, p0 = w0, r10	C M I
.mbb;	ADDSUB	w1 = u1, v1		C M I
  (p15)	br	L(c5)			C B
	br	L(end)			C B


C n mod 8 = 6: always joins the loop at L(m67).
.Lc110:
.mmi;	ld8	v0 = [vp], 8		C M01
	ld8	u0 = [up], 8		C M01
	shr.u	n = n, 3		C I0
	;;
.mmi;	add	upadv = PFDIST, up	C M I
	add	vpadv = PFDIST, vp	C M I
	mov	ar.lc = n		C I0
.mmi;	ld8	v1 = [vp], 8		C M01
	ld8	u1 = [up], 8		C M01
	ADDSUB	w3 = r10, r11		C M I
	;;
.mmi;	ld8	v2 = [vp], 8		C M01
	ld8	u2 = [up], 8		C M01
	ADDSUB	w0 = u0, v0		C M I
.mmi;	cmp.CND	p9, p0 = w3, r10	C M I
	cmp.ne	p8, p0 = 0, r23		C M I
	add	rpx = 24, rp		C M I
	;;
.mmi;	ld8	v3 = [vp], 8		C M01
	ld8	u3 = [up], 8		C M01
	nop	0
.mmb;
  (p8)	cmpeqor	p9, p0 = LIM, w3	C M I
  (p8)	add	w3 = INCR, w3		C M I
	br	L(m67)			C B


C n mod 8 = 7: two limb pairs are already in r10/r11 and r18/r19; the
C first result limb is stored here before joining the loop at L(m67).
.Lc111:
.mmi;	ld8	v0 = [vp], 8		C M01
	ld8	u0 = [up], 8		C M01
	shr.u	n = n, 3		C I0
	;;
.mmi;	add	upadv = PFDIST, up	C M I
	ld8	v1 = [vp], 8		C M01
	mov	ar.lc = n		C I0
.mmi;	ld8	u1 = [up], 8		C M01
	ADDSUB	w2 = r10, r11		C M I
	nop	0
	;;
.mmi;	add	vpadv = PFDIST, vp	C M I
	ld8	v2 = [vp], 8		C M01
	cmp.CND	p8, p0 = w2, r10	C M I
.mmi;	ld8	u2 = [up], 8		C M01
	ADDSUB	w3 = r18, r19		C M I
	nop	0
	;;
.mmi;	ld8	v3 = [vp], 8		C M01
	cmp.CND	p9, p0 = w3, r18	C M I
  (p13)	cmpeqor	p8, p0 = LIM, w2	C M I
.mmi;	ld8	u3 = [up], 8		C M01
  (p13)	add	w2 = INCR, w2		C M I
	nop	0
	;;
.mmi;	add	rpx = 32, rp		C M I
	st8	[rp] = w2, 8		C M23
  (p8)	cmpeqor	p9, p0 = LIM, w3	C M I
.mmb;
  (p8)	add	w3 = INCR, w3		C M I
	ADDSUB	w0 = u0, v0		C M I
	br	L(m67)

EPILOGUE()

C ---------------------------------------------------------------------------
C func (rp, up, vp, n)
C   In:   r32 = rp, r33 = up, r34 = vp, r35 = n
C   Out:  r8 = carry/borrow out (0 or 1)
C   Same structure as func_nc but with no carry-in, so the first limb of
C   each feed-in path needs no INCR adjustment.  The shared loop and tail
C   at the end of this function are also the branch targets of func_nc.
C ---------------------------------------------------------------------------
ASM_START()
PROLOGUE(func)
	.prologue
	.save	ar.lc, r2
	.body
ifdef(`HAVE_ABI_32',`
	addp4	rp = 0, rp		C M I
	addp4	up = 0, up		C M I
	addp4	vp = 0, vp		C M I
	zxt4	n = n			C I
	;;
')

C Load the first limb pair, save ar.lc, and classify n:
C r14 = n mod 8, p15 = (n > 8), p14 = (n <= 8), n = n - 6.
 {.mmi;	ld8	r11 = [vp], 8		C M01
	ld8	r10 = [up], 8		C M01
	mov	r2 = ar.lc		C I0
}{.mmi;	and	r14 = 7, n		C M I
	cmp.lt	p15, p14 = 8, n		C M I
	add	n = -6, n		C M I
	;;
}{.mmi;	cmp.eq	p6, p0 = 1, r14		C M I
	cmp.eq	p7, p0 = 2, r14		C M I
	cmp.eq	p8, p0 = 3, r14		C M I
}{.bbb
  (p6)	br.dptk	.Lb001			C B
  (p7)	br.dptk	.Lb010			C B
  (p8)	br.dptk	.Lb011			C B
	;;
}{.mmi;	cmp.eq	p9, p0 = 4, r14		C M I
	cmp.eq	p10, p0 = 5, r14	C M I
	cmp.eq	p11, p0 = 6, r14	C M I
}{.bbb
  (p9)	br.dptk	.Lb100			C B
  (p10)	br.dptk	.Lb101			C B
  (p11)	br.dptk	.Lb110			C B
	;;
}{.mmi;	ld8	r19 = [vp], 8		C M01
	ld8	r18 = [up], 8		C M01
	cmp.ne	p13, p0 = r0, r0	C clear "CF"		M I
}{.mmb;	cmp.eq	p12, p0 = 7, r14	C M I
	mov	r23 = 0			C M I
  (p12)	br.dptk	.Lb111			C B
	;;
}

C n mod 8 = 0
.Lb000:
.mmi;	ld8	v3 = [vp], 8		C M01
	ld8	u3 = [up], 8		C M01
	shr.u	n = n, 3		C I0
	;;
.mmi;	ld8	v0 = [vp], 8		C M01
	ld8	u0 = [up], 8		C M01
	ADDSUB	w1 = r10, r11		C M I
	;;
.mmi;	ld8	v1 = [vp], 8		C M01
	cmp.CND	p7, p0 = w1, r10	C M I
	mov	ar.lc = n		C I0
.mmi;	ld8	u1 = [up], 8		C M01
	ADDSUB	w2 = r18, r19		C M I
	add	rpx = 8, rp		C M I
	;;
.mmi;	add	upadv = PFDIST, up
	add	vpadv = PFDIST, vp
	cmp.CND	p8, p0 = w2, r18	C M I
.mmi;	ld8	v2 = [vp], 8		C M01
	ld8	u2 = [up], 8		C M01
	ADDSUB	w3 = u3, v3		C M I
	;;
.mmi;	ld8	v3 = [vp], 8		C M01
	cmp.CND	p9, p0 = w3, u3		C M I
  (p7)	cmpeqor	p8, p0 = LIM, w2	C M I
.mmb;	ld8	u3 = [up], 8		C M01
  (p7)	add	w2 = INCR, w2		C M I
	br	L(m0)			C B


C n mod 8 = 1: the single-limb case (p14) finishes via L(cj1).
	ALIGN(32)
.Lb001:
.mmi;	ADDSUB	w0 = r10, r11		C M I
  (p15)	ld8	v1 = [vp], 8		C M01
	mov	r8 = 0			C M I
	;;
.mmb;	cmp.CND	p6, p0 = w0, r10	C M I
  (p15)	ld8	u1 = [up], 8		C M01
  (p14)	br	L(cj1)			C B
	;;
.mmi;	add	upadv = PFDIST, up
	add	vpadv = PFDIST, vp
	shr.u	n = n, 3		C I0
.mmi;	ld8	v2 = [vp], 8		C M01
	ld8	u2 = [up], 8		C M01
	cmp.CND	p6, p0 = w0, r10	C M I
	;;
.mmi;	ld8	v3 = [vp], 8		C M01
	ld8	u3 = [up], 8		C M01
	mov	ar.lc = n		C I0
	;;
.mmi;	ld8	v0 = [vp], 8		C M01
	ld8	u0 = [up], 8		C M01
	ADDSUB	w1 = u1, v1		C M I
	;;
.mmi;	ld8	v1 = [vp], 8		C M01
	cmp.CND	p7, p0 = w1, u1		C M I
	ADDSUB	w2 = u2, v2		C M I
.mmb;	ld8	u1 = [up], 8		C M01
	add	rpx = 16, rp		C M I
	br	L(m1)			C B


C n mod 8 = 2: the two-limb case finishes via L(cj2).
	ALIGN(32)
.Lb010:
.mmi;	ld8	v0 = [vp], 8		C M01
	ld8	u0 = [up], 8		C M01
	shr.u	n = n, 3		C I0
.mmb;	ADDSUB	w3 = r10, r11		C M I
	nop	0
  (p15)	br	L(gt2)			C B
	;;
.mmi;	cmp.CND	p9, p0 = w3, r10	C M I
	ADDSUB	w0 = u0, v0		C M I
	mov	r8 = 0			C M I
	;;
.mmb;	nop	0
	cmp.CND	p6, p0 = w0, u0		C M I
	br	L(cj2)			C B
L(gt2):
.mmi;	ld8	v1 = [vp], 8		C M01
	ld8	u1 = [up], 8		C M01
	nop	0
	;;
.mmi;	add	upadv = PFDIST, up
	add	vpadv = PFDIST, vp
	mov	ar.lc = n		C I0
.mmi;	ld8	v2 = [vp], 8		C M01
	ld8	u2 = [up], 8		C M01
	nop	0
	;;
.mmi;	ld8	v3 = [vp], 8		C M01
	cmp.CND	p9, p0 = w3, r10	C M I
	ADDSUB	w0 = u0, v0		C M I
.mmb;	ld8	u3 = [up], 8		C M01
	add	rpx = 24, rp		C M I
	br	L(m23)			C B


C n mod 8 = 3: the three-limb case finishes via L(cj3).
	ALIGN(32)
.Lb011:
.mmi;	ld8	v3 = [vp], 8		C M01
	ld8	u3 = [up], 8		C M01
	ADDSUB	w2 = r10, r11		C M I
	;;
.mmb;	ld8	v0 = [vp], 8		C M01
	ld8	u0 = [up], 8		C M01
  (p15)	br	1f			C B
.mmb;	cmp.CND	p8, p0 = w2, r10	C M I
	ADDSUB	w3 = u3, v3		C M I
	br	L(cj3)			C B
1:
.mmi;	ld8	v1 = [vp], 8		C M01
	ld8	u1 = [up], 8		C M01
	shr.u	n = n, 3		C I0
	;;
.mmi;	add	upadv = PFDIST, up
	add	vpadv = PFDIST, vp
	ADDSUB	w3 = u3, v3		C M I
.mmi;	ld8	v2 = [vp], 8		C M01
	ld8	u2 = [up], 8		C M01
	cmp.CND	p8, p0 = w2, r10	C M I
	;;
.mmi;	ld8	v3 = [vp], 8		C M01
	cmp.CND	p9, p0 = w3, u3		C M I
	mov	ar.lc = n		C I0
.mmi;	ld8	u3 = [up], 8		C M01
	nop	0
	nop	0
	;;
.mmi;	add	rpx = 32, rp		C M I
	st8	[rp] = w2, 8		C M23
  (p8)	cmpeqor	p9, p0 = LIM, w3	C M I
.mmb;
  (p8)	add	w3 = INCR, w3		C M I
	ADDSUB	w0 = u0, v0		C M I
	br	L(m23)			C B


C n mod 8 = 4: the four-limb case (p14) finishes via L(cj4).
	ALIGN(32)
.Lb100:
.mmi;	ld8	v2 = [vp], 8		C M01
	ld8	u2 = [up], 8		C M01
	shr.u	n = n, 3		C I0
	;;
.mmi;	ld8	v3 = [vp], 8		C M01
	ld8	u3 = [up], 8		C M01
	ADDSUB	w1 = r10, r11		C M I
	;;
.mmi;	ld8	v0 = [vp], 8		C M01
	ld8	u0 = [up], 8		C M01
	cmp.CND	p7, p0 = w1, r10	C M I
.mmb;	nop	0
	ADDSUB	w2 = u2, v2		C M I
  (p14)	br	L(cj4)			C B
	;;
L(gt4):
.mmi;	add	upadv = PFDIST, up
	add	vpadv = PFDIST, vp
	mov	ar.lc = n		C I0
.mmi;	ld8	v1 = [vp], 8		C M01
	ld8	u1 = [up], 8		C M01
	nop	0
	;;
.mmi;	ld8	v2 = [vp], 8		C M01
	cmp.CND	p8, p0 = w2, u2		C M I
	nop	0
.mmi;	ld8	u2 = [up], 8		C M01
	ADDSUB	w3 = u3, v3		C M I
	add	rpx = 8, rp		C M I
	;;
.mmi;	ld8	v3 = [vp], 8		C M01
	cmp.CND	p9, p0 = w3, u3		C M I
  (p7)	cmpeqor	p8, p0 = LIM, w2	C M I
.mmb;	ld8	u3 = [up], 8		C M01
  (p7)	add	w2 = INCR, w2		C M I
	br	L(m4)			C B


C n mod 8 = 5: the five-limb case (p14) finishes via L(cj5); ar.lc has not
C been written on that path.
	ALIGN(32)
.Lb101:
.mmi;	ld8	v1 = [vp], 8		C M01
	ld8	u1 = [up], 8		C M01
	shr.u	n = n, 3		C I0
	;;
.mmi;	ld8	v2 = [vp], 8		C M01
	ld8	u2 = [up], 8		C M01
	ADDSUB	w0 = r10, r11		C M I
	;;
.mmi;	add	upadv = PFDIST, up
	add	vpadv = PFDIST, vp
	add	rpx = 16, rp		C M I
.mmi;	ld8	v3 = [vp], 8		C M01
	ld8	u3 = [up], 8		C M01
	nop	0
	;;
.mmi;	ld8	v0 = [vp], 8		C M01
	cmp.CND	p6, p0 = w0, r10	C M I
	nop	0
.mmb;	ld8	u0 = [up], 8		C M01
	ADDSUB	w1 = u1, v1		C M I
  (p14)	br	L(cj5)			C B
	;;
L(gt5):
.mmi;	ld8	v1 = [vp], 8		C M01
	cmp.CND	p7, p0 = w1, u1		C M I
	mov	ar.lc = n		C I0
.mmb;	ld8	u1 = [up], 8		C M01
	ADDSUB	w2 = u2, v2		C M I
	br	L(m5)			C B


C n mod 8 = 6
	ALIGN(32)
.Lb110:
.mmi;	ld8	v0 = [vp], 8		C M01
	ld8	u0 = [up], 8		C M01
	shr.u	n = n, 3		C I0
	;;
.mmi;	ld8	v1 = [vp], 8		C M01
	ld8	u1 = [up], 8		C M01
	ADDSUB	w3 = r10, r11		C M I
	;;
.mmi;	add	upadv = PFDIST, up
	add	vpadv = PFDIST, vp
	mov	ar.lc = n		C I0
.mmi;	ld8	v2 = [vp], 8		C M01
	ld8	u2 = [up], 8		C M01
	nop	0
	;;
.mmi;	ld8	v3 = [vp], 8		C M01
	cmp.CND	p9, p0 = w3, r10	C M I
	ADDSUB	w0 = u0, v0		C M I
.mmb;	ld8	u3 = [up], 8		C M01
	add	rpx = 24, rp		C M I
	br	L(m67)			C B


C n mod 8 = 7
	ALIGN(32)
.Lb111:
.mmi;	ld8	v0 = [vp], 8		C M01
	ld8	u0 = [up], 8		C M01
	shr.u	n = n, 3		C I0
	;;
.mmi;	ld8	v1 = [vp], 8		C M01
	ld8	u1 = [up], 8		C M01
	ADDSUB	w2 = r10, r11		C M I
	;;
.mmi;	ld8	v2 = [vp], 8		C M01
	cmp.CND	p8, p0 = w2, r10	C M I
	mov	ar.lc = n		C I0
.mmi;	ld8	u2 = [up], 8		C M01
	ADDSUB	w3 = r18, r19		C M I
	nop	0
	;;
.mmi;	add	upadv = PFDIST, up
	add	vpadv = PFDIST, vp
	nop	0
.mmi;	ld8	v3 = [vp], 8		C M01
	ld8	u3 = [up], 8		C M01
	cmp.CND	p9, p0 = w3, r18	C M I
	;;
.mmi;	add	rpx = 32, rp		C M I
	st8	[rp] = w2, 8		C M23
  (p8)	cmpeqor	p9, p0 = LIM, w3	C M I
.mmb;
  (p8)	add	w3 = INCR, w3		C M I
	ADDSUB	w0 = u0, v0		C M I
	br	L(m67)			C B


C *** MAIN LOOP START ***
C Eight limbs per iteration.  Results are stored through two pointers, rp
C and rpx.  The two lfetch instructions alternate between the up and vp
C streams.  The inner labels are the join points for the feed-in code.
	ALIGN(32)
L(top):
L(c5):	ld8	v1 = [vp], 8		C M01
	cmp.CND	p7, p0 = w1, u1		C M I
  (p9)	cmpeqor	p6, p0 = LIM, w0	C M I
	ld8	u1 = [up], 8		C M01
  (p9)	add	w0 = INCR, w0		C M I
	ADDSUB	w2 = u2, v2		C M I
	;;
L(m5):	ld8	v2 = [vp], 8		C M01
	cmp.CND	p8, p0 = w2, u2		C M I
  (p6)	cmpeqor	p7, p0 = LIM, w1	C M I
	ld8	u2 = [up], 8		C M01
  (p6)	add	w1 = INCR, w1		C M I
	ADDSUB	w3 = u3, v3		C M I
	;;
	st8	[rp] = w0, 8		C M23
	ld8	v3 = [vp], 8		C M01
	cmp.CND	p9, p0 = w3, u3		C M I
  (p7)	cmpeqor	p8, p0 = LIM, w2	C M I
	ld8	u3 = [up], 8		C M01
  (p7)	add	w2 = INCR, w2		C M I
	;;
L(m4):	st8	[rp] = w1, 16		C M23
	st8	[rpx] = w2, 32		C M23
  (p8)	cmpeqor	p9, p0 = LIM, w3	C M I
	lfetch	[upadv], 64
  (p8)	add	w3 = INCR, w3		C M I
	ADDSUB	w0 = u0, v0		C M I
	;;
L(m23):	st8	[rp] = w3, 8		C M23
	ld8	v0 = [vp], 8		C M01
	cmp.CND	p6, p0 = w0, u0		C M I
	ld8	u0 = [up], 8		C M01
	ADDSUB	w1 = u1, v1		C M I
	nop.b	0
	;;
L(c1):	ld8	v1 = [vp], 8		C M01
	cmp.CND	p7, p0 = w1, u1		C M I
  (p9)	cmpeqor	p6, p0 = LIM, w0	C M I
	ld8	u1 = [up], 8		C M01
  (p9)	add	w0 = INCR, w0		C M I
	ADDSUB	w2 = u2, v2		C M I
	;;
L(m1):	ld8	v2 = [vp], 8		C M01
	cmp.CND	p8, p0 = w2, u2		C M I
  (p6)	cmpeqor	p7, p0 = LIM, w1	C M I
	ld8	u2 = [up], 8		C M01
  (p6)	add	w1 = INCR, w1		C M I
	ADDSUB	w3 = u3, v3		C M I
	;;
	st8	[rp] = w0, 8		C M23
	ld8	v3 = [vp], 8		C M01
	cmp.CND	p9, p0 = w3, u3		C M I
  (p7)	cmpeqor	p8, p0 = LIM, w2	C M I
	ld8	u3 = [up], 8		C M01
  (p7)	add	w2 = INCR, w2		C M I
	;;
L(m0):	st8	[rp] = w1, 16		C M23
	st8	[rpx] = w2, 32		C M23
  (p8)	cmpeqor	p9, p0 = LIM, w3	C M I
	lfetch	[vpadv], 64
  (p8)	add	w3 = INCR, w3		C M I
	ADDSUB	w0 = u0, v0		C M I
	;;
L(m67):	st8	[rp] = w3, 8		C M23
	ld8	v0 = [vp], 8		C M01
	cmp.CND	p6, p0 = w0, u0		C M I
	ld8	u0 = [up], 8		C M01
	ADDSUB	w1 = u1, v1		C M I
	br.cloop.dptk	L(top)		C B
	;;
C *** MAIN LOOP END ***

C Tail: drain the limbs still in flight.  L(end) restores ar.lc from r2;
C L(cj5) .. L(cj1) are also entered directly by the short-operand paths.
C r8 is cleared in the L(cj3) stage (or by the short paths that enter below
C it) and set to 1 at L(cj1) when the last limb produced a carry/borrow.
L(end):
.mmi;
  (p9)	cmpeqor	p6, p0 = LIM, w0	C M I
  (p9)	add	w0 = INCR, w0		C M I
	mov	ar.lc = r2		C I0
L(cj5):
.mmi;	cmp.CND	p7, p0 = w1, u1		C M I
	ADDSUB	w2 = u2, v2		C M I
	nop	0
	;;
.mmi;	st8	[rp] = w0, 8		C M23
  (p6)	cmpeqor	p7, p0 = LIM, w1	C M I
  (p6)	add	w1 = INCR, w1		C M I
L(cj4):
.mmi;	cmp.CND	p8, p0 = w2, u2		C M I
	ADDSUB	w3 = u3, v3		C M I
	nop	0
	;;
.mmi;	st8	[rp] = w1, 8		C M23
  (p7)	cmpeqor	p8, p0 = LIM, w2	C M I
  (p7)	add	w2 = INCR, w2		C M I
L(cj3):
.mmi;	cmp.CND	p9, p0 = w3, u3		C M I
	ADDSUB	w0 = u0, v0		C M I
	nop	0
	;;
.mmi;	st8	[rp] = w2, 8		C M23
  (p8)	cmpeqor	p9, p0 = LIM, w3	C M I
  (p8)	add	w3 = INCR, w3		C M I
.mmi;	cmp.CND	p6, p0 = w0, u0		C M I
	nop	0
	mov	r8 = 0			C M I
	;;
L(cj2):
.mmi;	st8	[rp] = w3, 8		C M23
  (p9)	cmpeqor	p6, p0 = LIM, w0	C M I
  (p9)	add	w0 = INCR, w0		C M I
	;;
L(cj1):
.mmb;	st8	[rp] = w0, 8		C M23
  (p6)	mov	r8 = 1			C M I
	br.ret.sptk.many b0		C B
EPILOGUE()
ASM_END()