1dnl IA-64 mpn_addmul_2 -- Multiply a n-limb number with a 2-limb number and 2dnl add the result to a (n+1)-limb number. 3 4dnl Copyright 2004, 2005 Free Software Foundation, Inc. 5 6dnl This file is part of the GNU MP Library. 7 8dnl The GNU MP Library is free software; you can redistribute it and/or modify 9dnl it under the terms of the GNU Lesser General Public License as published 10dnl by the Free Software Foundation; either version 3 of the License, or (at 11dnl your option) any later version. 12 13dnl The GNU MP Library is distributed in the hope that it will be useful, but 14dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 15dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public 16dnl License for more details. 17 18dnl You should have received a copy of the GNU Lesser General Public License 19dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 20 21include(`../config.m4') 22 23C cycles/limb 24C Itanium: 3.65 25C Itanium 2: 1.625 26 27C Note that this is very similar to mul_2.asm. If you change this file, 28C please change that file too. 29 30C TODO 31C * Clean up variable names, and try to decrease the number of distinct 32C registers used. 33C * Cleanup feed-in code to not require zeroing several registers. 34C * Make sure we don't depend on uninitialized predicate registers. 35C * We currently cross-jump very aggressively, at the expense of a few cycles 36C per operation. Consider changing that. 37C * Could perhaps save a few cycles by using 1 c/l carry propagation in 38C wind-down code. 39C * Ultimately rewrite. The problem with this code is that it first uses a 40C loaded u value in one xma pair, then leaves it live over several unrelated 41C xma pairs, before it uses it again. It should actually be quite possible 42C to just swap some aligned xma pairs around. But we should then schedule 43C u loads further from the first use. 
C INPUT PARAMETERS
C   rp  store pointer into the result vector (also read for the first two
C       addend limbs in the prologue)
C   up  load pointer into the multiplicand vector
C   n   limb count; overwritten with the ar.lc trip count in the prologue
C   vp  pointer to the two multiplier limbs, loaded into v0 and v1
define(`rp',`r32')
define(`up',`r33')
define(`n',`r34')
define(`vp',`r35')

C Second pointer into the rp vector.  It is used for all addend loads after
C the first two, so that loads can run ahead of the stores done through rp.
define(`srp',`r3')

C The two multiplier limbs.
define(`v0',`f6')
define(`v1',`f7')

C s0 is the limb that gets stored; acc0 is the running partial sum feeding it.
C Note that r14 (= s0) is also used under its raw name in the prologue to
C hold n mod 4.
define(`s0',`r14')
define(`acc0',`r15')

C Integer copies of the product halves, one per pipeline slot 0..3:
C   pr0_k   = low  half of u*v0 + r                 (from fp0b_k)
C   pr1_k   = low  half of u*v1 + high(u*v0 + r)    (from fp1b_k)
C   acc1_k  = high half of u*v1 + high(u*v0 + r)    (from fp2a_k)
define(`pr0_0',`r16') define(`pr0_1',`r17')
define(`pr0_2',`r18') define(`pr0_3',`r19')

define(`pr1_0',`r20') define(`pr1_1',`r21')
define(`pr1_2',`r22') define(`pr1_3',`r23')

define(`acc1_0',`r24') define(`acc1_1',`r25')
define(`acc1_2',`r26') define(`acc1_3',`r27')

C r28-r30 have no alias and are not referenced below; r31 is used under its
C raw name as scratch in the short .Lb10 path.
dnl define(`',`r28')
dnl define(`',`r29')
dnl define(`',`r30')
dnl define(`',`r31')

C Floating-point product registers, one per pipeline slot 0..3:
C   fp0b_k = xma.l  (u, v0, r)        fp1a_k = xma.hu (u, v0, r)
C   fp1b_k = xma.l  (u, v1, fp1a_k)   fp2a_k = xma.hu (u, v1, fp1a_k)
define(`fp0b_0',`f8') define(`fp0b_1',`f9')
define(`fp0b_2',`f10') define(`fp0b_3',`f11')

define(`fp1a_0',`f12') define(`fp1a_1',`f13')
define(`fp1a_2',`f14') define(`fp1a_3',`f15')

define(`fp1b_0',`f32') define(`fp1b_1',`f33')
define(`fp1b_2',`f34') define(`fp1b_3',`f35')

define(`fp2a_0',`f36') define(`fp2a_1',`f37')
define(`fp2a_2',`f38') define(`fp2a_3',`f39')

C Addend limbs loaded through srp, and multiplicand limbs loaded through up.
define(`r_0',`f40') define(`r_1',`f41')
define(`r_2',`f42') define(`r_3',`f43')

define(`u_0',`f44') define(`u_1',`f45')
define(`u_2',`f46') define(`u_3',`f47')

C The first two addend limbs (rx, ry) and multiplicand limbs (ux, uy), loaded
C in the prologue before the dispatch on n mod 4.
define(`rx',`f48')
define(`ux',`f49')
define(`ry',`f50')
define(`uy',`f51')

C mpn_addmul_2 (rp, up, n, vp)
C
C Overview of the code below:
C   1. Prologue: load the first two limbs of up and rp and both limbs of vp,
C      save ar.lc in r2, set ar.lc = (n-2) >> 2 and dispatch on n mod 4.
C   2. One feed-in block per residue (.Lb00, .Lb01, .Lb10, .Lb11) primes the
C      software pipeline and then jumps either into the main loop (.LLxx) or,
C      when ar.lc is zero, into the wind-down code (.Lcjx).
C   3. Main loop (.Loop): four limbs are stored per iteration.
C   4. Wind-down (.Lcj6 ... .Lcj2): drains the pipeline, stores the final
C      limbs, puts the top limb in r8, restores ar.lc and returns.
C
C NOTE(review): for n = 1 the expression (n-2) >> 2 wraps to a huge unsigned
C count because shr.u is used, and uy is loaded from the second limb of up
C unconditionally.  Presumably callers guarantee n >= 2 -- confirm.
ASM_START()
PROLOGUE(mpn_addmul_2)
	.prologue
	.save ar.lc, r2
	.body

C In the 32-bit ABI, addp4 forms 64-bit addresses from the 32-bit pointer
C arguments and zxt4 zero-extends the limb count.
ifdef(`HAVE_ABI_32',
`	addp4		rp = 0, rp		C M I
	addp4		up = 0, up		C M I
	addp4		vp = 0, vp		C M I
	zxt4		n = n			C I
	;;')

{.mmi		C 00
	ldf8		ux = [up], 8		C M
	ldf8		v0 = [vp], 8		C M
	mov.i		r2 = ar.lc		C I0
}{.mmi
	ldf8		rx = [rp], 8		C M
	and		r14 = 3, n		C M I
	add		n = -2, n		C M I
	;;
}{.mmi		C 01
	ldf8		uy = [up], 8		C M
	ldf8		v1 = [vp]		C M
	shr.u		n = n, 2		C I0
}{.mmi
C The -8 post-increment puts rp back at the first limb, where stores start.
	ldf8		ry = [rp], -8		C M
	cmp.eq		p10, p0 = 1, r14	C M I
	cmp.eq		p11, p0 = 2, r14	C M I
	;;
}{.mmi		C 02
C srp starts at the third limb of rp; rx and ry already cover the first two.
	add		srp = 16, rp		C M I
	cmp.eq		p12, p0 = 3, r14	C M I
	mov.i		ar.lc = n		C I0
}{.bbb
C Dispatch on n mod 4; residue 0 falls through to .Lb00.
  (p10)	br.dptk		.Lb01			C B
  (p11)	br.dptk		.Lb10			C B
  (p12)	br.dptk		.Lb11			C B
	;;
}

C Feed-in for n mod 4 == 0.  The registers that the entry point reads but
C that have no earlier limb behind them (acc1_2, pr1_2, pr0_3) are zeroed,
C and comparing r0 with itself using cmp.ne clears the carry predicates p8
C and p12 (and sets their complements p9 and p13).
	ALIGN(32)
.Lb00:	ldf8		r_1 = [srp], 8
	ldf8		u_1 = [up], 8
	mov		acc1_2 = 0
	mov		pr1_2 = 0
	mov		pr0_3 = 0
	cmp.ne		p8, p9 = r0, r0
	;;
	ldf8		r_2 = [srp], 8
	xma.l		fp0b_3 = ux, v0, rx
	cmp.ne		p12, p13 = r0, r0
	ldf8		u_2 = [up], 8
	xma.hu		fp1a_3 = ux, v0, rx
	br.cloop.dptk	.grt4

C ar.lc was zero: no loop iterations, go straight to the wind-down at .Lcj4.
	xma.l		fp0b_0 = uy, v0, ry
	xma.hu		fp1a_0 = uy, v0, ry
	;;
	getf.sig	acc0 = fp0b_3
	xma.l		fp1b_3 = ux, v1, fp1a_3
	xma.hu		fp2a_3 = ux, v1, fp1a_3
	;;
	xma.l		fp0b_1 = u_1, v0, r_1
	xma.hu		fp1a_1 = u_1, v0, r_1
	;;
	getf.sig	pr0_0 = fp0b_0
	xma.l		fp1b_0 = uy, v1, fp1a_0
	xma.hu		fp2a_0 = uy, v1, fp1a_0
	;;
	getf.sig	pr1_3 = fp1b_3
	getf.sig	acc1_3 = fp2a_3
	xma.l		fp0b_2 = u_2, v0, r_2
	xma.hu		fp1a_2 = u_2, v0, r_2
	br		.Lcj4

C ar.lc was nonzero: same product sequence as above, interleaved with the
C loads for the following limbs, then enter the main loop at .LL00.
.grt4:	xma.l		fp0b_0 = uy, v0, ry
	xma.hu		fp1a_0 = uy, v0, ry
	;;
	ldf8		r_3 = [srp], 8
	getf.sig	acc0 = fp0b_3
	xma.l		fp1b_3 = ux, v1, fp1a_3
	ldf8		u_3 = [up], 8
	xma.hu		fp2a_3 = ux, v1, fp1a_3
	;;
	xma.l		fp0b_1 = u_1, v0, r_1
	xma.hu		fp1a_1 = u_1, v0, r_1
	;;
	ldf8		r_0 = [srp], 8
	getf.sig	pr0_0 = fp0b_0
	xma.l		fp1b_0 = uy, v1, fp1a_0
	xma.hu		fp2a_0 = uy, v1, fp1a_0
	;;
	ldf8		u_0 = [up], 8
	getf.sig	pr1_3 = fp1b_3
	;;
	getf.sig	acc1_3 = fp2a_3
	xma.l		fp0b_2 = u_2, v0, r_2
	xma.hu		fp1a_2 = u_2, v0, r_2
	br		.LL00


C Feed-in for n mod 4 == 1.  Zeroes acc1_1, pr1_1 and pr0_2 and clears the
C carry predicates p6 and p10, matching what .LL01 and .Lcj5 read first.
	ALIGN(32)
.Lb01:	ldf8		r_0 = [srp], 8		C M
	ldf8		u_0 = [up], 8		C M
	mov		acc1_1 = 0		C M I
	mov		pr1_1 = 0		C M I
	mov		pr0_2 = 0		C M I
	cmp.ne		p6, p7 = r0, r0		C M I
	;;
	ldf8		r_1 = [srp], 8		C M
	xma.l		fp0b_2 = ux, v0, rx	C F
	cmp.ne		p10, p11 = r0, r0	C M I
	ldf8		u_1 = [up], 8		C M
	xma.hu		fp1a_2 = ux, v0, rx	C F
	;;
	xma.l		fp0b_3 = uy, v0, ry	C F
	xma.hu		fp1a_3 = uy, v0, ry	C F
	;;
	getf.sig	acc0 = fp0b_2		C M
	ldf8		r_2 = [srp], 8		C M
	xma.l		fp1b_2 = ux, v1,fp1a_2	C F
	xma.hu		fp2a_2 = ux, v1,fp1a_2	C F
	ldf8		u_2 = [up], 8		C M
	br.cloop.dptk	.grt5

C ar.lc was zero: finish through the wind-down code at .Lcj5.
	xma.l		fp0b_0 = u_0, v0, r_0	C F
	xma.hu		fp1a_0 = u_0, v0, r_0	C F
	;;
	getf.sig	pr0_3 = fp0b_3		C M
	xma.l		fp1b_3 = uy, v1,fp1a_3	C F
	xma.hu		fp2a_3 = uy, v1,fp1a_3	C F
	;;
	getf.sig	pr1_2 = fp1b_2		C M
	getf.sig	acc1_2 = fp2a_2		C M
	xma.l		fp0b_1 = u_1, v0, r_1	C F
	xma.hu		fp1a_1 = u_1, v0, r_1	C F
	br		.Lcj5

C ar.lc was nonzero: enter the main loop at .LL01.
.grt5:	xma.l		fp0b_0 = u_0, v0, r_0
	xma.hu		fp1a_0 = u_0, v0, r_0
	;;
	getf.sig	pr0_3 = fp0b_3
	ldf8		r_3 = [srp], 8
	xma.l		fp1b_3 = uy, v1, fp1a_3
	xma.hu		fp2a_3 = uy, v1, fp1a_3
	;;
	ldf8		u_3 = [up], 8
	getf.sig	pr1_2 = fp1b_2
	;;
	getf.sig	acc1_2 = fp2a_2
	xma.l		fp0b_1 = u_1, v0, r_1
	xma.hu		fp1a_1 = u_1, v0, r_1
	br		.LL01


C Feed-in for n mod 4 == 2.  When ar.lc is zero (n == 2 given the count set
C in the prologue) the whole operation is completed right here, without using
C the shared wind-down code: three limbs are stored and r8 is set.
	ALIGN(32)
.Lb10:		C 03
	br.cloop.dptk	.grt2
		C 04
		C 05
		C 06
	xma.l		fp0b_1 = ux, v0, rx
	xma.hu		fp1a_1 = ux, v0, rx
	;;	C 07
	xma.l		fp0b_2 = uy, v0, ry
	xma.hu		fp1a_2 = uy, v0, ry
	;;	C 08
		C 09
		C 10
C The lowest limb needs no carry handling and is stored directly from the
C floating-point register.
	stf8		[rp] = fp0b_1, 8
	xma.l		fp1b_1 = ux, v1, fp1a_1
	xma.hu		fp2a_1 = ux, v1, fp1a_1
	;;	C 11
	getf.sig	acc0 = fp0b_2
	xma.l		fp1b_2 = uy, v1, fp1a_2
	xma.hu		fp2a_2 = uy, v1, fp1a_2
	;;	C 12
		C 13
		C 14
	getf.sig	pr1_1 = fp1b_1
		C 15
	getf.sig	acc1_1 = fp2a_1
		C 16
	getf.sig	pr1_2 = fp1b_2
		C 17
	getf.sig	r8 = fp2a_2
	;;	C 18
		C 19
	add		s0 = pr1_1, acc0
	;;	C 20
	st8		[rp] = s0, 8
C p8 = carry out of the s0 addition above.
	cmp.ltu		p8, p9 = s0, pr1_1
C r31 = -1 - acc1_1, i.e. the complement of acc1_1.  Comparing it with pr1_2
C gives the carry out of pr1_2 + acc1_1 (+1) without waiting for the sum.
	sub		r31 = -1, acc1_1
	;;	C 21
	.pred.rel "mutex", p8, p9
   (p8)	add		acc0 = pr1_2, acc1_1, 1
   (p9)	add		acc0 = pr1_2, acc1_1
   (p8)	cmp.leu		p10, p0 = r31, pr1_2
   (p9)	cmp.ltu		p10, p0 = r31, pr1_2
	;;	C 22
	st8		[rp] = acc0, 8
	mov.i		ar.lc = r2
C Fold the final carry into the top limb left in r8.
   (p10)	add		r8 = 1, r8
	br.ret.sptk.many b0


C n mod 4 == 2 with ar.lc nonzero.  Zeroes acc1_0, pr1_0 and pr0_1, clears
C the carry predicates p8 and p12, and enters at .LL10, which is the
C loop-closing br.cloop itself.
.grt2:	ldf8		r_3 = [srp], 8
	ldf8		u_3 = [up], 8
	mov		acc1_0 = 0
	;;
	ldf8		r_0 = [srp], 8
	xma.l		fp0b_1 = ux, v0, rx
	mov		pr1_0 = 0
	ldf8		u_0 = [up], 8
	xma.hu		fp1a_1 = ux, v0, rx
	mov		pr0_1 = 0
	;;
	xma.l		fp0b_2 = uy, v0, ry
	xma.hu		fp1a_2 = uy, v0, ry
	;;
	getf.sig	acc0 = fp0b_1
	ldf8		r_1 = [srp], 8
	xma.l		fp1b_1 = ux, v1, fp1a_1
	xma.hu		fp2a_1 = ux, v1, fp1a_1
	;;
	ldf8		u_1 = [up], 8
	xma.l		fp0b_3 = u_3, v0, r_3
	xma.hu		fp1a_3 = u_3, v0, r_3
	;;
	getf.sig	pr0_2 = fp0b_2
	ldf8		r_2 = [srp], 8
	xma.l		fp1b_2 = uy, v1, fp1a_2
	xma.hu		fp2a_2 = uy, v1, fp1a_2
	;;
	ldf8		u_2 = [up], 8
	getf.sig	pr1_1 = fp1b_1
	;;
	getf.sig	acc1_1 = fp2a_1
	xma.l		fp0b_0 = u_0, v0, r_0
	cmp.ne		p8, p9 = r0, r0
	cmp.ne		p12, p13 = r0, r0
	xma.hu		fp1a_0 = u_0, v0, r_0
	br		.LL10


C Feed-in for n mod 4 == 3.  Zeroes acc1_3, pr1_3 and pr0_0 and clears the
C carry predicates p6 and p10, matching what .LL11 and .Lcj3 read first.
	ALIGN(32)
.Lb11:	mov		acc1_3 = 0
	mov		pr1_3 = 0
	mov		pr0_0 = 0
	cmp.ne		p6, p7 = r0, r0
	;;
	ldf8		r_2 = [srp], 8
	ldf8		u_2 = [up], 8
	br.cloop.dptk	.grt3
	;;
C ar.lc was zero: finish through the wind-down code at .Lcj3.
	xma.l		fp0b_0 = ux, v0, rx
	xma.hu		fp1a_0 = ux, v0, rx
	;;
	cmp.ne		p10, p11 = r0, r0
	xma.l		fp0b_1 = uy, v0, ry
	xma.hu		fp1a_1 = uy, v0, ry
	;;
	getf.sig	acc0 = fp0b_0
	xma.l		fp1b_0 = ux, v1, fp1a_0
	xma.hu		fp2a_0 = ux, v1, fp1a_0
	;;
	xma.l		fp0b_2 = u_2, v0, r_2
	xma.hu		fp1a_2 = u_2, v0, r_2
	;;
	getf.sig	pr0_1 = fp0b_1
	xma.l		fp1b_1 = uy, v1, fp1a_1
	xma.hu		fp2a_1 = uy, v1, fp1a_1
	;;
	getf.sig	pr1_0 = fp1b_0
	getf.sig	acc1_0 = fp2a_0
	br		.Lcj3

C ar.lc was nonzero: enter the main loop at .LL11.
.grt3:	ldf8		r_3 = [srp], 8
	xma.l		fp0b_0 = ux, v0, rx
	cmp.ne		p10, p11 = r0, r0
	ldf8		u_3 = [up], 8
	xma.hu		fp1a_0 = ux, v0, rx
	;;
	xma.l		fp0b_1 = uy, v0, ry
	xma.hu		fp1a_1 = uy, v0, ry
	;;
	getf.sig	acc0 = fp0b_0
	ldf8		r_0 = [srp], 8
	xma.l		fp1b_0 = ux, v1, fp1a_0
	ldf8		u_0 = [up], 8
	xma.hu		fp2a_0 = ux, v1, fp1a_0
	;;
	xma.l		fp0b_2 = u_2, v0, r_2
	xma.hu		fp1a_2 = u_2, v0, r_2
	;;
	getf.sig	pr0_1 = fp0b_1
	ldf8		r_1 = [srp], 8
	xma.l		fp1b_1 = uy, v1, fp1a_1
	xma.hu		fp2a_1 = uy, v1, fp1a_1
	;;
	ldf8		u_1 = [up], 8
	getf.sig	pr1_0 = fp1b_0
	;;
	getf.sig	acc1_0 = fp2a_0
	xma.l		fp0b_3 = u_3, v0, r_3
	xma.hu		fp1a_3 = u_3, v0, r_3
	br		.LL11


C *** MAIN LOOP START ***
C
C The loop body is the same three-cycle pattern repeated four times with the
C slot suffixes rotated; each repetition stores one limb through rp.
C
C Carry handling uses predicate pairs instead of a carry flag:
C   (pX) add  d = a, b, 1    add with carry-in
C   (pY) add  d = a, b       add without carry-in
C   (pX) cmp.leu pC, pN = d, a    with carry-in, carry-out iff d <= a
C   (pY) cmp.ltu pC, pN = d, a    without carry-in, carry-out iff d < a
C p6/p7 and p8/p9 alternate as the carry of the acc0 additions, while
C p10/p11 and p12/p13 alternate as the carry of the s0 additions.  The
C .pred.rel directives declare each pair as mutually exclusive to the
C assembler.
	ALIGN(32)
.Loop:		C 00
	.pred.rel "mutex", p12, p13
	getf.sig	pr0_3 = fp0b_3
	ldf8		r_3 = [srp], 8
	xma.l		fp1b_3 = u_3, v1, fp1a_3
   (p12)	add		s0 = pr1_0, acc0, 1
   (p13)	add		s0 = pr1_0, acc0
	xma.hu		fp2a_3 = u_3, v1, fp1a_3
	;;	C 01
	.pred.rel "mutex", p8, p9
	.pred.rel "mutex", p12, p13
	ldf8		u_3 = [up], 8
	getf.sig	pr1_2 = fp1b_2
   (p8)	cmp.leu		p6, p7 = acc0, pr0_1
   (p9)	cmp.ltu		p6, p7 = acc0, pr0_1
   (p12)	cmp.leu		p10, p11 = s0, pr1_0
   (p13)	cmp.ltu		p10, p11 = s0, pr1_0
	;;	C 02
	.pred.rel "mutex", p6, p7
	getf.sig	acc1_2 = fp2a_2
	st8		[rp] = s0, 8
	xma.l		fp0b_1 = u_1, v0, r_1
   (p6)	add		acc0 = pr0_2, acc1_0, 1
   (p7)	add		acc0 = pr0_2, acc1_0
	xma.hu		fp1a_1 = u_1, v0, r_1
	;;	C 03
C Loop entry point for n mod 4 == 1 (from .grt5).
.LL01:
	.pred.rel "mutex", p10, p11
	getf.sig	pr0_0 = fp0b_0
	ldf8		r_0 = [srp], 8
	xma.l		fp1b_0 = u_0, v1, fp1a_0
   (p10)	add		s0 = pr1_1, acc0, 1
   (p11)	add		s0 = pr1_1, acc0
	xma.hu		fp2a_0 = u_0, v1, fp1a_0
	;;	C 04
	.pred.rel "mutex", p6, p7
	.pred.rel "mutex", p10, p11
	ldf8		u_0 = [up], 8
	getf.sig	pr1_3 = fp1b_3
   (p6)	cmp.leu		p8, p9 = acc0, pr0_2
   (p7)	cmp.ltu		p8, p9 = acc0, pr0_2
   (p10)	cmp.leu		p12, p13 = s0, pr1_1
   (p11)	cmp.ltu		p12, p13 = s0, pr1_1
	;;	C 05
	.pred.rel "mutex", p8, p9
	getf.sig	acc1_3 = fp2a_3
	st8		[rp] = s0, 8
	xma.l		fp0b_2 = u_2, v0, r_2
   (p8)	add		acc0 = pr0_3, acc1_1, 1
   (p9)	add		acc0 = pr0_3, acc1_1
	xma.hu		fp1a_2 = u_2, v0, r_2
	;;	C 06
C Loop entry point for n mod 4 == 0 (from .grt4).
.LL00:
	.pred.rel "mutex", p12, p13
	getf.sig	pr0_1 = fp0b_1
	ldf8		r_1 = [srp], 8
	xma.l		fp1b_1 = u_1, v1, fp1a_1
   (p12)	add		s0 = pr1_2, acc0, 1
   (p13)	add		s0 = pr1_2, acc0
	xma.hu		fp2a_1 = u_1, v1, fp1a_1
	;;	C 07
	.pred.rel "mutex", p8, p9
	.pred.rel "mutex", p12, p13
	ldf8		u_1 = [up], 8
	getf.sig	pr1_0 = fp1b_0
   (p8)	cmp.leu		p6, p7 = acc0, pr0_3
   (p9)	cmp.ltu		p6, p7 = acc0, pr0_3
   (p12)	cmp.leu		p10, p11 = s0, pr1_2
   (p13)	cmp.ltu		p10, p11 = s0, pr1_2
	;;	C 08
	.pred.rel "mutex", p6, p7
	getf.sig	acc1_0 = fp2a_0
	st8		[rp] = s0, 8
	xma.l		fp0b_3 = u_3, v0, r_3
   (p6)	add		acc0 = pr0_0, acc1_2, 1
   (p7)	add		acc0 = pr0_0, acc1_2
	xma.hu		fp1a_3 = u_3, v0, r_3
	;;	C 09
C Loop entry point for n mod 4 == 3 (from .grt3).
.LL11:
	.pred.rel "mutex", p10, p11
	getf.sig	pr0_2 = fp0b_2
	ldf8		r_2 = [srp], 8
	xma.l		fp1b_2 = u_2, v1, fp1a_2
   (p10)	add		s0 = pr1_3, acc0, 1
   (p11)	add		s0 = pr1_3, acc0
	xma.hu		fp2a_2 = u_2, v1, fp1a_2
	;;	C 10
	.pred.rel "mutex", p6, p7
	.pred.rel "mutex", p10, p11
	ldf8		u_2 = [up], 8
	getf.sig	pr1_1 = fp1b_1
   (p6)	cmp.leu		p8, p9 = acc0, pr0_0
   (p7)	cmp.ltu		p8, p9 = acc0, pr0_0
   (p10)	cmp.leu		p12, p13 = s0, pr1_3
   (p11)	cmp.ltu		p12, p13 = s0, pr1_3
	;;	C 11
	.pred.rel "mutex", p8, p9
	getf.sig	acc1_1 = fp2a_1
	st8		[rp] = s0, 8
	xma.l		fp0b_0 = u_0, v0, r_0
   (p8)	add		acc0 = pr0_1, acc1_3, 1
   (p9)	add		acc0 = pr0_1, acc1_3
	xma.hu		fp1a_0 = u_0, v0, r_0
C Loop entry point for n mod 4 == 2 (from .grt2); also the loop back-edge.
.LL10:	br.cloop.dptk	.Loop	C 12
	;;
C *** MAIN LOOP END ***

C Wind-down.  Same sequence as the loop body but without further loads, and
C with the product instructions dropped step by step as the pipeline drains.
C .Lcj6 is reached only by falling out of the loop; .Lcj5, .Lcj4 and .Lcj3
C are also branch targets of the feed-in blocks that ran with ar.lc zero.
.Lcj6:
	.pred.rel "mutex", p12, p13
	getf.sig	pr0_3 = fp0b_3
	xma.l		fp1b_3 = u_3, v1, fp1a_3
   (p12)	add		s0 = pr1_0, acc0, 1
   (p13)	add		s0 = pr1_0, acc0
	xma.hu		fp2a_3 = u_3, v1, fp1a_3
	;;
	.pred.rel "mutex", p8, p9
	.pred.rel "mutex", p12, p13
	getf.sig	pr1_2 = fp1b_2
   (p8)	cmp.leu		p6, p7 = acc0, pr0_1
   (p9)	cmp.ltu		p6, p7 = acc0, pr0_1
   (p12)	cmp.leu		p10, p11 = s0, pr1_0
   (p13)	cmp.ltu		p10, p11 = s0, pr1_0
	;;
	.pred.rel "mutex", p6, p7
	getf.sig	acc1_2 = fp2a_2
	st8		[rp] = s0, 8
	xma.l		fp0b_1 = u_1, v0, r_1
   (p6)	add		acc0 = pr0_2, acc1_0, 1
   (p7)	add		acc0 = pr0_2, acc1_0
	xma.hu		fp1a_1 = u_1, v0, r_1
	;;
.Lcj5:
	.pred.rel "mutex", p10, p11
	getf.sig	pr0_0 = fp0b_0
	xma.l		fp1b_0 = u_0, v1, fp1a_0
   (p10)	add		s0 = pr1_1, acc0, 1
   (p11)	add		s0 = pr1_1, acc0
	xma.hu		fp2a_0 = u_0, v1, fp1a_0
	;;
	.pred.rel "mutex", p6, p7
	.pred.rel "mutex", p10, p11
	getf.sig	pr1_3 = fp1b_3
   (p6)	cmp.leu		p8, p9 = acc0, pr0_2
   (p7)	cmp.ltu		p8, p9 = acc0, pr0_2
   (p10)	cmp.leu		p12, p13 = s0, pr1_1
   (p11)	cmp.ltu		p12, p13 = s0, pr1_1
	;;
	.pred.rel "mutex", p8, p9
	getf.sig	acc1_3 = fp2a_3
	st8		[rp] = s0, 8
	xma.l		fp0b_2 = u_2, v0, r_2
   (p8)	add		acc0 = pr0_3, acc1_1, 1
   (p9)	add		acc0 = pr0_3, acc1_1
	xma.hu		fp1a_2 = u_2, v0, r_2
	;;
.Lcj4:
	.pred.rel "mutex", p12, p13
	getf.sig	pr0_1 = fp0b_1
	xma.l		fp1b_1 = u_1, v1, fp1a_1
   (p12)	add		s0 = pr1_2, acc0, 1
   (p13)	add		s0 = pr1_2, acc0
	xma.hu		fp2a_1 = u_1, v1, fp1a_1
	;;
	.pred.rel "mutex", p8, p9
	.pred.rel "mutex", p12, p13
	getf.sig	pr1_0 = fp1b_0
   (p8)	cmp.leu		p6, p7 = acc0, pr0_3
   (p9)	cmp.ltu		p6, p7 = acc0, pr0_3
   (p12)	cmp.leu		p10, p11 = s0, pr1_2
   (p13)	cmp.ltu		p10, p11 = s0, pr1_2
	;;
	.pred.rel "mutex", p6, p7
	getf.sig	acc1_0 = fp2a_0
	st8		[rp] = s0, 8
   (p6)	add		acc0 = pr0_0, acc1_2, 1
   (p7)	add		acc0 = pr0_0, acc1_2
	;;
.Lcj3:
	.pred.rel "mutex", p10, p11
	getf.sig	pr0_2 = fp0b_2
	xma.l		fp1b_2 = u_2, v1, fp1a_2
   (p10)	add		s0 = pr1_3, acc0, 1
   (p11)	add		s0 = pr1_3, acc0
	xma.hu		fp2a_2 = u_2, v1, fp1a_2
	;;
	.pred.rel "mutex", p6, p7
	.pred.rel "mutex", p10, p11
	getf.sig	pr1_1 = fp1b_1
   (p6)	cmp.leu		p8, p9 = acc0, pr0_0
   (p7)	cmp.ltu		p8, p9 = acc0, pr0_0
   (p10)	cmp.leu		p12, p13 = s0, pr1_3
   (p11)	cmp.ltu		p12, p13 = s0, pr1_3
	;;
	.pred.rel "mutex", p8, p9
	getf.sig	acc1_1 = fp2a_1
	st8		[rp] = s0, 8
   (p8)	add		acc0 = pr0_1, acc1_3, 1
   (p9)	add		acc0 = pr0_1, acc1_3
	;;
C No branch in this file targets .Lcj2; it is reached by fall-through only.
C From here on no products are started; only carries are resolved.
.Lcj2:
	.pred.rel "mutex", p12, p13
   (p12)	add		s0 = pr1_0, acc0, 1
   (p13)	add		s0 = pr1_0, acc0
	;;
	.pred.rel "mutex", p8, p9
	.pred.rel "mutex", p12, p13
	getf.sig	pr1_2 = fp1b_2
   (p8)	cmp.leu		p6, p7 = acc0, pr0_1
   (p9)	cmp.ltu		p6, p7 = acc0, pr0_1
   (p12)	cmp.leu		p10, p11 = s0, pr1_0
   (p13)	cmp.ltu		p10, p11 = s0, pr1_0
	;;
	.pred.rel "mutex", p6, p7
	getf.sig	acc1_2 = fp2a_2
	st8		[rp] = s0, 8
   (p6)	add		acc0 = pr0_2, acc1_0, 1
   (p7)	add		acc0 = pr0_2, acc1_0
	;;
	.pred.rel "mutex", p10, p11
   (p10)	add		s0 = pr1_1, acc0, 1
   (p11)	add		s0 = pr1_1, acc0
	;;
	.pred.rel "mutex", p6, p7
	.pred.rel "mutex", p10, p11
   (p6)	cmp.leu		p8, p9 = acc0, pr0_2
   (p7)	cmp.ltu		p8, p9 = acc0, pr0_2
   (p10)	cmp.leu		p12, p13 = s0, pr1_1
   (p11)	cmp.ltu		p12, p13 = s0, pr1_1
	;;
	.pred.rel "mutex", p8, p9
	st8		[rp] = s0, 8
C Last stored limb: pr1_2 + acc1_1 plus the acc0-chain carry (p8) ...
   (p8)	add		acc0 = pr1_2, acc1_1, 1
   (p9)	add		acc0 = pr1_2, acc1_1
	;;
	.pred.rel "mutex", p8, p9
   (p8)	cmp.leu		p10, p11 = acc0, pr1_2
   (p9)	cmp.ltu		p10, p11 = acc0, pr1_2
C ... plus the s0-chain carry (p12).
   (p12)	add		acc0 = 1, acc0
	;;
	st8		[rp] = acc0, 8
C If that increment wrapped acc0 to zero, it also produced a carry-out.
   (p12)	cmp.eq.or	p10, p0 = 0, acc0
	mov		r8 = acc1_2
	;;
	.pred.rel "mutex", p10, p11
C r8 = top limb (acc1_2) plus the final carry; restore ar.lc and return.
   (p10)	add		r8 = 1, r8
	mov.i		ar.lc = r2
	br.ret.sptk.many b0
EPILOGUE()
ASM_END()