dnl  submul_1.asm revision 1.1.1.1

dnl  HP-PA 2.0 64-bit mpn_submul_1 -- Multiply a limb vector with a limb and
dnl  subtract the result from a second limb vector.

dnl  Copyright 1998, 1999, 2000, 2002, 2003 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C                   cycles/limb
C 8000,8200:            7
C 8500,8600,8700:       6.5

C  The feed-in and wind-down code has not yet been scheduled.  Many cycles
C  could be saved there per call.

C  DESCRIPTION:
C  The main loop "BIG" is 4-way unrolled, mainly to allow
C  effective use of ADD,DC.  Delays in moving data via the cache from the FP
C  registers to the IU registers, have demanded a deep software pipeline, and
C  a lot of stack slots for partial products in flight.
C
C  CODE STRUCTURE:
C    save-some-registers
C    do 0, 1, 2, or 3 limbs
C    if done, restore-some-regs and return
C    save-many-regs
C    do 4, 8, ... limbs
C    restore-all-regs

C  STACK LAYOUT:
C  HP-PA stack grows upwards.  We could allocate 8 fewer slots by using the
C  slots marked FREE, as well as some slots in the caller's "frame marker".
C
C   -00 <- r30
C   -08  FREE
C   -10  tmp
C   -18  tmp
C   -20  tmp
C   -28  tmp
C   -30  tmp
C   -38  tmp
C   -40  tmp
C   -48  tmp
C   -50  tmp
C   -58  tmp
C   -60  tmp
C   -68  tmp
C   -70  tmp
C   -78  tmp
C   -80  tmp
C   -88  tmp
C   -90  FREE
C   -98  FREE
C   -a0  FREE
C   -a8  FREE
C   -b0  r13
C   -b8  r12
C   -c0  r11
C   -c8  r10
C   -d0  r9
C   -d8  r8
C   -e0  r7
C   -e8  r6
C   -f0  r5
C   -f8  r4
C   -100 r3
C  Previous frame:
C   [unused area]
C   -38/-138 vlimb home slot.  For 2.0N, the vlimb arg will arrive here.


include(`../config.m4')

C INPUT PARAMETERS:
define(`rp',`%r26')     C limb vector that is read, reduced by the product, and written back
define(`up',`%r25')     C limb vector that is multiplied by vlimb
define(`n',`%r24')      C limb count
define(`vlimb',`%r23')  C multiplier limb, moved into %fr8 through its home slot

define(`climb',`%r23')  C carry limb, shares the vlimb register and is cleared at entry

C RETURN VALUE:
C  The final climb.  With HAVE_ABI_2_0w it is copied to %r28, otherwise its
C  high 32 bits go to %r28 and its low 32 bits to %r29.

ifdef(`HAVE_ABI_2_0w',
`       .level  2.0w
',`     .level  2.0
')
PROLOGUE(mpn_submul_1)

ifdef(`HAVE_ABI_2_0w',
`       std             vlimb, -0x38(%r30)      C store vlimb into "home" slot
')
C Allocate 0x100 bytes of frame while saving %r3, then save %r4 and %r5.
        std,ma          %r3, 0x100(%r30)
        std             %r4, -0xf8(%r30)
        std             %r5, -0xf0(%r30)
        ldo             0(%r0), climb           C clear climb
        fldd            -0x138(%r30), %fr8      C put vlimb in fp register

C Register names for the feed-in code handling n mod 4 limbs.
define(`p032a1',`%r1')  C first mid product, from slot -0x78
define(`p032a2',`%r19') C second mid product, from slot -0x70

define(`m032',`%r20')   C sum of the two mid products
define(`m096',`%r21')   C carry out of that sum

define(`p000a',`%r22')  C low product, from slot -0x80
define(`p064a',`%r29')  C high product, from slot -0x68

define(`s000',`%r31')   C result limb being accumulated

define(`ma000',`%r4')   C mid sum shifted left 32 bits
define(`ma064',`%r20')  C mid sum shifted right 32 bits, with its carry on top

define(`r000',`%r3')    C limb loaded from rp

C %r5 = n mod 4.  When that is zero skip straight to the 4-way code.
        extrd,u         n, 63, 2, %r5
        cmpb,=          %r5, %r0, L(BIG)
        nop

C First feed-in limb: form the four 32x32 partial products against vlimb and
C pass them to the integer unit through the stack temporaries.
        fldd            0(up), %fr4
        ldo             8(up), up
        xmpyu           %fr8R, %fr4L, %fr22
        xmpyu           %fr8L, %fr4R, %fr23
        fstd            %fr22, -0x78(%r30)      C mid product to  -0x78..-0x71
        xmpyu           %fr8R, %fr4R, %fr24
        xmpyu           %fr8L, %fr4L, %fr25
        fstd            %fr23, -0x70(%r30)      C mid product to  -0x70..-0x69
        fstd            %fr24, -0x80(%r30)      C low product to  -0x80..-0x79
        addib,<>        -1, %r5, L(two_or_more)
        fstd            %fr25, -0x68(%r30)      C high product to -0x68..-0x61
LDEF(one)
        ldd             -0x78(%r30), p032a1
        ldd             -0x70(%r30), p032a2
        ldd             -0x80(%r30), p000a
        b               L(0_one_out)
        ldd             -0x68(%r30), p064a      C executes in the branch delay slot

LDEF(two_or_more)
C Second feed-in limb: reload the products of the previous limb while storing
C the products of this one into the same slots.
        fldd            0(up), %fr4
        ldo             8(up), up
        xmpyu           %fr8R, %fr4L, %fr22
        xmpyu           %fr8L, %fr4R, %fr23
        ldd             -0x78(%r30), p032a1
        fstd            %fr22, -0x78(%r30)      C mid product to  -0x78..-0x71
        xmpyu           %fr8R, %fr4R, %fr24
        xmpyu           %fr8L, %fr4L, %fr25
        ldd             -0x70(%r30), p032a2
        fstd            %fr23, -0x70(%r30)      C mid product to  -0x70..-0x69
        ldd             -0x80(%r30), p000a
        fstd            %fr24, -0x80(%r30)      C low product to  -0x80..-0x79
        ldd             -0x68(%r30), p064a
        addib,<>        -1, %r5, L(three_or_more)
        fstd            %fr25, -0x68(%r30)      C high product to -0x68..-0x61
LDEF(two)
        add             p032a1, p032a2, m032
        add,dc          %r0, %r0, m096          C m096 = carry out of the mid sum
        depd,z          m032, 31, 32, ma000     C ma000 = m032 shifted left 32
        extrd,u         m032, 31, 32, ma064     C ma064 = m032 shifted right 32
        ldd             0(rp), r000
        b               L(0_two_out)
        depd            m096, 31, 32, ma064     C carry into upper half of ma064

LDEF(three_or_more)
        fldd            0(up), %fr4
        add             p032a1, p032a2, m032
        add,dc          %r0, %r0, m096
        depd,z          m032, 31, 32, ma000
        extrd,u         m032, 31, 32, ma064
        ldd             0(rp), r000
C       addib,=         -1, %r5, L(0_out)
        depd            m096, 31, 32, ma064
LDEF(loop0)
C The loop body below is disabled.  Control falls through to 0_out.
C       xmpyu           %fr8R, %fr4L, %fr22
C       xmpyu           %fr8L, %fr4R, %fr23
C       ldd             -0x78(%r30), p032a1
C       fstd            %fr22, -0x78(%r30)      C mid product to  -0x78..-0x71
C
C       xmpyu           %fr8R, %fr4R, %fr24
C       xmpyu           %fr8L, %fr4L, %fr25
C       ldd             -0x70(%r30), p032a2
C       fstd            %fr23, -0x70(%r30)      C mid product to  -0x70..-0x69
C
C       ldo             8(rp), rp
C       add             climb, p000a, s000
C       ldd             -0x80(%r30), p000a
C       fstd            %fr24, -0x80(%r30)      C low product to  -0x80..-0x79
C
C       add,dc          p064a, %r0, climb
C       ldo             8(up), up
C       ldd             -0x68(%r30), p064a
C       fstd            %fr25, -0x68(%r30)      C high product to -0x68..-0x61
C
C       add             ma000, s000, s000
C       add,dc          ma064, climb, climb
C       fldd            0(up), %fr4
C
C       sub             r000, s000, s000
C       sub,db          %r0, climb, climb
C       sub             %r0, climb, climb
C       std             s000, -8(rp)
C
C       add             p032a1, p032a2, m032
C       add,dc          %r0, %r0, m096
C
C       depd,z          m032, 31, 32, ma000
C       extrd,u         m032, 31, 32, ma064
C       ldd             0(rp), r000
C       addib,<>        -1, %r5, L(loop0)
C       depd            m096, 31, 32, ma064
LDEF(0_out)
        ldo             8(up), up
        xmpyu           %fr8R, %fr4L, %fr22
        xmpyu           %fr8L, %fr4R, %fr23
        ldd             -0x78(%r30), p032a1
        fstd            %fr22, -0x78(%r30)      C mid product to  -0x78..-0x71
        xmpyu           %fr8R, %fr4R, %fr24
        xmpyu           %fr8L, %fr4L, %fr25
        ldd             -0x70(%r30), p032a2
        fstd            %fr23, -0x70(%r30)      C mid product to  -0x70..-0x69
        ldo             8(rp), rp
        add             climb, p000a, s000
        ldd             -0x80(%r30), p000a
        fstd            %fr24, -0x80(%r30)      C low product to  -0x80..-0x79
        add,dc          p064a, %r0, climb
        ldd             -0x68(%r30), p064a
        fstd            %fr25, -0x68(%r30)      C high product to -0x68..-0x61
        add             ma000, s000, s000
        add,dc          ma064, climb, climb
        sub             r000, s000, s000        C rp limb minus product limb
        sub,db          %r0, climb, climb       C climb = 0 - climb - borrow
        sub             %r0, climb, climb       C negate: climb = climb + borrow
        std             s000, -8(rp)
        add             p032a1, p032a2, m032
        add,dc          %r0, %r0, m096
        depd,z          m032, 31, 32, ma000
        extrd,u         m032, 31, 32, ma064
        ldd             0(rp), r000
        depd            m096, 31, 32, ma064
LDEF(0_two_out)
        ldd             -0x78(%r30), p032a1
        ldd             -0x70(%r30), p032a2
        ldo             8(rp), rp
        add             climb, p000a, s000
        ldd             -0x80(%r30), p000a
        add,dc          p064a, %r0, climb
        ldd             -0x68(%r30), p064a
        add             ma000, s000, s000
        add,dc          ma064, climb, climb
        sub             r000, s000, s000
        sub,db          %r0, climb, climb
        sub             %r0, climb, climb
        std             s000, -8(rp)
LDEF(0_one_out)
        add             p032a1, p032a2, m032
        add,dc          %r0, %r0, m096
        depd,z          m032, 31, 32, ma000
        extrd,u         m032, 31, 32, ma064
        ldd             0(rp), r000
        depd            m096, 31, 32, ma064

        add             climb, p000a, s000
        add,dc          p064a, %r0, climb
        add             ma000, s000, s000
        add,dc          ma064, climb, climb
        sub             r000, s000, s000
        sub,db          %r0, climb, climb
        sub             %r0, climb, climb
        std             s000, 0(rp)

C Return now when 4 >= n.  n mod 4 is nonzero on this path, so that means
C no group of four limbs is left.
        cmpib,>=        4, n, L(done)
        ldo             8(rp), rp               C executes in the branch delay slot

C 4-way unrolled code.

LDEF(BIG)

C Mid products of limbs a, b, c and d, two per limb.
define(`p032a1',`%r1')  C
define(`p032a2',`%r19') C
define(`p096b1',`%r20') C
define(`p096b2',`%r21') C
define(`p160c1',`%r22') C
define(`p160c2',`%r29') C
define(`p224d1',`%r31') C
define(`p224d2',`%r3')  C
C Sums of the mid product pairs.  m288 takes the final carry.
define(`m032',`%r4')    C
define(`m096',`%r5')    C
define(`m160',`%r6')    C
define(`m224',`%r7')    C
define(`m288',`%r8')    C
C Low and high products of limbs a, b, c and d.
define(`p000a',`%r1')   C
define(`p064a',`%r19')  C
define(`p064b',`%r20')  C
define(`p128b',`%r21')  C
define(`p128c',`%r22')  C
define(`p192c',`%r29')  C
define(`p192d',`%r31')  C
define(`p256d',`%r3')   C
C Result limbs being accumulated.
define(`s000',`%r10')   C
define(`s064',`%r11')   C
define(`s128',`%r12')   C
define(`s192',`%r13')   C
C Mid sums realigned onto 64-bit limb boundaries.
define(`ma000',`%r9')   C
define(`ma064',`%r4')   C
define(`ma128',`%r5')   C
define(`ma192',`%r6')   C
define(`ma256',`%r7')   C
C Limbs loaded from rp.
define(`r000',`%r1')    C
define(`r064',`%r19')   C
define(`r128',`%r20')   C
define(`r192',`%r21')   C

C Save the additional registers that only the 4-way code uses.
        std             %r6, -0xe8(%r30)
        std             %r7, -0xe0(%r30)
        std             %r8, -0xd8(%r30)
        std             %r9, -0xd0(%r30)
        std             %r10, -0xc8(%r30)
        std             %r11, -0xc0(%r30)
        std             %r12, -0xb8(%r30)
        std             %r13, -0xb0(%r30)

ifdef(`HAVE_ABI_2_0w',
`       extrd,u         n, 61, 62, n            C right shift 2
',`     extrd,u         n, 61, 30, n            C right shift 2, zero extend
')

LDEF(4_or_more)
        fldd            0(up), %fr4
        fldd            8(up), %fr5
        fldd            16(up), %fr6
        fldd            24(up), %fr7
        xmpyu           %fr8R, %fr4L, %fr22
        xmpyu           %fr8L, %fr4R, %fr23
        xmpyu           %fr8R, %fr5L, %fr24
        xmpyu           %fr8L, %fr5R, %fr25
        xmpyu           %fr8R, %fr6L, %fr26
        xmpyu           %fr8L, %fr6R, %fr27
        fstd            %fr22, -0x78(%r30)      C mid product to  -0x78..-0x71
        xmpyu           %fr8R, %fr7L, %fr28
        xmpyu           %fr8L, %fr7R, %fr29
        fstd            %fr23, -0x70(%r30)      C mid product to  -0x70..-0x69
        xmpyu           %fr8R, %fr4R, %fr30
        xmpyu           %fr8L, %fr4L, %fr31
        fstd            %fr24, -0x38(%r30)      C mid product to  -0x38..-0x31
        xmpyu           %fr8R, %fr5R, %fr22
        xmpyu           %fr8L, %fr5L, %fr23
        fstd            %fr25, -0x30(%r30)      C mid product to  -0x30..-0x29
        xmpyu           %fr8R, %fr6R, %fr24
        xmpyu           %fr8L, %fr6L, %fr25
        fstd            %fr26, -0x58(%r30)      C mid product to  -0x58..-0x51
        xmpyu           %fr8R, %fr7R, %fr26
        fstd            %fr27, -0x50(%r30)      C mid product to  -0x50..-0x49
        addib,<>        -1, n, L(8_or_more)
        xmpyu           %fr8L, %fr7L, %fr27
C Exactly one group of four: flush the remaining products and finish in end1.
        fstd            %fr28, -0x18(%r30)      C mid product to  -0x18..-0x11
        fstd            %fr29, -0x10(%r30)      C mid product to  -0x10..-0x09
        fstd            %fr30, -0x80(%r30)      C low product to  -0x80..-0x79
        fstd            %fr31, -0x68(%r30)      C high product to -0x68..-0x61
        fstd            %fr22, -0x40(%r30)      C low product to  -0x40..-0x39
        fstd            %fr23, -0x28(%r30)      C high product to -0x28..-0x21
        fstd            %fr24, -0x60(%r30)      C low product to  -0x60..-0x59
        fstd            %fr25, -0x48(%r30)      C high product to -0x48..-0x41
        fstd            %fr26, -0x20(%r30)      C low product to  -0x20..-0x19
        fstd            %fr27, -0x88(%r30)      C high product to -0x88..-0x81
        ldd             -0x78(%r30), p032a1
        ldd             -0x70(%r30), p032a2
        ldd             -0x38(%r30), p096b1
        ldd             -0x30(%r30), p096b2
        ldd             -0x58(%r30), p160c1
        ldd             -0x50(%r30), p160c2
        ldd             -0x18(%r30), p224d1
        ldd             -0x10(%r30), p224d2
        b               L(end1)
        nop

LDEF(8_or_more)
        fstd            %fr28, -0x18(%r30)      C mid product to  -0x18..-0x11
        fstd            %fr29, -0x10(%r30)      C mid product to  -0x10..-0x09
        ldo             32(up), up
        fstd            %fr30, -0x80(%r30)      C low product to  -0x80..-0x79
        fstd            %fr31, -0x68(%r30)      C high product to -0x68..-0x61
        fstd            %fr22, -0x40(%r30)      C low product to  -0x40..-0x39
        fstd            %fr23, -0x28(%r30)      C high product to -0x28..-0x21
        fstd            %fr24, -0x60(%r30)      C low product to  -0x60..-0x59
        fstd            %fr25, -0x48(%r30)      C high product to -0x48..-0x41
        fstd            %fr26, -0x20(%r30)      C low product to  -0x20..-0x19
        fstd            %fr27, -0x88(%r30)      C high product to -0x88..-0x81
        fldd            0(up), %fr4
        fldd            8(up), %fr5
        fldd            16(up), %fr6
        fldd            24(up), %fr7
        xmpyu           %fr8R, %fr4L, %fr22
        ldd             -0x78(%r30), p032a1
        xmpyu           %fr8L, %fr4R, %fr23
        xmpyu           %fr8R, %fr5L, %fr24
        ldd             -0x70(%r30), p032a2
        xmpyu           %fr8L, %fr5R, %fr25
        xmpyu           %fr8R, %fr6L, %fr26
        ldd             -0x38(%r30), p096b1
        xmpyu           %fr8L, %fr6R, %fr27
        fstd            %fr22, -0x78(%r30)      C mid product to  -0x78..-0x71
        xmpyu           %fr8R, %fr7L, %fr28
        ldd             -0x30(%r30), p096b2
        xmpyu           %fr8L, %fr7R, %fr29
        fstd            %fr23, -0x70(%r30)      C mid product to  -0x70..-0x69
        xmpyu           %fr8R, %fr4R, %fr30
        ldd             -0x58(%r30), p160c1
        xmpyu           %fr8L, %fr4L, %fr31
        fstd            %fr24, -0x38(%r30)      C mid product to  -0x38..-0x31
        xmpyu           %fr8R, %fr5R, %fr22
        ldd             -0x50(%r30), p160c2
        xmpyu           %fr8L, %fr5L, %fr23
        fstd            %fr25, -0x30(%r30)      C mid product to  -0x30..-0x29
        xmpyu           %fr8R, %fr6R, %fr24
        ldd             -0x18(%r30), p224d1
        xmpyu           %fr8L, %fr6L, %fr25
        fstd            %fr26, -0x58(%r30)      C mid product to  -0x58..-0x51
        xmpyu           %fr8R, %fr7R, %fr26
        ldd             -0x10(%r30), p224d2
        fstd            %fr27, -0x50(%r30)      C mid product to  -0x50..-0x49
        addib,=         -1, n, L(end2)
        xmpyu           %fr8L, %fr7L, %fr27
LDEF(loop)
C Each pass finishes the four limbs whose products are already in the stack
C temporaries while storing the products of the next four limbs.
        add             p032a1, p032a2, m032
        ldd             -0x80(%r30), p000a
        add,dc          p096b1, p096b2, m096
        fstd            %fr28, -0x18(%r30)      C mid product to  -0x18..-0x11

        add,dc          p160c1, p160c2, m160
        ldd             -0x68(%r30), p064a
        add,dc          p224d1, p224d2, m224
        fstd            %fr29, -0x10(%r30)      C mid product to  -0x10..-0x09

        add,dc          %r0, %r0, m288
        ldd             -0x40(%r30), p064b
        ldo             32(up), up
        fstd            %fr30, -0x80(%r30)      C low product to  -0x80..-0x79

        depd,z          m032, 31, 32, ma000
        ldd             -0x28(%r30), p128b
        extrd,u         m032, 31, 32, ma064
        fstd            %fr31, -0x68(%r30)      C high product to -0x68..-0x61

        depd            m096, 31, 32, ma064
        ldd             -0x60(%r30), p128c
        extrd,u         m096, 31, 32, ma128
        fstd            %fr22, -0x40(%r30)      C low product to  -0x40..-0x39

        depd            m160, 31, 32, ma128
        ldd             -0x48(%r30), p192c
        extrd,u         m160, 31, 32, ma192
        fstd            %fr23, -0x28(%r30)      C high product to -0x28..-0x21

        depd            m224, 31, 32, ma192
        ldd             -0x20(%r30), p192d
        extrd,u         m224, 31, 32, ma256
        fstd            %fr24, -0x60(%r30)      C low product to  -0x60..-0x59

        depd            m288, 31, 32, ma256
        ldd             -0x88(%r30), p256d
        add             climb, p000a, s000
        fstd            %fr25, -0x48(%r30)      C high product to -0x48..-0x41

        add,dc          p064a, p064b, s064
        ldd             0(rp), r000
        add,dc          p128b, p128c, s128
        fstd            %fr26, -0x20(%r30)      C low product to  -0x20..-0x19

        add,dc          p192c, p192d, s192
        ldd             8(rp), r064
        add,dc          p256d, %r0, climb
        fstd            %fr27, -0x88(%r30)      C high product to -0x88..-0x81

        ldd             16(rp), r128
        add             ma000, s000, s000       C accum mid 0
        ldd             24(rp), r192
        add,dc          ma064, s064, s064       C accum mid 1

        add,dc          ma128, s128, s128       C accum mid 2
        fldd            0(up), %fr4
        add,dc          ma192, s192, s192       C accum mid 3
        fldd            8(up), %fr5

        add,dc          ma256, climb, climb
        fldd            16(up), %fr6
        sub             r000, s000, s000        C accum rlimb 0
        fldd            24(up), %fr7

        sub,db          r064, s064, s064        C accum rlimb 1
        sub,db          r128, s128, s128        C accum rlimb 2
        std             s000, 0(rp)

        sub,db          r192, s192, s192        C accum rlimb 3
        sub,db          %r0, climb, climb       C climb = 0 - climb - borrow
        sub             %r0, climb, climb       C negate: climb = climb + borrow
        std             s064, 8(rp)

        xmpyu           %fr8R, %fr4L, %fr22
        ldd             -0x78(%r30), p032a1
        xmpyu           %fr8L, %fr4R, %fr23
        std             s128, 16(rp)

        xmpyu           %fr8R, %fr5L, %fr24
        ldd             -0x70(%r30), p032a2
        xmpyu           %fr8L, %fr5R, %fr25
        std             s192, 24(rp)

        xmpyu           %fr8R, %fr6L, %fr26
        ldd             -0x38(%r30), p096b1
        xmpyu           %fr8L, %fr6R, %fr27
        fstd            %fr22, -0x78(%r30)      C mid product to  -0x78..-0x71

        xmpyu           %fr8R, %fr7L, %fr28
        ldd             -0x30(%r30), p096b2
        xmpyu           %fr8L, %fr7R, %fr29
        fstd            %fr23, -0x70(%r30)      C mid product to  -0x70..-0x69

        xmpyu           %fr8R, %fr4R, %fr30
        ldd             -0x58(%r30), p160c1
        xmpyu           %fr8L, %fr4L, %fr31
        fstd            %fr24, -0x38(%r30)      C mid product to  -0x38..-0x31

        xmpyu           %fr8R, %fr5R, %fr22
        ldd             -0x50(%r30), p160c2
        xmpyu           %fr8L, %fr5L, %fr23
        fstd            %fr25, -0x30(%r30)      C mid product to  -0x30..-0x29

        xmpyu           %fr8R, %fr6R, %fr24
        ldd             -0x18(%r30), p224d1
        xmpyu           %fr8L, %fr6L, %fr25
        fstd            %fr26, -0x58(%r30)      C mid product to  -0x58..-0x51

        xmpyu           %fr8R, %fr7R, %fr26
        ldd             -0x10(%r30), p224d2
        fstd            %fr27, -0x50(%r30)      C mid product to  -0x50..-0x49
        xmpyu           %fr8L, %fr7L, %fr27

        addib,<>        -1, n, L(loop)
        ldo             32(rp), rp              C executes in the branch delay slot

LDEF(end2)
C Wind-down, part 1: finish the group already in the temporaries while
C storing the products of the last group, then reload its mid products.
        add             p032a1, p032a2, m032
        ldd             -0x80(%r30), p000a
        add,dc          p096b1, p096b2, m096
        fstd            %fr28, -0x18(%r30)      C mid product to  -0x18..-0x11
        add,dc          p160c1, p160c2, m160
        ldd             -0x68(%r30), p064a
        add,dc          p224d1, p224d2, m224
        fstd            %fr29, -0x10(%r30)      C mid product to  -0x10..-0x09
        add,dc          %r0, %r0, m288
        ldd             -0x40(%r30), p064b
        fstd            %fr30, -0x80(%r30)      C low product to  -0x80..-0x79
        depd,z          m032, 31, 32, ma000
        ldd             -0x28(%r30), p128b
        extrd,u         m032, 31, 32, ma064
        fstd            %fr31, -0x68(%r30)      C high product to -0x68..-0x61
        depd            m096, 31, 32, ma064
        ldd             -0x60(%r30), p128c
        extrd,u         m096, 31, 32, ma128
        fstd            %fr22, -0x40(%r30)      C low product to  -0x40..-0x39
        depd            m160, 31, 32, ma128
        ldd             -0x48(%r30), p192c
        extrd,u         m160, 31, 32, ma192
        fstd            %fr23, -0x28(%r30)      C high product to -0x28..-0x21
        depd            m224, 31, 32, ma192
        ldd             -0x20(%r30), p192d
        extrd,u         m224, 31, 32, ma256
        fstd            %fr24, -0x60(%r30)      C low product to  -0x60..-0x59
        depd            m288, 31, 32, ma256
        ldd             -0x88(%r30), p256d
        add             climb, p000a, s000
        fstd            %fr25, -0x48(%r30)      C high product to -0x48..-0x41
        add,dc          p064a, p064b, s064
        ldd             0(rp), r000
        add,dc          p128b, p128c, s128
        fstd            %fr26, -0x20(%r30)      C low product to  -0x20..-0x19
        add,dc          p192c, p192d, s192
        ldd             8(rp), r064
        add,dc          p256d, %r0, climb
        fstd            %fr27, -0x88(%r30)      C high product to -0x88..-0x81
        ldd             16(rp), r128
        add             ma000, s000, s000       C accum mid 0
        ldd             24(rp), r192
        add,dc          ma064, s064, s064       C accum mid 1
        add,dc          ma128, s128, s128       C accum mid 2
        add,dc          ma192, s192, s192       C accum mid 3
        add,dc          ma256, climb, climb
        sub             r000, s000, s000        C accum rlimb 0
        sub,db          r064, s064, s064        C accum rlimb 1
        sub,db          r128, s128, s128        C accum rlimb 2
        std             s000, 0(rp)
        sub,db          r192, s192, s192        C accum rlimb 3
        sub,db          %r0, climb, climb
        sub             %r0, climb, climb
        std             s064, 8(rp)
        ldd             -0x78(%r30), p032a1
        std             s128, 16(rp)
        ldd             -0x70(%r30), p032a2
        std             s192, 24(rp)
        ldd             -0x38(%r30), p096b1
        ldd             -0x30(%r30), p096b2
        ldd             -0x58(%r30), p160c1
        ldd             -0x50(%r30), p160c2
        ldd             -0x18(%r30), p224d1
        ldd             -0x10(%r30), p224d2
        ldo             32(rp), rp

LDEF(end1)
C Wind-down, part 2: finish the last group of four limbs.
        add             p032a1, p032a2, m032
        ldd             -0x80(%r30), p000a
        add,dc          p096b1, p096b2, m096
        add,dc          p160c1, p160c2, m160
        ldd             -0x68(%r30), p064a
        add,dc          p224d1, p224d2, m224
        add,dc          %r0, %r0, m288
        ldd             -0x40(%r30), p064b
        depd,z          m032, 31, 32, ma000
        ldd             -0x28(%r30), p128b
        extrd,u         m032, 31, 32, ma064
        depd            m096, 31, 32, ma064
        ldd             -0x60(%r30), p128c
        extrd,u         m096, 31, 32, ma128
        depd            m160, 31, 32, ma128
        ldd             -0x48(%r30), p192c
        extrd,u         m160, 31, 32, ma192
        depd            m224, 31, 32, ma192
        ldd             -0x20(%r30), p192d
        extrd,u         m224, 31, 32, ma256
        depd            m288, 31, 32, ma256
        ldd             -0x88(%r30), p256d
        add             climb, p000a, s000
        add,dc          p064a, p064b, s064
        ldd             0(rp), r000
        add,dc          p128b, p128c, s128
        add,dc          p192c, p192d, s192
        ldd             8(rp), r064
        add,dc          p256d, %r0, climb
        ldd             16(rp), r128
        add             ma000, s000, s000       C accum mid 0
        ldd             24(rp), r192
        add,dc          ma064, s064, s064       C accum mid 1
        add,dc          ma128, s128, s128       C accum mid 2
        add,dc          ma192, s192, s192       C accum mid 3
        add,dc          ma256, climb, climb
        sub             r000, s000, s000        C accum rlimb 0
        sub,db          r064, s064, s064        C accum rlimb 1
        sub,db          r128, s128, s128        C accum rlimb 2
        std             s000, 0(rp)
        sub,db          r192, s192, s192        C accum rlimb 3
        sub,db          %r0, climb, climb
        sub             %r0, climb, climb
        std             s064, 8(rp)
        std             s128, 16(rp)
        std             s192, 24(rp)

C Restore the registers saved at BIG.  The short path enters at done and
C skips this because it never saved them.
        ldd             -0xb0(%r30), %r13
        ldd             -0xb8(%r30), %r12
        ldd             -0xc0(%r30), %r11
        ldd             -0xc8(%r30), %r10
        ldd             -0xd0(%r30), %r9
        ldd             -0xd8(%r30), %r8
        ldd             -0xe0(%r30), %r7
        ldd             -0xe8(%r30), %r6
LDEF(done)
ifdef(`HAVE_ABI_2_0w',
`       copy            climb, %r28
',`     extrd,u         climb, 63, 32, %r29
        extrd,u         climb, 31, 32, %r28
')
        ldd             -0xf0(%r30), %r5
        ldd             -0xf8(%r30), %r4
        bve             (%r2)
        ldd,mb          -0x100(%r30), %r3       C delay slot: restore %r3, release frame
EPILOGUE(mpn_submul_1)