// IA-64 (Itanium) runtime support routines, GNU assembler syntax.
//
// This file is run through the C preprocessor.  Every routine lives in
// its own "#ifdef L__<name>" section, so only the sections whose macro
// is defined are assembled.
//
// Register aliases used below (IA-64 software conventions, as provided
// by the assembler): farg0/farg1 and fret0 are floating-point argument
// and return registers (farg0 and fret0 are both f8, farg1 is f9);
// in0..in2 are the stacked input registers (r32..r34); ret0 is the
// integer return register.  f0 reads as 0.0, f1 as 1.0, r0 as 0.
//
// Notation in the per-instruction comments: a = dividend, b = divisor,
// y = reciprocal approximation of b, e = error term, q = quotient
// approximation, r = remainder.

#ifdef L__divtf3
// Compute an 80-bit IEEE double-extended quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// farg0 holds the dividend.  farg1 holds the divisor.
//
// frcpa writes a first approximation to f10 and sets p6 when the
// refinement sequence has to run.  p7 is made the complement of p6;
// when p7 is set, f10 is returned unchanged.
// Writes: f10-f14, fret0, p6, p7.

	.text
	.align 16
	.global __divtf3
	.proc __divtf3
__divtf3:
	cmp.eq p7, p0 = r0, r0			// p7 = 1
	frcpa.s0 f10, p6 = farg0, farg1		// y0 ~= 1/b
	;;
(p6)	cmp.ne p7, p0 = r0, r0			// p7 = 0 when p6 is set
	.pred.rel.mutex p6, p7
(p6)	fnma.s1 f11 = farg1, f10, f1		// e  = 1 - b*y0
(p6)	fma.s1 f12 = farg0, f10, f0		// q0 = a*y0
	;;
(p6)	fma.s1 f13 = f11, f11, f0		// e^2
(p6)	fma.s1 f14 = f11, f11, f11		// e^2 + e
	;;
(p6)	fma.s1 f11 = f13, f13, f11		// e^4 + e
(p6)	fma.s1 f13 = f14, f10, f10		// y1 = y0 + (e^2 + e)*y0
	;;
(p6)	fma.s1 f10 = f13, f11, f10		// y2 = y0 + y1*(e^4 + e)
(p6)	fnma.s1 f11 = farg1, f12, farg0		// r0 = a - b*q0
	;;
(p6)	fma.s1 f11 = f11, f10, f12		// q1 = q0 + r0*y2
(p6)	fnma.s1 f12 = farg1, f10, f1		// e2 = 1 - b*y2
	;;
(p6)	fma.s1 f10 = f12, f10, f10		// y3 = y2 + e2*y2
(p6)	fnma.s1 f12 = farg1, f11, farg0		// r1 = a - b*q1
	;;
(p6)	fma.s0 fret0 = f12, f10, f11		// q  = q1 + r1*y3
(p7)	mov fret0 = f10				// no refinement: return frcpa result
	br.ret.sptk rp
	.endp __divtf3
#endif

#ifdef L__divdf3
// Compute a 64-bit IEEE double quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// farg0 holds the dividend.  farg1 holds the divisor.
//
// Same p6/p7 protocol as __divtf3: p6 selects the refinement sequence,
// p7 returns the frcpa result in f10 unchanged.
// Writes: f8 (as a temporary, then as fret0), f10-f13, p6, p7.

	.text
	.align 16
	.global __divdf3
	.proc __divdf3
__divdf3:
	cmp.eq p7, p0 = r0, r0			// p7 = 1
	frcpa.s0 f10, p6 = farg0, farg1		// y0 ~= 1/b
	;;
(p6)	cmp.ne p7, p0 = r0, r0			// p7 = 0 when p6 is set
	.pred.rel.mutex p6, p7
(p6)	fmpy.s1 f11 = farg0, f10		// q0 = a*y0
(p6)	fnma.s1 f12 = farg1, f10, f1		// e  = 1 - b*y0
	;;
(p6)	fma.s1 f11 = f12, f11, f11		// q1 = q0 + e*q0
(p6)	fmpy.s1 f13 = f12, f12			// e2 = e*e
	;;
(p6)	fma.s1 f10 = f12, f10, f10		// y1 = y0 + e*y0
(p6)	fma.s1 f11 = f13, f11, f11		// q2 = q1 + e2*q1
	;;
(p6)	fmpy.s1 f12 = f13, f13			// e4 = e2*e2
(p6)	fma.s1 f10 = f13, f10, f10		// y2 = y1 + e2*y1
	;;
(p6)	fma.d.s1 f11 = f12, f11, f11		// q3 = q2 + e4*q2
(p6)	fma.s1 f10 = f12, f10, f10		// y3 = y2 + e4*y2
	;;
(p6)	fnma.d.s1 f8 = farg1, f11, farg0	// r  = a - b*q3 (f8 reused as scratch)
	;;
(p6)	fma.d fret0 = f8, f10, f11		// q  = q3 + r*y3
(p7)	mov fret0 = f10				// no refinement: return frcpa result
	br.ret.sptk rp
	;;
	.endp __divdf3
#endif

#ifdef L__divsf3
// Compute a 32-bit IEEE float quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// farg0 holds the dividend.  farg1 holds the divisor.
//
// Same p6/p7 protocol as __divtf3.  f8 and f9 are overwritten with
// intermediate values by the first refinement step.
// Writes: f8-f10, fret0, p6, p7.

	.text
	.align 16
	.global __divsf3
	.proc __divsf3
__divsf3:
	cmp.eq p7, p0 = r0, r0			// p7 = 1
	frcpa.s0 f10, p6 = farg0, farg1		// y0 ~= 1/b
	;;
(p6)	cmp.ne p7, p0 = r0, r0			// p7 = 0 when p6 is set
	.pred.rel.mutex p6, p7
(p6)	fmpy.s1 f8 = farg0, f10			// q0 = a*y0
(p6)	fnma.s1 f9 = farg1, f10, f1		// e  = 1 - b*y0
	;;
(p6)	fma.s1 f8 = f9, f8, f8			// q1 = q0 + e*q0
(p6)	fmpy.s1 f9 = f9, f9			// e2 = e*e
	;;
(p6)	fma.s1 f8 = f9, f8, f8			// q2 = q1 + e2*q1
(p6)	fmpy.s1 f9 = f9, f9			// e4 = e2*e2
	;;
(p6)	fma.d.s1 f10 = f9, f8, f8		// q3 = q2 + e4*q2
	;;
(p6)	fnorm.s.s0 fret0 = f10			// final rounding to single precision
(p7)	mov fret0 = f10				// no refinement: return frcpa result
	br.ret.sptk rp
	;;
	.endp __divsf3
#endif

#ifdef L__divdi3
// Compute a 64-bit integer quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend.  in1 holds the divisor.
//
// The divisor is not tested for zero.  When frcpa leaves p6 clear, the
// refinement is skipped and f10 goes straight to the conversion.
// Writes: f8-f13, p6, ret0.

	.text
	.align 16
	.global __divdi3
	.proc __divdi3
__divdi3:
	.regstk 2,0,0,0
	// Transfer inputs to FP registers.
	setf.sig f8 = in0
	setf.sig f9 = in1
	;;
	// Convert the inputs to FP, so that they won't be treated as unsigned.
	fcvt.xf f8 = f8
	fcvt.xf f9 = f9
	;;
	// Compute the reciprocal approximation.
	frcpa.s1 f10, p6 = f8, f9		// y0 ~= 1/b
	;;
	// 3 Newton-Raphson iterations.
(p6)	fnma.s1 f11 = f9, f10, f1		// e  = 1 - b*y0
(p6)	fmpy.s1 f12 = f8, f10			// q0 = a*y0
	;;
(p6)	fmpy.s1 f13 = f11, f11			// e2 = e*e
(p6)	fma.s1 f12 = f11, f12, f12		// q1 = q0 + e*q0
	;;
(p6)	fma.s1 f10 = f11, f10, f10		// y1 = y0 + e*y0
(p6)	fma.s1 f11 = f13, f12, f12		// q2 = q1 + e2*q1
	;;
(p6)	fma.s1 f10 = f13, f10, f10		// y2 = y1 + e2*y1
(p6)	fnma.s1 f12 = f9, f11, f8		// r  = a - b*q2
	;;
(p6)	fma.s1 f10 = f12, f10, f11		// q3 = q2 + r*y2
	;;
	// Round quotient to an integer.
	fcvt.fx.trunc.s1 f10 = f10
	;;
	// Transfer result to GP registers.
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __divdi3
#endif

#ifdef L__moddi3
// Compute a 64-bit integer modulus.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend (a).  in1 holds the divisor (b).
//
// The quotient is computed as in the 64-bit divide, then the remainder
// is formed with one integer multiply-add: r = q * (-b) + a.
// The divisor is not tested for zero.
// Writes: f8-f14, p6, in1 (negated in place), ret0.

	.text
	.align 16
	.global __moddi3
	.proc __moddi3
__moddi3:
	.regstk 2,0,0,0
	// Transfer inputs to FP registers.
	setf.sig f14 = in0			// f14 keeps the integer a for xma.l
	setf.sig f9 = in1
	;;
	// Convert the inputs to FP, so that they won't be treated as unsigned.
	fcvt.xf f8 = f14
	fcvt.xf f9 = f9
	;;
	// Compute the reciprocal approximation.
	frcpa.s1 f10, p6 = f8, f9		// y0 ~= 1/b
	;;
	// 3 Newton-Raphson iterations.
(p6)	fmpy.s1 f12 = f8, f10			// q0 = a*y0
(p6)	fnma.s1 f11 = f9, f10, f1		// e  = 1 - b*y0
	;;
(p6)	fma.s1 f12 = f11, f12, f12		// q1 = q0 + e*q0
(p6)	fmpy.s1 f13 = f11, f11			// e2 = e*e
	;;
(p6)	fma.s1 f10 = f11, f10, f10		// y1 = y0 + e*y0
(p6)	fma.s1 f11 = f13, f12, f12		// q2 = q1 + e2*q1
	;;
	sub in1 = r0, in1			// in1 = -b
(p6)	fma.s1 f10 = f13, f10, f10		// y2 = y1 + e2*y1
(p6)	fnma.s1 f12 = f9, f11, f8		// r  = a - b*q2 (last read of FP b)
	;;
	setf.sig f9 = in1			// f9 = integer -b
(p6)	fma.s1 f10 = f12, f10, f11		// q3 = q2 + r*y2
	;;
	fcvt.fx.trunc.s1 f10 = f10		// q = trunc(q3)
	;;
	// r = q * (-b) + a
	xma.l f10 = f10, f9, f14
	;;
	// Transfer result to GP registers.
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __moddi3
#endif

#ifdef L__udivdi3
// Compute a 64-bit unsigned integer quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend.  in1 holds the divisor.
//
// Same sequence as __divdi3 with unsigned conversions in both
// directions.  The divisor is not tested for zero.
// Writes: f8-f13, p6, ret0.

	.text
	.align 16
	.global __udivdi3
	.proc __udivdi3
__udivdi3:
	.regstk 2,0,0,0
	// Transfer inputs to FP registers.
	setf.sig f8 = in0
	setf.sig f9 = in1
	;;
	// Convert the inputs to FP, to avoid FP software-assist faults.
	fcvt.xuf.s1 f8 = f8
	fcvt.xuf.s1 f9 = f9
	;;
	// Compute the reciprocal approximation.
	frcpa.s1 f10, p6 = f8, f9		// y0 ~= 1/b
	;;
	// 3 Newton-Raphson iterations.
(p6)	fnma.s1 f11 = f9, f10, f1		// e  = 1 - b*y0
(p6)	fmpy.s1 f12 = f8, f10			// q0 = a*y0
	;;
(p6)	fmpy.s1 f13 = f11, f11			// e2 = e*e
(p6)	fma.s1 f12 = f11, f12, f12		// q1 = q0 + e*q0
	;;
(p6)	fma.s1 f10 = f11, f10, f10		// y1 = y0 + e*y0
(p6)	fma.s1 f11 = f13, f12, f12		// q2 = q1 + e2*q1
	;;
(p6)	fma.s1 f10 = f13, f10, f10		// y2 = y1 + e2*y1
(p6)	fnma.s1 f12 = f9, f11, f8		// r  = a - b*q2
	;;
(p6)	fma.s1 f10 = f12, f10, f11		// q3 = q2 + r*y2
	;;
	// Round quotient to an unsigned integer.
	fcvt.fxu.trunc.s1 f10 = f10
	;;
	// Transfer result to GP registers.
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __udivdi3
#endif

#ifdef L__umoddi3
// Compute a 64-bit unsigned integer modulus.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend (a).  in1 holds the divisor (b).
//
// Same sequence as __moddi3 with unsigned conversions.  The divisor
// is not tested for zero.
// Writes: f8-f14, p6, in1 (negated in place), ret0.

	.text
	.align 16
	.global __umoddi3
	.proc __umoddi3
__umoddi3:
	.regstk 2,0,0,0
	// Transfer inputs to FP registers.
	setf.sig f14 = in0			// f14 keeps the integer a for xma.l
	setf.sig f9 = in1
	;;
	// Convert the inputs to FP, to avoid FP software assist faults.
	fcvt.xuf.s1 f8 = f14
	fcvt.xuf.s1 f9 = f9
	;;
	// Compute the reciprocal approximation.
	frcpa.s1 f10, p6 = f8, f9		// y0 ~= 1/b
	;;
	// 3 Newton-Raphson iterations.
(p6)	fmpy.s1 f12 = f8, f10			// q0 = a*y0
(p6)	fnma.s1 f11 = f9, f10, f1		// e  = 1 - b*y0
	;;
(p6)	fma.s1 f12 = f11, f12, f12		// q1 = q0 + e*q0
(p6)	fmpy.s1 f13 = f11, f11			// e2 = e*e
	;;
(p6)	fma.s1 f10 = f11, f10, f10		// y1 = y0 + e*y0
(p6)	fma.s1 f11 = f13, f12, f12		// q2 = q1 + e2*q1
	;;
	sub in1 = r0, in1			// in1 = -b
(p6)	fma.s1 f10 = f13, f10, f10		// y2 = y1 + e2*y1
(p6)	fnma.s1 f12 = f9, f11, f8		// r  = a - b*q2 (last read of FP b)
	;;
	setf.sig f9 = in1			// f9 = integer -b
(p6)	fma.s1 f10 = f12, f10, f11		// q3 = q2 + r*y2
	;;
	// Round quotient to an unsigned integer.
	fcvt.fxu.trunc.s1 f10 = f10
	;;
	// r = q * (-b) + a
	xma.l f10 = f10, f9, f14
	;;
	// Transfer result to GP registers.
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __umoddi3
#endif

#ifdef L__divsi3
// Compute a 32-bit integer quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend.  in1 holds the divisor.
//
// Both inputs are sign-extended from 32 bits first.  0x0ffdd is loaded
// into f11 with setf.exp; with the register-format exponent bias of
// 0xffff that is 2^-34, which is added to the squared error term in
// the last refinement step.  The divisor is not tested for zero.
// Writes: r2, in0, in1, f8-f11, p6, ret0.

	.text
	.align 16
	.global __divsi3
	.proc __divsi3
__divsi3:
	.regstk 2,0,0,0
	sxt4 in0 = in0
	sxt4 in1 = in1
	;;
	setf.sig f8 = in0
	setf.sig f9 = in1
	;;
	mov r2 = 0x0ffdd			// biased exponent of 2^-34
	fcvt.xf f8 = f8
	fcvt.xf f9 = f9
	;;
	setf.exp f11 = r2			// f11 = 2^-34
	frcpa.s1 f10, p6 = f8, f9		// y0 ~= 1/b
	;;
(p6)	fmpy.s1 f8 = f8, f10			// q0 = a*y0
(p6)	fnma.s1 f9 = f9, f10, f1		// e  = 1 - b*y0
	;;
(p6)	fma.s1 f8 = f9, f8, f8			// q1 = q0 + e*q0
(p6)	fma.s1 f9 = f9, f9, f11			// e1 = e*e + 2^-34
	;;
(p6)	fma.s1 f10 = f9, f8, f8			// q2 = q1 + e1*q1
	;;
	fcvt.fx.trunc.s1 f10 = f10
	;;
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __divsi3
#endif

#ifdef L__modsi3
// Compute a 32-bit integer modulus.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend.  in1 holds the divisor.
//
// The quotient is computed as in the 32-bit divide (including the
// 2^-34 term loaded from 0x0ffdd), then r = q * (-b) + a is formed
// with xma.l.  The divisor is not tested for zero.
// Writes: r2, in0, in1, f8-f13, p6, ret0.

	.text
	.align 16
	.global __modsi3
	.proc __modsi3
__modsi3:
	.regstk 2,0,0,0
	mov r2 = 0x0ffdd			// biased exponent of 2^-34
	sxt4 in0 = in0
	sxt4 in1 = in1
	;;
	setf.sig f13 = in0			// f13 keeps the integer a for xma.l
	setf.sig f9 = in1
	;;
	sub in1 = r0, in1			// in1 = -b
	fcvt.xf f8 = f13
	fcvt.xf f9 = f9
	;;
	setf.exp f11 = r2			// f11 = 2^-34
	frcpa.s1 f10, p6 = f8, f9		// y0 ~= 1/b
	;;
(p6)	fmpy.s1 f12 = f8, f10			// q0 = a*y0
(p6)	fnma.s1 f10 = f9, f10, f1		// e  = 1 - b*y0 (last read of FP b)
	;;
	setf.sig f9 = in1			// f9 = integer -b
(p6)	fma.s1 f12 = f10, f12, f12		// q1 = q0 + e*q0
(p6)	fma.s1 f10 = f10, f10, f11		// e1 = e*e + 2^-34
	;;
(p6)	fma.s1 f10 = f10, f12, f12		// q2 = q1 + e1*q1
	;;
	fcvt.fx.trunc.s1 f10 = f10
	;;
	xma.l f10 = f10, f9, f13		// r = q * (-b) + a
	;;
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __modsi3
#endif

#ifdef L__udivsi3
// Compute a 32-bit unsigned integer quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend.  in1 holds the divisor.
//
// Both inputs are zero-extended from 32 bits first, so the signed
// fcvt.xf conversion sees non-negative 64-bit values.  The divisor is
// not tested for zero.
// Writes: r2, in0, in1, f8-f11, p6, ret0.

	.text
	.align 16
	.global __udivsi3
	.proc __udivsi3
__udivsi3:
	.regstk 2,0,0,0
	mov r2 = 0x0ffdd			// biased exponent of 2^-34
	zxt4 in0 = in0
	zxt4 in1 = in1
	;;
	setf.sig f8 = in0
	setf.sig f9 = in1
	;;
	fcvt.xf f8 = f8
	fcvt.xf f9 = f9
	;;
	setf.exp f11 = r2			// f11 = 2^-34
	frcpa.s1 f10, p6 = f8, f9		// y0 ~= 1/b
	;;
(p6)	fmpy.s1 f8 = f8, f10			// q0 = a*y0
(p6)	fnma.s1 f9 = f9, f10, f1		// e  = 1 - b*y0
	;;
(p6)	fma.s1 f8 = f9, f8, f8			// q1 = q0 + e*q0
(p6)	fma.s1 f9 = f9, f9, f11			// e1 = e*e + 2^-34
	;;
(p6)	fma.s1 f10 = f9, f8, f8			// q2 = q1 + e1*q1
	;;
	fcvt.fxu.trunc.s1 f10 = f10
	;;
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __udivsi3
#endif

#ifdef L__umodsi3
// Compute a 32-bit unsigned integer modulus.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend.  in1 holds the divisor.
//
// Same sequence as __modsi3 with zero-extended inputs and an unsigned
// final conversion.  The divisor is not tested for zero.
// Writes: r2, in0, in1, f8-f13, p6, ret0.

	.text
	.align 16
	.global __umodsi3
	.proc __umodsi3
__umodsi3:
	.regstk 2,0,0,0
	mov r2 = 0x0ffdd			// biased exponent of 2^-34
	zxt4 in0 = in0
	zxt4 in1 = in1
	;;
	setf.sig f13 = in0			// f13 keeps the integer a for xma.l
	setf.sig f9 = in1
	;;
	sub in1 = r0, in1			// in1 = -b
	fcvt.xf f8 = f13
	fcvt.xf f9 = f9
	;;
	setf.exp f11 = r2			// f11 = 2^-34
	frcpa.s1 f10, p6 = f8, f9		// y0 ~= 1/b
	;;
(p6)	fmpy.s1 f12 = f8, f10			// q0 = a*y0
(p6)	fnma.s1 f10 = f9, f10, f1		// e  = 1 - b*y0 (last read of FP b)
	;;
	setf.sig f9 = in1			// f9 = integer -b
(p6)	fma.s1 f12 = f10, f12, f12		// q1 = q0 + e*q0
(p6)	fma.s1 f10 = f10, f10, f11		// e1 = e*e + 2^-34
	;;
(p6)	fma.s1 f10 = f10, f12, f12		// q2 = q1 + e1*q1
	;;
	fcvt.fxu.trunc.s1 f10 = f10
	;;
	xma.l f10 = f10, f9, f13		// r = q * (-b) + a
	;;
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __umodsi3
#endif

#ifdef L__save_stack_nonlocal
// Notes on save/restore stack nonlocal: We read ar.bsp but write
// ar.bspstore.  This is because ar.bsp can be read at all times
// (independent of the RSE mode) but since it's read-only we need to
// restore the value via ar.bspstore.  This is OK because
// ar.bsp==ar.bspstore after executing "flushrs".

// void __ia64_save_stack_nonlocal(void *save_area, void *stack_pointer)
//
// Save-area layout written here (8-byte slots):
//	+0	stack_pointer (in1)
//	+8	ar.bsp
//	+16	ar.rnat
//	+24	ar.pfs as returned by alloc
// ar.rsc is masked with 0x1c while bsp/rnat are read and has its low
// two bits set to 0x3 again before returning.
// Writes: r2, r16-r19, in0.

	.text
	.align 16
	.global __ia64_save_stack_nonlocal
	.proc __ia64_save_stack_nonlocal
__ia64_save_stack_nonlocal:
	{ .mmf
	  alloc r18 = ar.pfs, 2, 0, 0, 0	// r18 = ar.pfs
	  mov r19 = ar.rsc
	  ;;
	}
	{ .mmi
	  flushrs
	  st8 [in0] = in1, 24			// save[0] = sp; in0 -> save+24
	  and r19 = 0x1c, r19			// clear the low two bits of rsc
	  ;;
	}
	{ .mmi
	  st8 [in0] = r18, -16			// save[24] = pfs; in0 -> save+8
	  mov ar.rsc = r19
	  or r19 = 0x3, r19			// value to put back on exit
	  ;;
	}
	{ .mmi
	  mov r16 = ar.bsp
	  mov r17 = ar.rnat
	  adds r2 = 8, in0			// r2 -> save+16
	  ;;
	}
	{ .mmi
	  st8 [in0] = r16			// save[8]  = bsp
	  st8 [r2] = r17			// save[16] = rnat
	}
	{ .mib
	  mov ar.rsc = r19
	  br.ret.sptk.few rp
	  ;;
	}
	.endp __ia64_save_stack_nonlocal
#endif

#ifdef L__nonlocal_goto
// void __ia64_nonlocal_goto(void *target_label, void *save_area,
//			     void *static_chain);
//
// Reloads sp (r12), ar.bspstore, ar.rnat and ar.pfs from save_area in
// the slot order sp, bsp, rnat, pfs, copies static_chain to r15, and
// returns to target_label by loading it into rp.
// Writes: r12, r15-r20, in1, rp, ar.rsc, ar.bspstore, ar.rnat, ar.pfs.

	.text
	.align 16
	.global __ia64_nonlocal_goto
	.proc __ia64_nonlocal_goto
__ia64_nonlocal_goto:
	{ .mmi
	  alloc r20 = ar.pfs, 3, 0, 0, 0
	  ld8 r12 = [in1], 8			// sp   = save[0]
	  mov.ret.sptk rp = in0, .L0		// return address = target_label
	  ;;
	}
	{ .mmf
	  ld8 r16 = [in1], 8			// bsp  = save[8]
	  mov r19 = ar.rsc
	  ;;
	}
	{ .mmi
	  flushrs
	  ld8 r17 = [in1], 8			// rnat = save[16]
	  and r19 = 0x1c, r19			// clear the low two bits of rsc
	  ;;
	}
	{ .mmi
	  ld8 r18 = [in1]			// pfs  = save[24]
	  mov ar.rsc = r19
	  or r19 = 0x3, r19			// value to put back on exit
	  ;;
	}
	{ .mmi
	  mov ar.bspstore = r16
	  ;;
	  mov ar.rnat = r17
	  ;;
	}
	{ .mmi
	  loadrs
	  invala
	  mov r15 = in2				// static chain for the target
	  ;;
	}
.L0:	{ .mib
	  mov ar.rsc = r19
	  mov ar.pfs = r18
	  br.ret.sptk.few rp
	  ;;
	}
	.endp __ia64_nonlocal_goto
#endif

#ifdef L__restore_stack_nonlocal
// This is mostly the same as nonlocal_goto above.
// ??? This has not been tested yet.

// void __ia64_restore_stack_nonlocal(void *save_area)
//
// Reloads sp (r12), ar.bspstore, ar.rnat and ar.pfs from save_area in
// the slot order sp, bsp, rnat, pfs, then returns to the caller.
// NOTE(review): alloc declares 4 input registers although the
// prototype has a single argument -- confirm whether that is intended.
// Writes: r12, r16-r20, in0, ar.rsc, ar.bspstore, ar.rnat, ar.pfs.

	.text
	.align 16
	.global __ia64_restore_stack_nonlocal
	.proc __ia64_restore_stack_nonlocal
__ia64_restore_stack_nonlocal:
	{ .mmf
	  alloc r20 = ar.pfs, 4, 0, 0, 0
	  ld8 r12 = [in0], 8			// sp   = save[0]
	  ;;
	}
	{ .mmb
	  ld8 r16=[in0], 8			// bsp  = save[8]
	  mov r19 = ar.rsc
	  ;;
	}
	{ .mmi
	  flushrs
	  ld8 r17 = [in0], 8			// rnat = save[16]
	  and r19 = 0x1c, r19			// clear the low two bits of rsc
	  ;;
	}
	{ .mmf
	  ld8 r18 = [in0]			// pfs  = save[24]
	  mov ar.rsc = r19
	  ;;
	}
	{ .mmi
	  mov ar.bspstore = r16
	  ;;
	  mov ar.rnat = r17
	  or r19 = 0x3, r19			// value to put back on exit
	  ;;
	}
	{ .mmf
	  loadrs
	  invala
	  ;;
	}
	{ .mib
	  mov ar.rsc = r19
	  mov ar.pfs = r18
	  br.ret.sptk.few rp
	  ;;
	}
	.endp __ia64_restore_stack_nonlocal
#endif

#ifdef L__trampoline
// Implement the nested function trampoline.  This is out of line
// so that we don't have to bother with flushing the icache, as
// well as making the on-stack trampoline smaller.
//
// The trampoline has the following form:
//
//		+-------------------+ >
//	TRAMP:	| __ia64_trampoline | |
//		+-------------------+  > fake function descriptor
//		| TRAMP+16          | |
//		+-------------------+ >
//		| target descriptor |
//		+-------------------+
//		| static link       |
//		+-------------------+
//
// On entry r1 holds the second word of the fake descriptor, i.e.
// TRAMP+16.  The two words stored there are loaded into r2 (target
// descriptor) and r15 (static link); the two words of the target
// descriptor are then loaded into r3/b6 and r1, and control is
// transferred through b6.
// Writes: r1, r2, r3, r15, b6.

	.text
	.align 16
	.global __ia64_trampoline
	.proc __ia64_trampoline
__ia64_trampoline:
	{ .mmi
	  ld8 r2 = [r1], 8			// r2 = target descriptor address
	  ;;
	  ld8 r15 = [r1]			// r15 = static link
	}
	{ .mmi
	  ld8 r3 = [r2], 8			// r3 = first word of target descriptor
	  ;;
	  ld8 r1 = [r2]				// r1 = second word of target descriptor
	  mov b6 = r3
	}
	{ .bbb
	  br.sptk.many b6
	  ;;
	}
	.endp __ia64_trampoline
#endif