rsaz-avx2.pl (290207) | rsaz-avx2.pl (296279) |
---|---|
1#!/usr/bin/env perl 2 3############################################################################## 4# # 5# Copyright (c) 2012, Intel Corporation # 6# # 7# All rights reserved. # 8# # --- 429 unchanged lines hidden (view full) --- 438 jnz .LOOP_SQR_1024 439___ 440$ZERO = $ACC9; 441$TEMP0 = $B1; 442$TEMP2 = $B2; 443$TEMP3 = $Y1; 444$TEMP4 = $Y2; 445$code.=<<___; | 1#!/usr/bin/env perl 2 3############################################################################## 4# # 5# Copyright (c) 2012, Intel Corporation # 6# # 7# All rights reserved. # 8# # --- 429 unchanged lines hidden (view full) --- 438 jnz .LOOP_SQR_1024 439___ 440$ZERO = $ACC9; 441$TEMP0 = $B1; 442$TEMP2 = $B2; 443$TEMP3 = $Y1; 444$TEMP4 = $Y2; 445$code.=<<___; |
446 #we need to fix indexes 32-39 to avoid overflow | 446 # we need to fix indices 32-39 to avoid overflow |
447 vmovdqu 32*8(%rsp), $ACC8 # 32*8-192($tp0), 448 vmovdqu 32*9(%rsp), $ACC1 # 32*9-192($tp0) 449 vmovdqu 32*10(%rsp), $ACC2 # 32*10-192($tp0) 450 lea 192(%rsp), $tp0 # 64+128=192 451 452 vpsrlq \$29, $ACC8, $TEMP1 453 vpand $AND_MASK, $ACC8, $ACC8 454 vpsrlq \$29, $ACC1, $TEMP2 --- 1132 unchanged lines hidden (view full) --- 1587 vzeroupper 1588 ret 1589.size rsaz_1024_scatter5_avx2,.-rsaz_1024_scatter5_avx2 1590 1591.globl rsaz_1024_gather5_avx2 1592.type rsaz_1024_gather5_avx2,\@abi-omnipotent 1593.align 32 1594rsaz_1024_gather5_avx2: | 447 vmovdqu 32*8(%rsp), $ACC8 # 32*8-192($tp0), 448 vmovdqu 32*9(%rsp), $ACC1 # 32*9-192($tp0) 449 vmovdqu 32*10(%rsp), $ACC2 # 32*10-192($tp0) 450 lea 192(%rsp), $tp0 # 64+128=192 451 452 vpsrlq \$29, $ACC8, $TEMP1 453 vpand $AND_MASK, $ACC8, $ACC8 454 vpsrlq \$29, $ACC1, $TEMP2 --- 1132 unchanged lines hidden (view full) --- 1587 vzeroupper 1588 ret 1589.size rsaz_1024_scatter5_avx2,.-rsaz_1024_scatter5_avx2 1590 1591.globl rsaz_1024_gather5_avx2 1592.type rsaz_1024_gather5_avx2,\@abi-omnipotent 1593.align 32 1594rsaz_1024_gather5_avx2: |
| | 1595 vzeroupper 1596 mov %rsp,%r11 |
1595___ 1596$code.=<<___ if ($win64); 1597 lea -0x88(%rsp),%rax | 1597___ 1598$code.=<<___ if ($win64); 1599 lea -0x88(%rsp),%rax |
1598 vzeroupper | |
1599.LSEH_begin_rsaz_1024_gather5: 1600 # I can't trust assembler to use specific encoding:-( | 1600.LSEH_begin_rsaz_1024_gather5: 1601 # I can't trust assembler to use specific encoding:-( |
1601 .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax),%rsp 1602 .byte 0xc5,0xf8,0x29,0x70,0xe0 #vmovaps %xmm6,-0x20(%rax) 1603 .byte 0xc5,0xf8,0x29,0x78,0xf0 #vmovaps %xmm7,-0x10(%rax) 1604 .byte 0xc5,0x78,0x29,0x40,0x00 #vmovaps %xmm8,0(%rax) 1605 .byte 0xc5,0x78,0x29,0x48,0x10 #vmovaps %xmm9,0x10(%rax) 1606 .byte 0xc5,0x78,0x29,0x50,0x20 #vmovaps %xmm10,0x20(%rax) 1607 .byte 0xc5,0x78,0x29,0x58,0x30 #vmovaps %xmm11,0x30(%rax) 1608 .byte 0xc5,0x78,0x29,0x60,0x40 #vmovaps %xmm12,0x40(%rax) 1609 .byte 0xc5,0x78,0x29,0x68,0x50 #vmovaps %xmm13,0x50(%rax) 1610 .byte 0xc5,0x78,0x29,0x70,0x60 #vmovaps %xmm14,0x60(%rax) 1611 .byte 0xc5,0x78,0x29,0x78,0x70 #vmovaps %xmm15,0x70(%rax) | 1602 .byte 0x48,0x8d,0x60,0xe0 # lea -0x20(%rax),%rsp 1603 .byte 0xc5,0xf8,0x29,0x70,0xe0 # vmovaps %xmm6,-0x20(%rax) 1604 .byte 0xc5,0xf8,0x29,0x78,0xf0 # vmovaps %xmm7,-0x10(%rax) 1605 .byte 0xc5,0x78,0x29,0x40,0x00 # vmovaps %xmm8,0(%rax) 1606 .byte 0xc5,0x78,0x29,0x48,0x10 # vmovaps %xmm9,0x10(%rax) 1607 .byte 0xc5,0x78,0x29,0x50,0x20 # vmovaps %xmm10,0x20(%rax) 1608 .byte 0xc5,0x78,0x29,0x58,0x30 # vmovaps %xmm11,0x30(%rax) 1609 .byte 0xc5,0x78,0x29,0x60,0x40 # vmovaps %xmm12,0x40(%rax) 1610 .byte 0xc5,0x78,0x29,0x68,0x50 # vmovaps %xmm13,0x50(%rax) 1611 .byte 0xc5,0x78,0x29,0x70,0x60 # vmovaps %xmm14,0x60(%rax) 1612 .byte 0xc5,0x78,0x29,0x78,0x70 # vmovaps %xmm15,0x70(%rax) |
1612___ 1613$code.=<<___; | 1613___ 1614$code.=<<___; |
1614 lea .Lgather_table(%rip),%r11 1615 mov $power,%eax 1616 and \$3,$power 1617 shr \$2,%eax # cache line number 1618 shl \$4,$power # offset within cache line | 1615 lea -0x100(%rsp),%rsp 1616 and \$-32, %rsp 1617 lea .Linc(%rip), %r10 1618 lea -128(%rsp),%rax # control u-op density |
1619 | 1619 |
1620 vmovdqu -32(%r11),%ymm7 # .Lgather_permd 1621 vpbroadcastb 8(%r11,%rax), %xmm8 1622 vpbroadcastb 7(%r11,%rax), %xmm9 1623 vpbroadcastb 6(%r11,%rax), %xmm10 1624 vpbroadcastb 5(%r11,%rax), %xmm11 1625 vpbroadcastb 4(%r11,%rax), %xmm12 1626 vpbroadcastb 3(%r11,%rax), %xmm13 1627 vpbroadcastb 2(%r11,%rax), %xmm14 1628 vpbroadcastb 1(%r11,%rax), %xmm15 | 1620 vmovd $power, %xmm4 1621 vmovdqa (%r10),%ymm0 1622 vmovdqa 32(%r10),%ymm1 1623 vmovdqa 64(%r10),%ymm5 1624 vpbroadcastd %xmm4,%ymm4 |
1629 | 1625 |
1630 lea 64($inp,$power),$inp 1631 mov \$64,%r11 # size optimization 1632 mov \$9,%eax 1633 jmp .Loop_gather_1024 | 1626 vpaddd %ymm5, %ymm0, %ymm2 1627 vpcmpeqd %ymm4, %ymm0, %ymm0 1628 vpaddd %ymm5, %ymm1, %ymm3 1629 vpcmpeqd %ymm4, %ymm1, %ymm1 1630 vmovdqa %ymm0, 32*0+128(%rax) 1631 vpaddd %ymm5, %ymm2, %ymm0 1632 vpcmpeqd %ymm4, %ymm2, %ymm2 1633 vmovdqa %ymm1, 32*1+128(%rax) 1634 vpaddd %ymm5, %ymm3, %ymm1 1635 vpcmpeqd %ymm4, %ymm3, %ymm3 1636 vmovdqa %ymm2, 32*2+128(%rax) 1637 vpaddd %ymm5, %ymm0, %ymm2 1638 vpcmpeqd %ymm4, %ymm0, %ymm0 1639 vmovdqa %ymm3, 32*3+128(%rax) 1640 vpaddd %ymm5, %ymm1, %ymm3 1641 vpcmpeqd %ymm4, %ymm1, %ymm1 1642 vmovdqa %ymm0, 32*4+128(%rax) 1643 vpaddd %ymm5, %ymm2, %ymm8 1644 vpcmpeqd %ymm4, %ymm2, %ymm2 1645 vmovdqa %ymm1, 32*5+128(%rax) 1646 vpaddd %ymm5, %ymm3, %ymm9 1647 vpcmpeqd %ymm4, %ymm3, %ymm3 1648 vmovdqa %ymm2, 32*6+128(%rax) 1649 vpaddd %ymm5, %ymm8, %ymm10 1650 vpcmpeqd %ymm4, %ymm8, %ymm8 1651 vmovdqa %ymm3, 32*7+128(%rax) 1652 vpaddd %ymm5, %ymm9, %ymm11 1653 vpcmpeqd %ymm4, %ymm9, %ymm9 1654 vpaddd %ymm5, %ymm10, %ymm12 1655 vpcmpeqd %ymm4, %ymm10, %ymm10 1656 vpaddd %ymm5, %ymm11, %ymm13 1657 vpcmpeqd %ymm4, %ymm11, %ymm11 1658 vpaddd %ymm5, %ymm12, %ymm14 1659 vpcmpeqd %ymm4, %ymm12, %ymm12 1660 vpaddd %ymm5, %ymm13, %ymm15 1661 vpcmpeqd %ymm4, %ymm13, %ymm13 1662 vpcmpeqd %ymm4, %ymm14, %ymm14 1663 vpcmpeqd %ymm4, %ymm15, %ymm15 |
1634 | 1664 |
1635.align 32 | 1665 vmovdqa -32(%r10),%ymm7 # .Lgather_permd 1666 lea 128($inp), $inp 1667 mov \$9,$power 1668 |
1636.Loop_gather_1024: | 1669.Loop_gather_1024: |
1637 vpand -64($inp), %xmm8,%xmm0 1638 vpand ($inp), %xmm9,%xmm1 1639 vpand 64($inp), %xmm10,%xmm2 1640 vpand ($inp,%r11,2), %xmm11,%xmm3 1641 vpor %xmm0,%xmm1,%xmm1 1642 vpand 64($inp,%r11,2), %xmm12,%xmm4 1643 vpor %xmm2,%xmm3,%xmm3 1644 vpand ($inp,%r11,4), %xmm13,%xmm5 1645 vpor %xmm1,%xmm3,%xmm3 1646 vpand 64($inp,%r11,4), %xmm14,%xmm6 1647 vpor %xmm4,%xmm5,%xmm5 1648 vpand -128($inp,%r11,8), %xmm15,%xmm2 1649 lea ($inp,%r11,8),$inp 1650 vpor %xmm3,%xmm5,%xmm5 1651 vpor %xmm2,%xmm6,%xmm6 1652 vpor %xmm5,%xmm6,%xmm6 1653 vpermd %ymm6,%ymm7,%ymm6 1654 vmovdqu %ymm6,($out) | 1670 vmovdqa 32*0-128($inp), %ymm0 1671 vmovdqa 32*1-128($inp), %ymm1 1672 vmovdqa 32*2-128($inp), %ymm2 1673 vmovdqa 32*3-128($inp), %ymm3 1674 vpand 32*0+128(%rax), %ymm0, %ymm0 1675 vpand 32*1+128(%rax), %ymm1, %ymm1 1676 vpand 32*2+128(%rax), %ymm2, %ymm2 1677 vpor %ymm0, %ymm1, %ymm4 1678 vpand 32*3+128(%rax), %ymm3, %ymm3 1679 vmovdqa 32*4-128($inp), %ymm0 1680 vmovdqa 32*5-128($inp), %ymm1 1681 vpor %ymm2, %ymm3, %ymm5 1682 vmovdqa 32*6-128($inp), %ymm2 1683 vmovdqa 32*7-128($inp), %ymm3 1684 vpand 32*4+128(%rax), %ymm0, %ymm0 1685 vpand 32*5+128(%rax), %ymm1, %ymm1 1686 vpand 32*6+128(%rax), %ymm2, %ymm2 1687 vpor %ymm0, %ymm4, %ymm4 1688 vpand 32*7+128(%rax), %ymm3, %ymm3 1689 vpand 32*8-128($inp), %ymm8, %ymm0 1690 vpor %ymm1, %ymm5, %ymm5 1691 vpand 32*9-128($inp), %ymm9, %ymm1 1692 vpor %ymm2, %ymm4, %ymm4 1693 vpand 32*10-128($inp),%ymm10, %ymm2 1694 vpor %ymm3, %ymm5, %ymm5 1695 vpand 32*11-128($inp),%ymm11, %ymm3 1696 vpor %ymm0, %ymm4, %ymm4 1697 vpand 32*12-128($inp),%ymm12, %ymm0 1698 vpor %ymm1, %ymm5, %ymm5 1699 vpand 32*13-128($inp),%ymm13, %ymm1 1700 vpor %ymm2, %ymm4, %ymm4 1701 vpand 32*14-128($inp),%ymm14, %ymm2 1702 vpor %ymm3, %ymm5, %ymm5 1703 vpand 32*15-128($inp),%ymm15, %ymm3 1704 lea 32*16($inp), $inp 1705 vpor %ymm0, %ymm4, %ymm4 1706 vpor %ymm1, %ymm5, %ymm5 1707 vpor %ymm2, %ymm4, %ymm4 1708 vpor %ymm3, %ymm5, %ymm5 1709 1710 vpor %ymm5, %ymm4, %ymm4 1711 vextracti128 \$1, %ymm4, %xmm5 # upper half is cleared 1712 vpor %xmm4, %xmm5, %xmm5 1713 vpermd %ymm5,%ymm7,%ymm5 1714 vmovdqu %ymm5,($out) |
1655 lea 32($out),$out | 1715 lea 32($out),$out |
1656 dec %eax | 1716 dec $power |
1657 jnz .Loop_gather_1024 1658 1659 vpxor %ymm0,%ymm0,%ymm0 1660 vmovdqu %ymm0,($out) 1661 vzeroupper 1662___ 1663$code.=<<___ if ($win64); | 1717 jnz .Loop_gather_1024 1718 1719 vpxor %ymm0,%ymm0,%ymm0 1720 vmovdqu %ymm0,($out) 1721 vzeroupper 1722___ 1723$code.=<<___ if ($win64); |
1664 movaps (%rsp),%xmm6 1665 movaps 0x10(%rsp),%xmm7 1666 movaps 0x20(%rsp),%xmm8 1667 movaps 0x30(%rsp),%xmm9 1668 movaps 0x40(%rsp),%xmm10 1669 movaps 0x50(%rsp),%xmm11 1670 movaps 0x60(%rsp),%xmm12 1671 movaps 0x70(%rsp),%xmm13 1672 movaps 0x80(%rsp),%xmm14 1673 movaps 0x90(%rsp),%xmm15 1674 lea 0xa8(%rsp),%rsp | 1724 movaps -0xa8(%r11),%xmm6 1725 movaps -0x98(%r11),%xmm7 1726 movaps -0x88(%r11),%xmm8 1727 movaps -0x78(%r11),%xmm9 1728 movaps -0x68(%r11),%xmm10 1729 movaps -0x58(%r11),%xmm11 1730 movaps -0x48(%r11),%xmm12 1731 movaps -0x38(%r11),%xmm13 1732 movaps -0x28(%r11),%xmm14 1733 movaps -0x18(%r11),%xmm15 |
1675.LSEH_end_rsaz_1024_gather5: 1676___ 1677$code.=<<___; | 1734.LSEH_end_rsaz_1024_gather5: 1735___ 1736$code.=<<___; |
| | 1737 lea (%r11),%rsp |
1678 ret 1679.size rsaz_1024_gather5_avx2,.-rsaz_1024_gather5_avx2 1680___ 1681} 1682 1683$code.=<<___; 1684.extern OPENSSL_ia32cap_P 1685.globl rsaz_avx2_eligible --- 17 unchanged lines hidden (view full) --- 1703 1704.align 64 1705.Land_mask: 1706 .quad 0x1fffffff,0x1fffffff,0x1fffffff,-1 1707.Lscatter_permd: 1708 .long 0,2,4,6,7,7,7,7 1709.Lgather_permd: 1710 .long 0,7,1,7,2,7,3,7 | 1738 ret 1739.size rsaz_1024_gather5_avx2,.-rsaz_1024_gather5_avx2 1740___ 1741} 1742 1743$code.=<<___; 1744.extern OPENSSL_ia32cap_P 1745.globl rsaz_avx2_eligible --- 17 unchanged lines hidden (view full) --- 1763 1764.align 64 1765.Land_mask: 1766 .quad 0x1fffffff,0x1fffffff,0x1fffffff,-1 1767.Lscatter_permd: 1768 .long 0,2,4,6,7,7,7,7 1769.Lgather_permd: 1770 .long 0,7,1,7,2,7,3,7 |
1711.Lgather_table: 1712 .byte 0,0,0,0,0,0,0,0, 0xff,0,0,0,0,0,0,0 | 1771.Linc: 1772 .long 0,0,0,0, 1,1,1,1 1773 .long 2,2,2,2, 3,3,3,3 1774 .long 4,4,4,4, 4,4,4,4 |
1713.align 64 1714___ 1715 1716if ($win64) { 1717$rec="%rcx"; 1718$frame="%rdx"; 1719$context="%r8"; 1720$disp="%r9"; --- 111 unchanged lines hidden (view full) --- 1832 .byte 9,0,0,0 1833 .rva rsaz_se_handler 1834 .rva .Lsqr_1024_body,.Lsqr_1024_epilogue 1835.LSEH_info_rsaz_1024_mul_avx2: 1836 .byte 9,0,0,0 1837 .rva rsaz_se_handler 1838 .rva .Lmul_1024_body,.Lmul_1024_epilogue 1839.LSEH_info_rsaz_1024_gather5: | 1775.align 64 1776___ 1777 1778if ($win64) { 1779$rec="%rcx"; 1780$frame="%rdx"; 1781$context="%r8"; 1782$disp="%r9"; --- 111 unchanged lines hidden (view full) --- 1894 .byte 9,0,0,0 1895 .rva rsaz_se_handler 1896 .rva .Lsqr_1024_body,.Lsqr_1024_epilogue 1897.LSEH_info_rsaz_1024_mul_avx2: 1898 .byte 9,0,0,0 1899 .rva rsaz_se_handler 1900 .rva .Lmul_1024_body,.Lmul_1024_epilogue 1901.LSEH_info_rsaz_1024_gather5: |
1840 .byte 0x01,0x33,0x16,0x00 1841 .byte 0x36,0xf8,0x09,0x00 #vmovaps 0x90(rsp),xmm15 1842 .byte 0x31,0xe8,0x08,0x00 #vmovaps 0x80(rsp),xmm14 1843 .byte 0x2c,0xd8,0x07,0x00 #vmovaps 0x70(rsp),xmm13 1844 .byte 0x27,0xc8,0x06,0x00 #vmovaps 0x60(rsp),xmm12 1845 .byte 0x22,0xb8,0x05,0x00 #vmovaps 0x50(rsp),xmm11 1846 .byte 0x1d,0xa8,0x04,0x00 #vmovaps 0x40(rsp),xmm10 1847 .byte 0x18,0x98,0x03,0x00 #vmovaps 0x30(rsp),xmm9 1848 .byte 0x13,0x88,0x02,0x00 #vmovaps 0x20(rsp),xmm8 1849 .byte 0x0e,0x78,0x01,0x00 #vmovaps 0x10(rsp),xmm7 1850 .byte 0x09,0x68,0x00,0x00 #vmovaps 0x00(rsp),xmm6 1851 .byte 0x04,0x01,0x15,0x00 #sub rsp,0xa8 | 1902 .byte 0x01,0x36,0x17,0x0b 1903 .byte 0x36,0xf8,0x09,0x00 # vmovaps 0x90(rsp),xmm15 1904 .byte 0x31,0xe8,0x08,0x00 # vmovaps 0x80(rsp),xmm14 1905 .byte 0x2c,0xd8,0x07,0x00 # vmovaps 0x70(rsp),xmm13 1906 .byte 0x27,0xc8,0x06,0x00 # vmovaps 0x60(rsp),xmm12 1907 .byte 0x22,0xb8,0x05,0x00 # vmovaps 0x50(rsp),xmm11 1908 .byte 0x1d,0xa8,0x04,0x00 # vmovaps 0x40(rsp),xmm10 1909 .byte 0x18,0x98,0x03,0x00 # vmovaps 0x30(rsp),xmm9 1910 .byte 0x13,0x88,0x02,0x00 # vmovaps 0x20(rsp),xmm8 1911 .byte 0x0e,0x78,0x01,0x00 # vmovaps 0x10(rsp),xmm7 1912 .byte 0x09,0x68,0x00,0x00 # vmovaps 0x00(rsp),xmm6 1913 .byte 0x04,0x01,0x15,0x00 # sub rsp,0xa8 1914 .byte 0x00,0xb3,0x00,0x00 # set_frame r11 |
1852___ 1853} 1854 1855foreach (split("\n",$code)) { 1856 s/\`([^\`]*)\`/eval($1)/ge; 1857 1858 s/\b(sh[rl]d?\s+\$)(-?[0-9]+)/$1.$2%64/ge or 1859 --- 39 unchanged lines hidden --- | 1915___ 1916} 1917 1918foreach (split("\n",$code)) { 1919 s/\`([^\`]*)\`/eval($1)/ge; 1920 1921 s/\b(sh[rl]d?\s+\$)(-?[0-9]+)/$1.$2%64/ge or 1922 --- 39 unchanged lines hidden --- |