rsaz-avx2.pl: 290207 (deleted) → 296279 (added)
1#!/usr/bin/env perl
2
3##############################################################################
4# #
5# Copyright (c) 2012, Intel Corporation #
6# #
7# All rights reserved. #
8# #

--- 429 unchanged lines hidden ---

438 jnz .LOOP_SQR_1024
439___
440$ZERO = $ACC9;
441$TEMP0 = $B1;
442$TEMP2 = $B2;
443$TEMP3 = $Y1;
444$TEMP4 = $Y2;
445$code.=<<___;
446 #we need to fix indexes 32-39 to avoid overflow
446 # we need to fix indices 32-39 to avoid overflow
447 vmovdqu 32*8(%rsp), $ACC8 # 32*8-192($tp0),
448 vmovdqu 32*9(%rsp), $ACC1 # 32*9-192($tp0)
449 vmovdqu 32*10(%rsp), $ACC2 # 32*10-192($tp0)
450 lea 192(%rsp), $tp0 # 64+128=192
451
452 vpsrlq \$29, $ACC8, $TEMP1
453 vpand $AND_MASK, $ACC8, $ACC8
454 vpsrlq \$29, $ACC1, $TEMP2
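The vpsrlq \$29 / vpand $AND_MASK pairs above are the carry-normalization step for the redundant 29-bit limb representation used throughout this file ($AND_MASK comes from .Land_mask, 0x1fffffff, defined in the data section below). A minimal scalar sketch in Perl of what one pass does; the name normalize_limbs is illustrative, not part of the generated code:

# Scalar view of the vector normalization above: move each limb's
# bits above bit 29 into the next limb, then truncate to 29 bits.
sub normalize_limbs {
    my @limb = @_;
    for my $i (0 .. $#limb - 1) {
        $limb[$i + 1] += $limb[$i] >> 29;    # cf. vpsrlq \$29
        $limb[$i]     &= 0x1fffffff;         # cf. vpand $AND_MASK
    }
    return @limb;
}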

--- 1132 unchanged lines hidden ---

1587 vzeroupper
1588 ret
1589.size rsaz_1024_scatter5_avx2,.-rsaz_1024_scatter5_avx2
1590
1591.globl rsaz_1024_gather5_avx2
1592.type rsaz_1024_gather5_avx2,\@abi-omnipotent
1593.align 32
1594rsaz_1024_gather5_avx2:
1595 vzeroupper
1596 mov %rsp,%r11
1595___
1596$code.=<<___ if ($win64);
1597 lea -0x88(%rsp),%rax
1597___
1598$code.=<<___ if ($win64);
1599 lea -0x88(%rsp),%rax
1598 vzeroupper
1599.LSEH_begin_rsaz_1024_gather5:
1600	# I can't trust the assembler to use a specific encoding :-(
1600.LSEH_begin_rsaz_1024_gather5:
1601	# I can't trust the assembler to use a specific encoding :-(
1601 .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax),%rsp
1602 .byte 0xc5,0xf8,0x29,0x70,0xe0 #vmovaps %xmm6,-0x20(%rax)
1603 .byte 0xc5,0xf8,0x29,0x78,0xf0 #vmovaps %xmm7,-0x10(%rax)
1604 .byte 0xc5,0x78,0x29,0x40,0x00 #vmovaps %xmm8,0(%rax)
1605 .byte 0xc5,0x78,0x29,0x48,0x10 #vmovaps %xmm9,0x10(%rax)
1606 .byte 0xc5,0x78,0x29,0x50,0x20 #vmovaps %xmm10,0x20(%rax)
1607 .byte 0xc5,0x78,0x29,0x58,0x30 #vmovaps %xmm11,0x30(%rax)
1608 .byte 0xc5,0x78,0x29,0x60,0x40 #vmovaps %xmm12,0x40(%rax)
1609 .byte 0xc5,0x78,0x29,0x68,0x50 #vmovaps %xmm13,0x50(%rax)
1610 .byte 0xc5,0x78,0x29,0x70,0x60 #vmovaps %xmm14,0x60(%rax)
1611 .byte 0xc5,0x78,0x29,0x78,0x70 #vmovaps %xmm15,0x70(%rax)
1602 .byte 0x48,0x8d,0x60,0xe0 # lea -0x20(%rax),%rsp
1603 .byte 0xc5,0xf8,0x29,0x70,0xe0 # vmovaps %xmm6,-0x20(%rax)
1604 .byte 0xc5,0xf8,0x29,0x78,0xf0 # vmovaps %xmm7,-0x10(%rax)
1605 .byte 0xc5,0x78,0x29,0x40,0x00 # vmovaps %xmm8,0(%rax)
1606 .byte 0xc5,0x78,0x29,0x48,0x10 # vmovaps %xmm9,0x10(%rax)
1607 .byte 0xc5,0x78,0x29,0x50,0x20 # vmovaps %xmm10,0x20(%rax)
1608 .byte 0xc5,0x78,0x29,0x58,0x30 # vmovaps %xmm11,0x30(%rax)
1609 .byte 0xc5,0x78,0x29,0x60,0x40 # vmovaps %xmm12,0x40(%rax)
1610 .byte 0xc5,0x78,0x29,0x68,0x50 # vmovaps %xmm13,0x50(%rax)
1611 .byte 0xc5,0x78,0x29,0x70,0x60 # vmovaps %xmm14,0x60(%rax)
1612 .byte 0xc5,0x78,0x29,0x78,0x70 # vmovaps %xmm15,0x70(%rax)
1612___
1613$code.=<<___;
1613___
1614$code.=<<___;
1614 lea .Lgather_table(%rip),%r11
1615 mov $power,%eax
1616 and \$3,$power
1617 shr \$2,%eax # cache line number
1618 shl \$4,$power # offset within cache line
1615 lea -0x100(%rsp),%rsp
1616 and \$-32, %rsp
1617 lea .Linc(%rip), %r10
1618 lea -128(%rsp),%rax # control u-op density
1619
1619
1620 vmovdqu -32(%r11),%ymm7 # .Lgather_permd
1621 vpbroadcastb 8(%r11,%rax), %xmm8
1622 vpbroadcastb 7(%r11,%rax), %xmm9
1623 vpbroadcastb 6(%r11,%rax), %xmm10
1624 vpbroadcastb 5(%r11,%rax), %xmm11
1625 vpbroadcastb 4(%r11,%rax), %xmm12
1626 vpbroadcastb 3(%r11,%rax), %xmm13
1627 vpbroadcastb 2(%r11,%rax), %xmm14
1628 vpbroadcastb 1(%r11,%rax), %xmm15
1620 vmovd $power, %xmm4
1621 vmovdqa (%r10),%ymm0
1622 vmovdqa 32(%r10),%ymm1
1623 vmovdqa 64(%r10),%ymm5
1624 vpbroadcastd %xmm4,%ymm4
1629
1625
1630 lea 64($inp,$power),$inp
1631 mov \$64,%r11 # size optimization
1632 mov \$9,%eax
1633 jmp .Loop_gather_1024
1626 vpaddd %ymm5, %ymm0, %ymm2
1627 vpcmpeqd %ymm4, %ymm0, %ymm0
1628 vpaddd %ymm5, %ymm1, %ymm3
1629 vpcmpeqd %ymm4, %ymm1, %ymm1
1630 vmovdqa %ymm0, 32*0+128(%rax)
1631 vpaddd %ymm5, %ymm2, %ymm0
1632 vpcmpeqd %ymm4, %ymm2, %ymm2
1633 vmovdqa %ymm1, 32*1+128(%rax)
1634 vpaddd %ymm5, %ymm3, %ymm1
1635 vpcmpeqd %ymm4, %ymm3, %ymm3
1636 vmovdqa %ymm2, 32*2+128(%rax)
1637 vpaddd %ymm5, %ymm0, %ymm2
1638 vpcmpeqd %ymm4, %ymm0, %ymm0
1639 vmovdqa %ymm3, 32*3+128(%rax)
1640 vpaddd %ymm5, %ymm1, %ymm3
1641 vpcmpeqd %ymm4, %ymm1, %ymm1
1642 vmovdqa %ymm0, 32*4+128(%rax)
1643 vpaddd %ymm5, %ymm2, %ymm8
1644 vpcmpeqd %ymm4, %ymm2, %ymm2
1645 vmovdqa %ymm1, 32*5+128(%rax)
1646 vpaddd %ymm5, %ymm3, %ymm9
1647 vpcmpeqd %ymm4, %ymm3, %ymm3
1648 vmovdqa %ymm2, 32*6+128(%rax)
1649 vpaddd %ymm5, %ymm8, %ymm10
1650 vpcmpeqd %ymm4, %ymm8, %ymm8
1651 vmovdqa %ymm3, 32*7+128(%rax)
1652 vpaddd %ymm5, %ymm9, %ymm11
1653 vpcmpeqd %ymm4, %ymm9, %ymm9
1654 vpaddd %ymm5, %ymm10, %ymm12
1655 vpcmpeqd %ymm4, %ymm10, %ymm10
1656 vpaddd %ymm5, %ymm11, %ymm13
1657 vpcmpeqd %ymm4, %ymm11, %ymm11
1658 vpaddd %ymm5, %ymm12, %ymm14
1659 vpcmpeqd %ymm4, %ymm12, %ymm12
1660 vpaddd %ymm5, %ymm13, %ymm15
1661 vpcmpeqd %ymm4, %ymm13, %ymm13
1662 vpcmpeqd %ymm4, %ymm14, %ymm14
1663 vpcmpeqd %ymm4, %ymm15, %ymm15
1634
1664
1635.align 32
1665 vmovdqa -32(%r10),%ymm7 # .Lgather_permd
1666 lea 128($inp), $inp
1667 mov \$9,$power
1668
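The added prologue above is the core of the rewrite: instead of computing a table address from $power, it broadcasts $power into every dword lane (vmovd/vpbroadcastd), loads running index counters from .Linc, and uses vpcmpeqd to precompute sixteen 256-bit masks (eight spilled to the stack at 128(%rax), eight kept in %ymm8-%ymm15) in which only the lanes belonging to the requested entry are all-ones. A scalar Perl sketch of the idea; the values are illustrative:

# One-hot mask generation, scalar view: the 5-bit window selects
# one of 32 table entries, and only that entry's mask is all-ones.
my $power = 17;                # example window index, 0 .. 31
my @mask;
for my $i (0 .. 31) {
    $mask[$i] = ($i == $power) ? 0xffffffff : 0;   # cf. vpcmpeqd
}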
1636.Loop_gather_1024:
1669.Loop_gather_1024:
1637 vpand -64($inp), %xmm8,%xmm0
1638 vpand ($inp), %xmm9,%xmm1
1639 vpand 64($inp), %xmm10,%xmm2
1640 vpand ($inp,%r11,2), %xmm11,%xmm3
1641 vpor %xmm0,%xmm1,%xmm1
1642 vpand 64($inp,%r11,2), %xmm12,%xmm4
1643 vpor %xmm2,%xmm3,%xmm3
1644 vpand ($inp,%r11,4), %xmm13,%xmm5
1645 vpor %xmm1,%xmm3,%xmm3
1646 vpand 64($inp,%r11,4), %xmm14,%xmm6
1647 vpor %xmm4,%xmm5,%xmm5
1648 vpand -128($inp,%r11,8), %xmm15,%xmm2
1649 lea ($inp,%r11,8),$inp
1650 vpor %xmm3,%xmm5,%xmm5
1651 vpor %xmm2,%xmm6,%xmm6
1652 vpor %xmm5,%xmm6,%xmm6
1653 vpermd %ymm6,%ymm7,%ymm6
1654 vmovdqu %ymm6,($out)
1670 vmovdqa 32*0-128($inp), %ymm0
1671 vmovdqa 32*1-128($inp), %ymm1
1672 vmovdqa 32*2-128($inp), %ymm2
1673 vmovdqa 32*3-128($inp), %ymm3
1674 vpand 32*0+128(%rax), %ymm0, %ymm0
1675 vpand 32*1+128(%rax), %ymm1, %ymm1
1676 vpand 32*2+128(%rax), %ymm2, %ymm2
1677 vpor %ymm0, %ymm1, %ymm4
1678 vpand 32*3+128(%rax), %ymm3, %ymm3
1679 vmovdqa 32*4-128($inp), %ymm0
1680 vmovdqa 32*5-128($inp), %ymm1
1681 vpor %ymm2, %ymm3, %ymm5
1682 vmovdqa 32*6-128($inp), %ymm2
1683 vmovdqa 32*7-128($inp), %ymm3
1684 vpand 32*4+128(%rax), %ymm0, %ymm0
1685 vpand 32*5+128(%rax), %ymm1, %ymm1
1686 vpand 32*6+128(%rax), %ymm2, %ymm2
1687 vpor %ymm0, %ymm4, %ymm4
1688 vpand 32*7+128(%rax), %ymm3, %ymm3
1689 vpand 32*8-128($inp), %ymm8, %ymm0
1690 vpor %ymm1, %ymm5, %ymm5
1691 vpand 32*9-128($inp), %ymm9, %ymm1
1692 vpor %ymm2, %ymm4, %ymm4
1693 vpand 32*10-128($inp),%ymm10, %ymm2
1694 vpor %ymm3, %ymm5, %ymm5
1695 vpand 32*11-128($inp),%ymm11, %ymm3
1696 vpor %ymm0, %ymm4, %ymm4
1697 vpand 32*12-128($inp),%ymm12, %ymm0
1698 vpor %ymm1, %ymm5, %ymm5
1699 vpand 32*13-128($inp),%ymm13, %ymm1
1700 vpor %ymm2, %ymm4, %ymm4
1701 vpand 32*14-128($inp),%ymm14, %ymm2
1702 vpor %ymm3, %ymm5, %ymm5
1703 vpand 32*15-128($inp),%ymm15, %ymm3
1704 lea 32*16($inp), $inp
1705 vpor %ymm0, %ymm4, %ymm4
1706 vpor %ymm1, %ymm5, %ymm5
1707 vpor %ymm2, %ymm4, %ymm4
1708 vpor %ymm3, %ymm5, %ymm5
1709
1710 vpor %ymm5, %ymm4, %ymm4
1711 vextracti128 \$1, %ymm4, %xmm5 # upper half is cleared
1712 vpor %xmm4, %xmm5, %xmm5
1713 vpermd %ymm5,%ymm7,%ymm5
1714 vmovdqu %ymm5,($out)
1655 lea 32($out),$out
1715 lea 32($out),$out
1656 dec %eax
1716 dec $power
1657 jnz .Loop_gather_1024
1658
1659 vpxor %ymm0,%ymm0,%ymm0
1660 vmovdqu %ymm0,($out)
1661 vzeroupper
1662___
1663$code.=<<___ if ($win64);
1717 jnz .Loop_gather_1024
1718
1719 vpxor %ymm0,%ymm0,%ymm0
1720 vmovdqu %ymm0,($out)
1721 vzeroupper
1722___
1723$code.=<<___ if ($win64);
1664 movaps (%rsp),%xmm6
1665 movaps 0x10(%rsp),%xmm7
1666 movaps 0x20(%rsp),%xmm8
1667 movaps 0x30(%rsp),%xmm9
1668 movaps 0x40(%rsp),%xmm10
1669 movaps 0x50(%rsp),%xmm11
1670 movaps 0x60(%rsp),%xmm12
1671 movaps 0x70(%rsp),%xmm13
1672 movaps 0x80(%rsp),%xmm14
1673 movaps 0x90(%rsp),%xmm15
1674 lea 0xa8(%rsp),%rsp
1724 movaps -0xa8(%r11),%xmm6
1725 movaps -0x98(%r11),%xmm7
1726 movaps -0x88(%r11),%xmm8
1727 movaps -0x78(%r11),%xmm9
1728 movaps -0x68(%r11),%xmm10
1729 movaps -0x58(%r11),%xmm11
1730 movaps -0x48(%r11),%xmm12
1731 movaps -0x38(%r11),%xmm13
1732 movaps -0x28(%r11),%xmm14
1733 movaps -0x18(%r11),%xmm15
1675.LSEH_end_rsaz_1024_gather5:
1676___
1677$code.=<<___;
1734.LSEH_end_rsaz_1024_gather5:
1735___
1736$code.=<<___;
1737 lea (%r11),%rsp
1678 ret
1679.size rsaz_1024_gather5_avx2,.-rsaz_1024_gather5_avx2
1680___
1681}
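With the masks precomputed, the new .Loop_gather_1024 reads all sixteen 256-bit words of every table row (covering all 32 entries) on each iteration and combines them with vpand/vpor, so the memory access pattern no longer depends on the secret window value. The deleted loop instead derived its load addresses from $power (`shr \$2,%eax # cache line number`, `shl \$4,$power # offset within cache line`), an access pattern that can leak the window through cache-timing side channels. A scalar Perl sketch of the constant-time selection; gather_ct is an illustrative name:

# Constant-time gather, scalar view: touch every entry, keep only
# the one whose precomputed mask is all-ones.
sub gather_ct {
    my ($table, $mask) = @_;        # 32 entries, one-hot @$mask
    my $acc = 0;
    for my $i (0 .. 31) {
        $acc |= $table->[$i] & $mask->[$i];   # cf. vpand / vpor
    }
    return $acc;
}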
1682
1683$code.=<<___;
1684.extern OPENSSL_ia32cap_P
1685.globl rsaz_avx2_eligible

--- 17 unchanged lines hidden ---

1703
1704.align 64
1705.Land_mask:
1706 .quad 0x1fffffff,0x1fffffff,0x1fffffff,-1
1707.Lscatter_permd:
1708 .long 0,2,4,6,7,7,7,7
1709.Lgather_permd:
1710 .long 0,7,1,7,2,7,3,7
1711.Lgather_table:
1712 .byte 0,0,0,0,0,0,0,0, 0xff,0,0,0,0,0,0,0
1771.Linc:
1772 .long 0,0,0,0, 1,1,1,1
1773 .long 2,2,2,2, 3,3,3,3
1774 .long 4,4,4,4, 4,4,4,4
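The new .Linc table replaces the old .Lgather_table of byte masks: its first two rows seed two counter vectors covering index pairs (0,1) and (2,3), and the third row is the per-step increment, so each vpaddd in the prologue advances a counter by four entries and the two chains together enumerate all 32 indices. A short Perl illustration:

# How the .Linc rows drive the index counters (illustrative):
my @ctr = (0,0,0,0, 1,1,1,1);               # first .Linc row
my @inc = (4,4,4,4, 4,4,4,4);               # third .Linc row
@ctr = map { $ctr[$_] + $inc[$_] } 0 .. 7;  # cf. vpaddd: now covers (4,5)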
1713.align 64
1714___
1715
1716if ($win64) {
1717$rec="%rcx";
1718$frame="%rdx";
1719$context="%r8";
1720$disp="%r9";

--- 111 unchanged lines hidden ---

1832 .byte 9,0,0,0
1833 .rva rsaz_se_handler
1834 .rva .Lsqr_1024_body,.Lsqr_1024_epilogue
1835.LSEH_info_rsaz_1024_mul_avx2:
1836 .byte 9,0,0,0
1837 .rva rsaz_se_handler
1838 .rva .Lmul_1024_body,.Lmul_1024_epilogue
1839.LSEH_info_rsaz_1024_gather5:
1840 .byte 0x01,0x33,0x16,0x00
1841 .byte 0x36,0xf8,0x09,0x00 #vmovaps 0x90(rsp),xmm15
1842 .byte 0x31,0xe8,0x08,0x00 #vmovaps 0x80(rsp),xmm14
1843 .byte 0x2c,0xd8,0x07,0x00 #vmovaps 0x70(rsp),xmm13
1844 .byte 0x27,0xc8,0x06,0x00 #vmovaps 0x60(rsp),xmm12
1845 .byte 0x22,0xb8,0x05,0x00 #vmovaps 0x50(rsp),xmm11
1846 .byte 0x1d,0xa8,0x04,0x00 #vmovaps 0x40(rsp),xmm10
1847 .byte 0x18,0x98,0x03,0x00 #vmovaps 0x30(rsp),xmm9
1848 .byte 0x13,0x88,0x02,0x00 #vmovaps 0x20(rsp),xmm8
1849 .byte 0x0e,0x78,0x01,0x00 #vmovaps 0x10(rsp),xmm7
1850 .byte 0x09,0x68,0x00,0x00 #vmovaps 0x00(rsp),xmm6
1851 .byte 0x04,0x01,0x15,0x00 #sub rsp,0xa8
1902 .byte 0x01,0x36,0x17,0x0b
1903 .byte 0x36,0xf8,0x09,0x00 # vmovaps 0x90(rsp),xmm15
1904 .byte 0x31,0xe8,0x08,0x00 # vmovaps 0x80(rsp),xmm14
1905 .byte 0x2c,0xd8,0x07,0x00 # vmovaps 0x70(rsp),xmm13
1906 .byte 0x27,0xc8,0x06,0x00 # vmovaps 0x60(rsp),xmm12
1907 .byte 0x22,0xb8,0x05,0x00 # vmovaps 0x50(rsp),xmm11
1908 .byte 0x1d,0xa8,0x04,0x00 # vmovaps 0x40(rsp),xmm10
1909 .byte 0x18,0x98,0x03,0x00 # vmovaps 0x30(rsp),xmm9
1910 .byte 0x13,0x88,0x02,0x00 # vmovaps 0x20(rsp),xmm8
1911 .byte 0x0e,0x78,0x01,0x00 # vmovaps 0x10(rsp),xmm7
1912 .byte 0x09,0x68,0x00,0x00 # vmovaps 0x00(rsp),xmm6
1913 .byte 0x04,0x01,0x15,0x00 # sub rsp,0xa8
1914 .byte 0x00,0xb3,0x00,0x00 # set_frame r11
1852___
1853}
1854
1855foreach (split("\n",$code)) {
1856 s/\`([^\`]*)\`/eval($1)/ge;
1857
1858 s/\b(sh[rl]d?\s+\$)(-?[0-9]+)/$1.$2%64/ge or
1859

--- 39 unchanged lines hidden ---