crypto/intel/sha1edp.s

64 	Let W0 = {W[i] W[i+1] W[i+2] W[i+3]} be the current W-vector to be computed, W4 = {W[i-4] W[i-3] W[i-2] W[i-1]} be the previous vector, and so on
67 	1. W0 = rotate_left(left_shift(W4,32) ^ W8 ^ left_shift(concatenate(W16,W12),64) ^ W16,1);
69 	3. W0 += {K,K,K,K}
73 	1. W0 = rotate_left(left_shift(concatenate(W8,W4),64) ^ W16 ^ W28 ^ W32, 2);
76 	1. In total, we need eight 16-byte registers or memory locations for W0,W4,...,W28. W0 and W32 can share the same register or memory location.
77 	2. The registers are used as a circular buffer. For example, we start with W28,W24,...,W0 (with W0 indicating the most recent 16-byte vector)
78 		i=0, W28,W24,...,W0
121 // symbolizing registers or stack memory with algorithmic variables	W0,W4,...,W28 + W_TMP, W_TMP2, and XMM_SHUFB_BSWAP for code with ssse3 support
125 #define W0  	%xmm2
213 	.macro	W_PRECALC_00_15_1_ssse3			// input argument $0 : current 16-bytes in the circular buffer, one of W0,W4,W8,...,W28
267 	.macro	W_PRECALC_00_15_1_nossse3 		// input argument $0 : current 16-bytes in the circular buffer, one of W0,W4,W8,...,W28
530 	// i=0 	: W28,W24,W20,W16,W12,W8,W4,W0
532 	W_PRECALC_00_15_1	W0					// convert W_TMP to big-endian, and save W0 = W_TMP
533 	W_PRECALC_00_15_2						// W_TMP = W0 + K
534 	W_PRECALC_00_15_3	3					// (sp) = W_TMP = W0 + K
536 	// i=4	: W24,W20,W16,W12,W8,W4,W0,W28
542 	// i=8  : W20,W16,W12,W8,W4,W0,W28,W24
548 	// i=12 : W16,W12,W8,W4,W0,W28,W24,W20
559 	// i=16 : W12,W8,W4,W0,W28,W24,W20,W16
560 	W_PRECALC_16_31_0	W0,W28,W24,W20,W16
562 	W_PRECALC_16_31_1	W0,W16
569 	// i=20 : W8,W4,W0,W28,W24,W20,W16,W12
579 	// i=24 : W4,W0,W28,W24,W20,W16,W12,W8
589 	// i=28 : W0,W28,W24,W20,W16,W12,W8,W4
599 	// i=32 : W28,W24,W20,W16,W12,W8,W4,W0
600 	W_PRECALC_32_79_0	W28,W8,W4,W0
602 	W_PRECALC_32_79_1	W16,W0
604 	W_PRECALC_32_79_2	W0
606 	W_PRECALC_32_79_3	W0,18,16
611 	// i=36 : W24,W20,W16,W12,W8,W4,W0,W28
613 	W_PRECALC_32_79_0	W24,W4,W0,W28
615 	W_PRECALC_32_79_0_i386	W24,W4,W0,W28
629 	// i=40 : W20,W16,W12,W8,W4,W0,W28,W24
633 	W_PRECALC_32_79_0	W20,W0,W28,W24
635 	W_PRECALC_32_79_0_i386	W20,W0,W28,W24
649 	// i=44 : W16,W12,W8,W4,W0,W28,W24,W20
659 	// i=48 : W12,W8,W4,W0,W28,W24,W20,W16
662 	W_PRECALC_32_79_1	W0,W16
669 	// i=52 : W8,W4,W0,W28,W24,W20,W16,W12
681 	// i=56 : W4,W0,W28,W24,W20,W16,W12,W8
691 	// i=60 : W0,W28,W24,W20,W16,W12,W8,W4
694 	W_PRECALC_32_79_0	W0,W12,W8,W4
703 	// i=64 : W28,W24,W20,W16,W12,W8,W4,W0
704 	W_PRECALC_32_79_0	W28,W8,W4,W0
706 	W_PRECALC_32_79_1	W16,W0
708 	W_PRECALC_32_79_2	W0
710 	W_PRECALC_32_79_3	W0,50,K_XMM
713 	// i=68 : W24,W20,W16,W12,W8,W4,W0,W28
715 	W_PRECALC_32_79_0	W24,W4,W0,W28
717 	W_PRECALC_32_79_0_i386	W24,W4,W0,W28
731 	// i=72 : W20,W16,W12,W8,W4,W0,W28,W24
733 	W_PRECALC_32_79_0	W20,W0,W28,W24
735 	W_PRECALC_32_79_0_i386	W20,W0,W28,W24
751 	// i=76 : W16,W12,W8,W4,W0,W28,W24,W20
764 	// i=0  : W28,W24,W20,W16,W12,W8,W4,W0
767 	W_PRECALC_00_15_1	W0					// convert W_TMP to big-endian, and save W0 = W_TMP
769 	W_PRECALC_00_15_2						// W_TMP = W0 + K
771 	W_PRECALC_00_15_3	3					// (sp) = W_TMP = W0 + K
774 	// i=4  : W24,W20,W16,W12,W8,W4,W0,W28
784 	// i=8  : W20,W16,W12,W8,W4,W0,W28,W24
794 	// i=12 : W16,W12,W8,W4,W0,W28,W24,W20
835 	// i=0 	: W28,W24,W20,W16,W12,W8,W4,W0
837 	W_PRECALC_00_15_1	W0					// convert W_TMP to big-endian, and save W0 = W_TMP
838 	W_PRECALC_00_15_2						// W_TMP = W0 + K
839 	W_PRECALC_00_15_3	3					// (sp) = W_TMP = W0 + K
841 	// i=4	: W24,W20,W16,W12,W8,W4,W0,W28
847 	// i=8  : W20,W16,W12,W8,W4,W0,W28,W24
853 	// i=12 : W16,W12,W8,W4,W0,W28,W24,W20
864 	// circular buffer : W12,W8,W4,W0,W28,W24,W20,W16
865 	W_PRECALC_16_31_0	W0,W28,W24,W20,W16
867 	W_PRECALC_16_31_1	W0,W16
875 	// W8,W4,W0,W28,W24,W20,W16,W12
887 	// W4,W0,W28,W24,W20,W16,W12,W8
899 	// W0,W28,W24,W20,W16,W12,W8,W4
911 	// W28,W24,W20,W16,W12,W8,W4,W0
912 	W_PRECALC_32_79_0	W28,W8,W4,W0
914 	W_PRECALC_32_79_1	W16,W0
916 	W_PRECALC_32_79_2	W0
918 	W_PRECALC_32_79_3	W0,18,16
922 	// W24,W20,W16,W12,W8,W4,W0,W28
924 	W_PRECALC_32_79_0	W24,W4,W0,W28
926 	W_PRECALC_32_79_0_i386	W24,W4,W0,W28
943 	// W20,W16,W12,W8,W4,W0,W28,W24
945 	W_PRECALC_32_79_0	W20,W0,W28,W24
947 	W_PRECALC_32_79_0_i386	W20,W0,W28,W24
962 	// W16,W12,W8,W4,W0,W28,W24,W20
973 	// W12,W8,W4,W0,W28,W24,W20,W16
976 	W_PRECALC_32_79_1	W0,W16
984 	// W8,W4,W0,W28,W24,W20,W16,W12
995 	// W4,W0,W28,W24,W20,W16,W12,W8
1008 	// W0,W28,W24,W20,W16,W12,W8,W4
1009 	W_PRECALC_32_79_0	W0,W12,W8,W4
1019 	// W28,W24,W20,W16,W12,W8,W4,W0
1020 	W_PRECALC_32_79_0	W28,W8,W4,W0
1022 	W_PRECALC_32_79_1	W16,W0
1024 	W_PRECALC_32_79_2	W0
1026 	W_PRECALC_32_79_3	W0,50,K_XMM
1030 	// W24,W20,W16,W12,W8,W4,W0,W28
1032 	W_PRECALC_32_79_0	W24,W4,W0,W28
1034 	W_PRECALC_32_79_0_i386	W24,W4,W0,W28
1049 	// W20,W16,W12,W8,W4,W0,W28,W24
1051 	W_PRECALC_32_79_0	W20,W0,W28,W24
1053 	W_PRECALC_32_79_0_i386	W20,W0,W28,W24
1070 	// W16,W12,W8,W4,W0,W28,W24,W20
1083 	// i=0  : W28,W24,W20,W16,W12,W8,W4,W0
1086 	W_PRECALC_00_15_1	W0					// convert W_TMP to big-endian, and save W0 = W_TMP
1088 	W_PRECALC_00_15_2						// W_TMP = W0 + K
1090 	W_PRECALC_00_15_3	3					// (sp) = W_TMP = W0 + K
1093 	// i=4  : W24,W20,W16,W12,W8,W4,W0,W28
1103 	// i=8  : W20,W16,W12,W8,W4,W0,W28,W24
1113 	// i=12 : W16,W12,W8,W4,W0,W28,W24,W20