Lines Matching refs:W0

64 	Let W0 = {W[i] W[i+1] W[i+2] W[i+3]} be the current W-vector to be computed, W4 = {W[i-4] W[i-3] W[i-2] W[i-1]} be the previous vector, and so on
67 1. W0 = rotate_left(left_shift(W4,32) ^ W8 ^ left_shift(concatenate(W16,W12),64) ^ W16,1);
69 3. W0 += {K,K,K,K}
73 1. W0 = rotate_left(left_shift(concatenate(W8,W4),64) ^ W16 ^ W28 ^ W32, 2);
76 1. In total, we need eight 16-byte registers or memory locations for W0,W4,...,W28. W0 and W32 can be the same register or memory location.
77 2. The registers are used in a circular buffering mode. For example, we start with W28,W24,...,W0 (with W0 indicating the most recent 16 bytes)
78 i=0, W28,W24,...,W0
121 // symbolizing registers or stack memory with algorithmic variables W0,W4,...,W28 + W_TMP, W_TMP2, and XMM_SHUFB_BSWAP for code with ssse3 support
125 #define W0 %xmm2
213 .macro W_PRECALC_00_15_1_ssse3 // input argument $0 : current 16-bytes in the circular buffer, one of W0,W4,W8,...,W28
267 .macro W_PRECALC_00_15_1_nossse3 // input argument $0 : current 16-bytes in the circular buffer, one of W0,W4,W8,...,W28
530 // i=0 : W28,W24,W20,W16,W12,W8,W4,W0
532 W_PRECALC_00_15_1 W0 // convert W_TMP to big-endian, and save W0 = W_TMP
533 W_PRECALC_00_15_2 // W_TMP = W0 + K
534 W_PRECALC_00_15_3 3 // (sp) = W_TMP = W0 + K
536 // i=4 : W24,W20,W16,W12,W8,W4,W0,W28
542 // i=8 : W20,W16,W12,W8,W4,W0,W28,W24
548 // i=12 : W16,W12,W8,W4,W0,W28,W24,W20
559 // i=16 : W12,W8,W4,W0,W28,W24,W20,W16
560 W_PRECALC_16_31_0 W0,W28,W24,W20,W16
562 W_PRECALC_16_31_1 W0,W16
569 // i=20 : W8,W4,W0,W28,W24,W20,W16,W12
579 // i=24 : W4,W0,W28,W24,W20,W16,W12,W8
589 // i=28 : W0,W28,W24,W20,W16,W12,W8,W4
599 // i=32 : W28,W24,W20,W16,W12,W8,W4,W0
600 W_PRECALC_32_79_0 W28,W8,W4,W0
602 W_PRECALC_32_79_1 W16,W0
604 W_PRECALC_32_79_2 W0
606 W_PRECALC_32_79_3 W0,18,16
611 // i=36 : W24,W20,W16,W12,W8,W4,W0,W28
613 W_PRECALC_32_79_0 W24,W4,W0,W28
615 W_PRECALC_32_79_0_i386 W24,W4,W0,W28
629 // i=40 : W20,W16,W12,W8,W4,W0,W28,W24
633 W_PRECALC_32_79_0 W20,W0,W28,W24
635 W_PRECALC_32_79_0_i386 W20,W0,W28,W24
649 // i=44 : W16,W12,W8,W4,W0,W28,W24,W20
659 // i=48 : W12,W8,W4,W0,W28,W24,W20,W16
662 W_PRECALC_32_79_1 W0,W16
669 // i=52 : W8,W4,W0,W28,W24,W20,W16,W12
681 // i=56 : W4,W0,W28,W24,W20,W16,W12,W8
691 // i=60 : W0,W28,W24,W20,W16,W12,W8,W4
694 W_PRECALC_32_79_0 W0,W12,W8,W4
703 // i=64 : W28,W24,W20,W16,W12,W8,W4,W0
704 W_PRECALC_32_79_0 W28,W8,W4,W0
706 W_PRECALC_32_79_1 W16,W0
708 W_PRECALC_32_79_2 W0
710 W_PRECALC_32_79_3 W0,50,K_XMM
713 // i=68 : W24,W20,W16,W12,W8,W4,W0,W28
715 W_PRECALC_32_79_0 W24,W4,W0,W28
717 W_PRECALC_32_79_0_i386 W24,W4,W0,W28
731 // i=72 : W20,W16,W12,W8,W4,W0,W28,W24
733 W_PRECALC_32_79_0 W20,W0,W28,W24
735 W_PRECALC_32_79_0_i386 W20,W0,W28,W24
751 // i=76 : W16,W12,W8,W4,W0,W28,W24,W20
764 // i=0 : W28,W24,W20,W16,W12,W8,W4,W0
767 W_PRECALC_00_15_1 W0 // convert W_TMP to big-endian, and save W0 = W_TMP
769 W_PRECALC_00_15_2 // W_TMP = W0 + K
771 W_PRECALC_00_15_3 3 // (sp) = W_TMP = W0 + K
774 // i=4 : W24,W20,W16,W12,W8,W4,W0,W28
784 // i=8 : W20,W16,W12,W8,W4,W0,W28,W24
794 // i=12 : W16,W12,W8,W4,W0,W28,W24,W20
835 // i=0 : W28,W24,W20,W16,W12,W8,W4,W0
837 W_PRECALC_00_15_1 W0 // convert W_TMP to big-endian, and save W0 = W_TMP
838 W_PRECALC_00_15_2 // W_TMP = W0 + K
839 W_PRECALC_00_15_3 3 // (sp) = W_TMP = W0 + K
841 // i=4 : W24,W20,W16,W12,W8,W4,W0,W28
847 // i=8 : W20,W16,W12,W8,W4,W0,W28,W24
853 // i=12 : W16,W12,W8,W4,W0,W28,W24,W20
864 // circular buffer : W12,W8,W4,W0,W28,W24,W20,W16
865 W_PRECALC_16_31_0 W0,W28,W24,W20,W16
867 W_PRECALC_16_31_1 W0,W16
875 // W8,W4,W0,W28,W24,W20,W16,W12
887 // W4,W0,W28,W24,W20,W16,W12,W8
899 // W0,W28,W24,W20,W16,W12,W8,W4
911 // W28,W24,W20,W16,W12,W8,W4,W0
912 W_PRECALC_32_79_0 W28,W8,W4,W0
914 W_PRECALC_32_79_1 W16,W0
916 W_PRECALC_32_79_2 W0
918 W_PRECALC_32_79_3 W0,18,16
922 // W24,W20,W16,W12,W8,W4,W0,W28
924 W_PRECALC_32_79_0 W24,W4,W0,W28
926 W_PRECALC_32_79_0_i386 W24,W4,W0,W28
943 // W20,W16,W12,W8,W4,W0,W28,W24
945 W_PRECALC_32_79_0 W20,W0,W28,W24
947 W_PRECALC_32_79_0_i386 W20,W0,W28,W24
962 // W16,W12,W8,W4,W0,W28,W24,W20
973 // W12,W8,W4,W0,W28,W24,W20,W16
976 W_PRECALC_32_79_1 W0,W16
984 // W8,W4,W0,W28,W24,W20,W16,W12
995 // W4,W0,W28,W24,W20,W16,W12,W8
1008 // W0,W28,W24,W20,W16,W12,W8,W4
1009 W_PRECALC_32_79_0 W0,W12,W8,W4
1019 // W28,W24,W20,W16,W12,W8,W4,W0
1020 W_PRECALC_32_79_0 W28,W8,W4,W0
1022 W_PRECALC_32_79_1 W16,W0
1024 W_PRECALC_32_79_2 W0
1026 W_PRECALC_32_79_3 W0,50,K_XMM
1030 // W24,W20,W16,W12,W8,W4,W0,W28
1032 W_PRECALC_32_79_0 W24,W4,W0,W28
1034 W_PRECALC_32_79_0_i386 W24,W4,W0,W28
1049 // W20,W16,W12,W8,W4,W0,W28,W24
1051 W_PRECALC_32_79_0 W20,W0,W28,W24
1053 W_PRECALC_32_79_0_i386 W20,W0,W28,W24
1070 // W16,W12,W8,W4,W0,W28,W24,W20
1083 // i=0 : W28,W24,W20,W16,W12,W8,W4,W0
1086 W_PRECALC_00_15_1 W0 // convert W_TMP to big-endian, and save W0 = W_TMP
1088 W_PRECALC_00_15_2 // W_TMP = W0 + K
1090 W_PRECALC_00_15_3 3 // (sp) = W_TMP = W0 + K
1093 // i=4 : W24,W20,W16,W12,W8,W4,W0,W28
1103 // i=8 : W20,W16,W12,W8,W4,W0,W28,W24
1113 // i=12 : W16,W12,W8,W4,W0,W28,W24,W20