Deleted Added
full compact
aes-586.pl (160814) aes-586.pl (162911)
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. Rights for redistribution and usage in source and binary
6# forms are granted according to the OpenSSL license.
7# ====================================================================
8#
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. Rights for redistribution and usage in source and binary
6# forms are granted according to the OpenSSL license.
7# ====================================================================
8#
9# Version 3.4.
9# Version 3.6.
10#
11# You might fail to appreciate this module performance from the first
12# try. If compared to "vanilla" linux-ia32-icc target, i.e. considered
13# to be *the* best Intel C compiler without -KPIC, performance appears
14# to be virtually identical... But try to re-configure with shared
15# library support... Aha! Intel compiler "suddenly" lags behind by 30%
16# [on P4, more on others]:-) And if compared to position-independent
17# code generated by GNU C, this code performs *more* than *twice* as

--- 43 unchanged lines hidden (view full) ---

61# implementations, Pentium suffered 30% penalty, PIII - 10%.
62#
63# Version 3.3 avoids L1 cache aliasing between stack frame and
64# S-boxes, and 3.4 also avoids it between key schedule and S-boxes.
65# The latter is achieved by copying the key schedule to a controlled
66# place on the stack. This unfortunately has a rather strong impact on
67# small block CBC performance: ~2x slower on a 16-byte block than 3.3.
68#
10#
11# You might fail to appreciate this module performance from the first
12# try. If compared to "vanilla" linux-ia32-icc target, i.e. considered
13# to be *the* best Intel C compiler without -KPIC, performance appears
14# to be virtually identical... But try to re-configure with shared
15# library support... Aha! Intel compiler "suddenly" lags behind by 30%
16# [on P4, more on others]:-) And if compared to position-independent
17# code generated by GNU C, this code performs *more* than *twice* as

--- 43 unchanged lines hidden (view full) ---

61# implementations, Pentium suffered 30% penalty, PIII - 10%.
62#
63# Version 3.3 avoids L1 cache aliasing between stack frame and
64# S-boxes, and 3.4 also avoids it between key schedule and S-boxes.
65# The latter is achieved by copying the key schedule to a controlled
66# place on the stack. This unfortunately has a rather strong impact on
67# small block CBC performance: ~2x slower on a 16-byte block than 3.3.
68#
69# Version 3.5 checks if there is L1 cache aliasing between the
70# user-supplied key schedule and S-boxes and abstains from copying the
71# former if there is none. This allows the end user to consciously retain
72# small block performance by aligning the key schedule in a specific manner.
73#
74# Version 3.6 compresses Td4 to 256 bytes and prefetches it in ECB.
75#
69# Current ECB performance numbers for 128-bit key in CPU cycles per
70# processed byte [measure commonly used by AES benchmarkers] are:
71#
72# small footprint fully unrolled
73# P4 24 22
74# AMD K8 20 19
75# PIII 25 23
76# Pentium 81 78

--- 423 unchanged lines hidden (view full) ---

500sub declast()
501{ my ($i,$td,@s)=@_;
502 my $tmp = $key;
503 my $out = $i==3?$s[0]:$acc;
504
505 if($i==3) { &mov ($key,&DWP(12,"esp")); }
506 else { &mov ($out,$s[0]); }
507 &and ($out,0xFF);
76# Current ECB performance numbers for 128-bit key in CPU cycles per
77# processed byte [measure commonly used by AES benchmarkers] are:
78#
79# small footprint fully unrolled
80# P4 24 22
81# AMD K8 20 19
82# PIII 25 23
83# Pentium 81 78

--- 423 unchanged lines hidden (view full) ---

507sub declast()
508{ my ($i,$td,@s)=@_;
509 my $tmp = $key;
510 my $out = $i==3?$s[0]:$acc;
511
512 if($i==3) { &mov ($key,&DWP(12,"esp")); }
513 else { &mov ($out,$s[0]); }
514 &and ($out,0xFF);
508 &mov ($out,&DWP(2048,$td,$out,4));
509 &and ($out,0x000000ff);
515 &movz ($out,&DWP(2048,$td,$out,1));
510
511 if ($i==3) { $tmp=$s[1]; }
512 &movz ($tmp,&HB($s[1]));
516
517 if ($i==3) { $tmp=$s[1]; }
518 &movz ($tmp,&HB($s[1]));
513 &mov ($tmp,&DWP(2048,$td,$tmp,4));
514 &and ($tmp,0x0000ff00);
519 &movz ($tmp,&DWP(2048,$td,$tmp,1));
520 &shl ($tmp,8);
515 &xor ($out,$tmp);
516
517 if ($i==3) { $tmp=$s[2]; &mov ($s[1],$acc); }
518 else { mov ($tmp,$s[2]); }
519 &shr ($tmp,16);
520 &and ($tmp,0xFF);
521 &xor ($out,$tmp);
522
523 if ($i==3) { $tmp=$s[2]; &mov ($s[1],$acc); }
524 else { mov ($tmp,$s[2]); }
525 &shr ($tmp,16);
526 &and ($tmp,0xFF);
521 &mov ($tmp,&DWP(2048,$td,$tmp,4));
522 &and ($tmp,0x00ff0000);
527 &movz ($tmp,&DWP(2048,$td,$tmp,1));
528 &shl ($tmp,16);
523 &xor ($out,$tmp);
524
525 if ($i==3) { $tmp=$s[3]; &mov ($s[2],&DWP(8,"esp")); }
526 else { &mov ($tmp,$s[3]); }
527 &shr ($tmp,24);
529 &xor ($out,$tmp);
530
531 if ($i==3) { $tmp=$s[3]; &mov ($s[2],&DWP(8,"esp")); }
532 else { &mov ($tmp,$s[3]); }
533 &shr ($tmp,24);
528 &mov ($tmp,&DWP(2048,$td,$tmp,4));
529 &and ($tmp,0xff000000);
534 &movz ($tmp,&DWP(2048,$td,$tmp,1));
535 &shl ($tmp,24);
530 &xor ($out,$tmp);
531 if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); }
532 if ($i==3) { &mov ($s[3],&DWP(4,"esp")); }
533}
534
535&public_label("AES_Td");
536&function_begin_B("_x86_AES_decrypt");
537 # note that caller is expected to allocate stack frame for me!

--- 144 unchanged lines hidden (view full) ---

682 &_data_word(0xee27a9ce, 0x35c961b7, 0xede51ce1, 0x3cb1477a);
683 &_data_word(0x59dfd29c, 0x3f73f255, 0x79ce1418, 0xbf37c773);
684 &_data_word(0xeacdf753, 0x5baafd5f, 0x146f3ddf, 0x86db4478);
685 &_data_word(0x81f3afca, 0x3ec468b9, 0x2c342438, 0x5f40a3c2);
686 &_data_word(0x72c31d16, 0x0c25e2bc, 0x8b493c28, 0x41950dff);
687 &_data_word(0x7101a839, 0xdeb30c08, 0x9ce4b4d8, 0x90c15664);
688 &_data_word(0x6184cb7b, 0x70b632d5, 0x745c6c48, 0x4257b8d0);
689#Td4:
536 &xor ($out,$tmp);
537 if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); }
538 if ($i==3) { &mov ($s[3],&DWP(4,"esp")); }
539}
540
541&public_label("AES_Td");
542&function_begin_B("_x86_AES_decrypt");
543 # note that caller is expected to allocate stack frame for me!

--- 144 unchanged lines hidden (view full) ---

688 &_data_word(0xee27a9ce, 0x35c961b7, 0xede51ce1, 0x3cb1477a);
689 &_data_word(0x59dfd29c, 0x3f73f255, 0x79ce1418, 0xbf37c773);
690 &_data_word(0xeacdf753, 0x5baafd5f, 0x146f3ddf, 0x86db4478);
691 &_data_word(0x81f3afca, 0x3ec468b9, 0x2c342438, 0x5f40a3c2);
692 &_data_word(0x72c31d16, 0x0c25e2bc, 0x8b493c28, 0x41950dff);
693 &_data_word(0x7101a839, 0xdeb30c08, 0x9ce4b4d8, 0x90c15664);
694 &_data_word(0x6184cb7b, 0x70b632d5, 0x745c6c48, 0x4257b8d0);
695#Td4:
690 &data_word(0x52525252, 0x09090909, 0x6a6a6a6a, 0xd5d5d5d5);
691 &data_word(0x30303030, 0x36363636, 0xa5a5a5a5, 0x38383838);
692 &data_word(0xbfbfbfbf, 0x40404040, 0xa3a3a3a3, 0x9e9e9e9e);
693 &data_word(0x81818181, 0xf3f3f3f3, 0xd7d7d7d7, 0xfbfbfbfb);
694 &data_word(0x7c7c7c7c, 0xe3e3e3e3, 0x39393939, 0x82828282);
695 &data_word(0x9b9b9b9b, 0x2f2f2f2f, 0xffffffff, 0x87878787);
696 &data_word(0x34343434, 0x8e8e8e8e, 0x43434343, 0x44444444);
697 &data_word(0xc4c4c4c4, 0xdededede, 0xe9e9e9e9, 0xcbcbcbcb);
698 &data_word(0x54545454, 0x7b7b7b7b, 0x94949494, 0x32323232);
699 &data_word(0xa6a6a6a6, 0xc2c2c2c2, 0x23232323, 0x3d3d3d3d);
700 &data_word(0xeeeeeeee, 0x4c4c4c4c, 0x95959595, 0x0b0b0b0b);
701 &data_word(0x42424242, 0xfafafafa, 0xc3c3c3c3, 0x4e4e4e4e);
702 &data_word(0x08080808, 0x2e2e2e2e, 0xa1a1a1a1, 0x66666666);
703 &data_word(0x28282828, 0xd9d9d9d9, 0x24242424, 0xb2b2b2b2);
704 &data_word(0x76767676, 0x5b5b5b5b, 0xa2a2a2a2, 0x49494949);
705 &data_word(0x6d6d6d6d, 0x8b8b8b8b, 0xd1d1d1d1, 0x25252525);
706 &data_word(0x72727272, 0xf8f8f8f8, 0xf6f6f6f6, 0x64646464);
707 &data_word(0x86868686, 0x68686868, 0x98989898, 0x16161616);
708 &data_word(0xd4d4d4d4, 0xa4a4a4a4, 0x5c5c5c5c, 0xcccccccc);
709 &data_word(0x5d5d5d5d, 0x65656565, 0xb6b6b6b6, 0x92929292);
710 &data_word(0x6c6c6c6c, 0x70707070, 0x48484848, 0x50505050);
711 &data_word(0xfdfdfdfd, 0xedededed, 0xb9b9b9b9, 0xdadadada);
712 &data_word(0x5e5e5e5e, 0x15151515, 0x46464646, 0x57575757);
713 &data_word(0xa7a7a7a7, 0x8d8d8d8d, 0x9d9d9d9d, 0x84848484);
714 &data_word(0x90909090, 0xd8d8d8d8, 0xabababab, 0x00000000);
715 &data_word(0x8c8c8c8c, 0xbcbcbcbc, 0xd3d3d3d3, 0x0a0a0a0a);
716 &data_word(0xf7f7f7f7, 0xe4e4e4e4, 0x58585858, 0x05050505);
717 &data_word(0xb8b8b8b8, 0xb3b3b3b3, 0x45454545, 0x06060606);
718 &data_word(0xd0d0d0d0, 0x2c2c2c2c, 0x1e1e1e1e, 0x8f8f8f8f);
719 &data_word(0xcacacaca, 0x3f3f3f3f, 0x0f0f0f0f, 0x02020202);
720 &data_word(0xc1c1c1c1, 0xafafafaf, 0xbdbdbdbd, 0x03030303);
721 &data_word(0x01010101, 0x13131313, 0x8a8a8a8a, 0x6b6b6b6b);
722 &data_word(0x3a3a3a3a, 0x91919191, 0x11111111, 0x41414141);
723 &data_word(0x4f4f4f4f, 0x67676767, 0xdcdcdcdc, 0xeaeaeaea);
724 &data_word(0x97979797, 0xf2f2f2f2, 0xcfcfcfcf, 0xcececece);
725 &data_word(0xf0f0f0f0, 0xb4b4b4b4, 0xe6e6e6e6, 0x73737373);
726 &data_word(0x96969696, 0xacacacac, 0x74747474, 0x22222222);
727 &data_word(0xe7e7e7e7, 0xadadadad, 0x35353535, 0x85858585);
728 &data_word(0xe2e2e2e2, 0xf9f9f9f9, 0x37373737, 0xe8e8e8e8);
729 &data_word(0x1c1c1c1c, 0x75757575, 0xdfdfdfdf, 0x6e6e6e6e);
730 &data_word(0x47474747, 0xf1f1f1f1, 0x1a1a1a1a, 0x71717171);
731 &data_word(0x1d1d1d1d, 0x29292929, 0xc5c5c5c5, 0x89898989);
732 &data_word(0x6f6f6f6f, 0xb7b7b7b7, 0x62626262, 0x0e0e0e0e);
733 &data_word(0xaaaaaaaa, 0x18181818, 0xbebebebe, 0x1b1b1b1b);
734 &data_word(0xfcfcfcfc, 0x56565656, 0x3e3e3e3e, 0x4b4b4b4b);
735 &data_word(0xc6c6c6c6, 0xd2d2d2d2, 0x79797979, 0x20202020);
736 &data_word(0x9a9a9a9a, 0xdbdbdbdb, 0xc0c0c0c0, 0xfefefefe);
737 &data_word(0x78787878, 0xcdcdcdcd, 0x5a5a5a5a, 0xf4f4f4f4);
738 &data_word(0x1f1f1f1f, 0xdddddddd, 0xa8a8a8a8, 0x33333333);
739 &data_word(0x88888888, 0x07070707, 0xc7c7c7c7, 0x31313131);
740 &data_word(0xb1b1b1b1, 0x12121212, 0x10101010, 0x59595959);
741 &data_word(0x27272727, 0x80808080, 0xecececec, 0x5f5f5f5f);
742 &data_word(0x60606060, 0x51515151, 0x7f7f7f7f, 0xa9a9a9a9);
743 &data_word(0x19191919, 0xb5b5b5b5, 0x4a4a4a4a, 0x0d0d0d0d);
744 &data_word(0x2d2d2d2d, 0xe5e5e5e5, 0x7a7a7a7a, 0x9f9f9f9f);
745 &data_word(0x93939393, 0xc9c9c9c9, 0x9c9c9c9c, 0xefefefef);
746 &data_word(0xa0a0a0a0, 0xe0e0e0e0, 0x3b3b3b3b, 0x4d4d4d4d);
747 &data_word(0xaeaeaeae, 0x2a2a2a2a, 0xf5f5f5f5, 0xb0b0b0b0);
748 &data_word(0xc8c8c8c8, 0xebebebeb, 0xbbbbbbbb, 0x3c3c3c3c);
749 &data_word(0x83838383, 0x53535353, 0x99999999, 0x61616161);
750 &data_word(0x17171717, 0x2b2b2b2b, 0x04040404, 0x7e7e7e7e);
751 &data_word(0xbabababa, 0x77777777, 0xd6d6d6d6, 0x26262626);
752 &data_word(0xe1e1e1e1, 0x69696969, 0x14141414, 0x63636363);
753 &data_word(0x55555555, 0x21212121, 0x0c0c0c0c, 0x7d7d7d7d);
696 &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
697 &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
698 &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
699 &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
700 &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
701 &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
702 &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
703 &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
704 &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
705 &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
706 &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
707 &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
708 &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
709 &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
710 &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
711 &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
712 &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
713 &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
714 &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
715 &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
716 &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
717 &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
718 &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
719 &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
720 &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
721 &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
722 &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
723 &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
724 &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
725 &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
726 &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
727 &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
754&function_end_B("_x86_AES_decrypt");
755
756# void AES_decrypt (const void *inp,void *out,const AES_KEY *key);
757&public_label("AES_Td");
758&function_begin("AES_decrypt");
759 &mov ($acc,&wparam(0)); # load inp
760 &mov ($key,&wparam(2)); # load key
761
762 &mov ($s0,"esp");
763 &sub ("esp",24);
764 &and ("esp",-64);
765 &add ("esp",4);
766 &mov (&DWP(16,"esp"),$s0);
767
768 &call (&label("pic_point")); # make it PIC!
769 &set_label("pic_point");
770 &blindpop("ebp");
771 &lea ("ebp",&DWP(&label("AES_Td")."-".&label("pic_point"),"ebp"));
772
728&function_end_B("_x86_AES_decrypt");
729
730# void AES_decrypt (const void *inp,void *out,const AES_KEY *key);
731&public_label("AES_Td");
732&function_begin("AES_decrypt");
733 &mov ($acc,&wparam(0)); # load inp
734 &mov ($key,&wparam(2)); # load key
735
736 &mov ($s0,"esp");
737 &sub ("esp",24);
738 &and ("esp",-64);
739 &add ("esp",4);
740 &mov (&DWP(16,"esp"),$s0);
741
742 &call (&label("pic_point")); # make it PIC!
743 &set_label("pic_point");
744 &blindpop("ebp");
745 &lea ("ebp",&DWP(&label("AES_Td")."-".&label("pic_point"),"ebp"));
746
747 # prefetch Td4
748 &lea ("ebp",&DWP(2048+128,"ebp"));
749 &mov ($s0,&DWP(0-128,"ebp"));
750 &mov ($s1,&DWP(32-128,"ebp"));
751 &mov ($s2,&DWP(64-128,"ebp"));
752 &mov ($s3,&DWP(96-128,"ebp"));
753 &mov ($s0,&DWP(128-128,"ebp"));
754 &mov ($s1,&DWP(160-128,"ebp"));
755 &mov ($s2,&DWP(192-128,"ebp"));
756 &mov ($s3,&DWP(224-128,"ebp"));
757 &lea ("ebp",&DWP(-2048-128,"ebp"));
758
773 &mov ($s0,&DWP(0,$acc)); # load input data
774 &mov ($s1,&DWP(4,$acc));
775 &mov ($s2,&DWP(8,$acc));
776 &mov ($s3,&DWP(12,$acc));
777
778 &call ("_x86_AES_decrypt");
779
780 &mov ("esp",&DWP(16,"esp"));

--- 19 unchanged lines hidden (view full) ---

800my $_inp=&DWP(20,"esp"); #copy of wparam(0)
801my $_out=&DWP(24,"esp"); #copy of wparam(1)
802my $_len=&DWP(28,"esp"); #copy of wparam(2)
803my $_key=&DWP(32,"esp"); #copy of wparam(3)
804my $_ivp=&DWP(36,"esp"); #copy of wparam(4)
805my $_tmp=&DWP(40,"esp"); #volatile variable
806my $ivec=&DWP(44,"esp"); #ivec[16]
807my $aes_key=&DWP(60,"esp"); #copy of aes_key
759 &mov ($s0,&DWP(0,$acc)); # load input data
760 &mov ($s1,&DWP(4,$acc));
761 &mov ($s2,&DWP(8,$acc));
762 &mov ($s3,&DWP(12,$acc));
763
764 &call ("_x86_AES_decrypt");
765
766 &mov ("esp",&DWP(16,"esp"));

--- 19 unchanged lines hidden (view full) ---

786my $_inp=&DWP(20,"esp"); #copy of wparam(0)
787my $_out=&DWP(24,"esp"); #copy of wparam(1)
788my $_len=&DWP(28,"esp"); #copy of wparam(2)
789my $_key=&DWP(32,"esp"); #copy of wparam(3)
790my $_ivp=&DWP(36,"esp"); #copy of wparam(4)
791my $_tmp=&DWP(40,"esp"); #volatile variable
792my $ivec=&DWP(44,"esp"); #ivec[16]
793my $aes_key=&DWP(60,"esp"); #copy of aes_key
794my $mark=&DWP(60+240,"esp"); #copy of aes_key->rounds
808
809&public_label("AES_Te");
810&public_label("AES_Td");
811&function_begin("AES_cbc_encrypt");
812 &mov ($s2 eq "ecx"? $s2 : "",&wparam(2)); # load len
813 &cmp ($s2,0);
814 &je (&label("enc_out"));
815

--- 44 unchanged lines hidden (view full) ---

860 &mov ($_esp,$key); # save %esp
861
862 &mov ($_inp,$s0); # save copy of inp
863 &mov ($_out,$s1); # save copy of out
864 &mov ($_len,$s2); # save copy of len
865 &mov ($_key,$s3); # save copy of key
866 &mov ($_ivp,$acc); # save copy of ivp
867
795
796&public_label("AES_Te");
797&public_label("AES_Td");
798&function_begin("AES_cbc_encrypt");
799 &mov ($s2 eq "ecx"? $s2 : "",&wparam(2)); # load len
800 &cmp ($s2,0);
801 &je (&label("enc_out"));
802

--- 44 unchanged lines hidden (view full) ---

847 &mov ($_esp,$key); # save %esp
848
849 &mov ($_inp,$s0); # save copy of inp
850 &mov ($_out,$s1); # save copy of out
851 &mov ($_len,$s2); # save copy of len
852 &mov ($_key,$s3); # save copy of key
853 &mov ($_ivp,$acc); # save copy of ivp
854
855 &mov ($mark,0); # copy of aes_key->rounds = 0;
868 if ($compromise) {
869 &cmp ($s2,$compromise);
870 &jb (&label("skip_ecopy"));
871 }
856 if ($compromise) {
857 &cmp ($s2,$compromise);
858 &jb (&label("skip_ecopy"));
859 }
872 # copy key schedule to stack
873 &mov ("ecx",244/4);
860 # do we copy key schedule to stack?
861 &mov ($s1 eq "ebx" ? $s1 : "",$s3);
862 &mov ($s2 eq "ecx" ? $s2 : "",244/4);
863 &sub ($s1,"ebp");
874 &mov ("esi",$s3);
864 &mov ("esi",$s3);
865 &and ($s1,0xfff);
875 &lea ("edi",$aes_key);
866 &lea ("edi",$aes_key);
876 &mov ($_key,"edi");
867 &cmp ($s1,2048);
868 &jb (&label("do_ecopy"));
869 &cmp ($s1,4096-244);
870 &jb (&label("skip_ecopy"));
877 &align (4);
871 &align (4);
878 &data_word(0xF689A5F3); # rep movsd
879 &set_label("skip_ecopy") if ($compromise);
872 &set_label("do_ecopy");
873 &mov ($_key,"edi");
874 &data_word(0xA5F3F689); # rep movsd
875 &set_label("skip_ecopy");
880
881 &mov ($acc,$s0);
882 &mov ($key,16);
883 &align (4);
884 &set_label("prefetch_te");
885 &mov ($s0,&DWP(0,"ebp"));
886 &mov ($s1,&DWP(32,"ebp"));
887 &mov ($s2,&DWP(64,"ebp"));

--- 49 unchanged lines hidden (view full) ---

937 &mov ($acc,$_ivp); # load ivp
938 &mov ($s2,&DWP(8,$key)); # restore last dwords
939 &mov ($s3,&DWP(12,$key));
940 &mov (&DWP(0,$acc),$s0); # save ivec
941 &mov (&DWP(4,$acc),$s1);
942 &mov (&DWP(8,$acc),$s2);
943 &mov (&DWP(12,$acc),$s3);
944
876
877 &mov ($acc,$s0);
878 &mov ($key,16);
879 &align (4);
880 &set_label("prefetch_te");
881 &mov ($s0,&DWP(0,"ebp"));
882 &mov ($s1,&DWP(32,"ebp"));
883 &mov ($s2,&DWP(64,"ebp"));

--- 49 unchanged lines hidden (view full) ---

933 &mov ($acc,$_ivp); # load ivp
934 &mov ($s2,&DWP(8,$key)); # restore last dwords
935 &mov ($s3,&DWP(12,$key));
936 &mov (&DWP(0,$acc),$s0); # save ivec
937 &mov (&DWP(4,$acc),$s1);
938 &mov (&DWP(8,$acc),$s2);
939 &mov (&DWP(12,$acc),$s3);
940
941 &cmp ($mark,0); # was the key schedule copied?
945 &mov ("edi",$_key);
946 &mov ("esp",$_esp);
942 &mov ("edi",$_key);
943 &mov ("esp",$_esp);
947 if ($compromise) {
948 &cmp (&wparam(2),$compromise);
949 &jb (&label("skip_ezero"));
950 }
944 &je (&label("skip_ezero"));
951 # zero copy of key schedule
952 &mov ("ecx",240/4);
953 &xor ("eax","eax");
954 &align (4);
945 # zero copy of key schedule
946 &mov ("ecx",240/4);
947 &xor ("eax","eax");
948 &align (4);
955 &data_word(0xF689ABF3); # rep stosd
956 &set_label("skip_ezero") if ($compromise);
949 &data_word(0xABF3F689); # rep stosd
950 &set_label("skip_ezero")
957 &popf ();
958 &set_label("enc_out");
959 &function_end_A();
960 &pushf (); # kludge, never executed
961
962 &align (4);
963 &set_label("enc_tail");
964 &push ($key eq "edi" ? $key : ""); # push ivp
965 &mov ($key,$_out); # load out
966 &mov ($s1,16);
967 &sub ($s1,$s2);
968 &cmp ($key,$acc); # compare with inp
969 &je (&label("enc_in_place"));
970 &align (4);
951 &popf ();
952 &set_label("enc_out");
953 &function_end_A();
954 &pushf (); # kludge, never executed
955
956 &align (4);
957 &set_label("enc_tail");
958 &push ($key eq "edi" ? $key : ""); # push ivp
959 &mov ($key,$_out); # load out
960 &mov ($s1,16);
961 &sub ($s1,$s2);
962 &cmp ($key,$acc); # compare with inp
963 &je (&label("enc_in_place"));
964 &align (4);
971 &data_word(0xF689A4F3); # rep movsb # copy input
965 &data_word(0xA4F3F689); # rep movsb # copy input
972 &jmp (&label("enc_skip_in_place"));
973 &set_label("enc_in_place");
974 &lea ($key,&DWP(0,$key,$s2));
975 &set_label("enc_skip_in_place");
976 &mov ($s2,$s1);
977 &xor ($s0,$s0);
978 &align (4);
966 &jmp (&label("enc_skip_in_place"));
967 &set_label("enc_in_place");
968 &lea ($key,&DWP(0,$key,$s2));
969 &set_label("enc_skip_in_place");
970 &mov ($s2,$s1);
971 &xor ($s0,$s0);
972 &align (4);
979 &data_word(0xF689AAF3); # rep stosb # zero tail
973 &data_word(0xAAF3F689); # rep stosb # zero tail
980 &pop ($key); # pop ivp
981
982 &mov ($acc,$_out); # output as input
983 &mov ($s0,&DWP(0,$key));
984 &mov ($s1,&DWP(4,$key));
985 &mov ($_len,16); # len=16
986 &jmp (&label("enc_loop")); # one more spin...
987
988#----------------------------- DECRYPT -----------------------------#
989&align (4);
990&set_label("DECRYPT");
991 &lea ("ebp",&DWP(&label("AES_Td")."-".&label("pic_point"),"ebp"));
992
993 # allocate aligned stack frame...
994 &lea ($key,&DWP(-64-244,"esp"));
995 &and ($key,-64);
996
997 # ... and make sure it doesn't alias with AES_Td modulo 4096
998 &mov ($s0,"ebp");
974 &pop ($key); # pop ivp
975
976 &mov ($acc,$_out); # output as input
977 &mov ($s0,&DWP(0,$key));
978 &mov ($s1,&DWP(4,$key));
979 &mov ($_len,16); # len=16
980 &jmp (&label("enc_loop")); # one more spin...
981
982#----------------------------- DECRYPT -----------------------------#
983&align (4);
984&set_label("DECRYPT");
985 &lea ("ebp",&DWP(&label("AES_Td")."-".&label("pic_point"),"ebp"));
986
987 # allocate aligned stack frame...
988 &lea ($key,&DWP(-64-244,"esp"));
989 &and ($key,-64);
990
991 # ... and make sure it doesn't alias with AES_Td modulo 4096
992 &mov ($s0,"ebp");
999 &lea ($s1,&DWP(3072,"ebp"));
993 &lea ($s1,&DWP(2048+256,"ebp"));
1000 &mov ($s3,$key);
1001 &and ($s0,0xfff); # s = %ebp&0xfff
994 &mov ($s3,$key);
995 &and ($s0,0xfff); # s = %ebp&0xfff
1002 &and ($s1,0xfff); # e = (%ebp+3072)&0xfff
996 &and ($s1,0xfff); # e = (%ebp+2048+256)&0xfff
1003 &and ($s3,0xfff); # p = %esp&0xfff
1004
1005 &cmp ($s3,$s1); # if (p>=e) %esp =- (p-e);
1006 &jb (&label("td_break_out"));
1007 &sub ($s3,$s1);
1008 &sub ($key,$s3);
1009 &jmp (&label("td_ok"));
1010 &set_label("td_break_out"); # else %esp -= (p-s)&0xfff + framesz;

--- 14 unchanged lines hidden (view full) ---

1025 &mov ($_esp,$key); # save %esp
1026
1027 &mov ($_inp,$s0); # save copy of inp
1028 &mov ($_out,$s1); # save copy of out
1029 &mov ($_len,$s2); # save copy of len
1030 &mov ($_key,$s3); # save copy of key
1031 &mov ($_ivp,$acc); # save copy of ivp
1032
997 &and ($s3,0xfff); # p = %esp&0xfff
998
999 &cmp ($s3,$s1); # if (p>=e) %esp =- (p-e);
1000 &jb (&label("td_break_out"));
1001 &sub ($s3,$s1);
1002 &sub ($key,$s3);
1003 &jmp (&label("td_ok"));
1004 &set_label("td_break_out"); # else %esp -= (p-s)&0xfff + framesz;

--- 14 unchanged lines hidden (view full) ---

1019 &mov ($_esp,$key); # save %esp
1020
1021 &mov ($_inp,$s0); # save copy of inp
1022 &mov ($_out,$s1); # save copy of out
1023 &mov ($_len,$s2); # save copy of len
1024 &mov ($_key,$s3); # save copy of key
1025 &mov ($_ivp,$acc); # save copy of ivp
1026
1027 &mov ($mark,0); # copy of aes_key->rounds = 0;
1033 if ($compromise) {
1034 &cmp ($s2,$compromise);
1035 &jb (&label("skip_dcopy"));
1036 }
1028 if ($compromise) {
1029 &cmp ($s2,$compromise);
1030 &jb (&label("skip_dcopy"));
1031 }
1037 # copy key schedule to stack
1038 &mov ("ecx",244/4);
1032 # do we copy key schedule to stack?
1033 &mov ($s1 eq "ebx" ? $s1 : "",$s3);
1034 &mov ($s2 eq "ecx" ? $s2 : "",244/4);
1035 &sub ($s1,"ebp");
1039 &mov ("esi",$s3);
1036 &mov ("esi",$s3);
1037 &and ($s1,0xfff);
1040 &lea ("edi",$aes_key);
1038 &lea ("edi",$aes_key);
1041 &mov ($_key,"edi");
1039 &cmp ($s1,2048+256);
1040 &jb (&label("do_dcopy"));
1041 &cmp ($s1,4096-244);
1042 &jb (&label("skip_dcopy"));
1042 &align (4);
1043 &align (4);
1043 &data_word(0xF689A5F3); # rep movsd
1044 &set_label("skip_dcopy") if ($compromise);
1044 &set_label("do_dcopy");
1045 &mov ($_key,"edi");
1046 &data_word(0xA5F3F689); # rep movsd
1047 &set_label("skip_dcopy");
1045
1046 &mov ($acc,$s0);
1048
1049 &mov ($acc,$s0);
1047 &mov ($key,24);
1050 &mov ($key,18);
1048 &align (4);
1049 &set_label("prefetch_td");
1050 &mov ($s0,&DWP(0,"ebp"));
1051 &mov ($s1,&DWP(32,"ebp"));
1052 &mov ($s2,&DWP(64,"ebp"));
1053 &mov ($s3,&DWP(96,"ebp"));
1054 &lea ("ebp",&DWP(128,"ebp"));
1055 &dec ($key);
1056 &jnz (&label("prefetch_td"));
1051 &align (4);
1052 &set_label("prefetch_td");
1053 &mov ($s0,&DWP(0,"ebp"));
1054 &mov ($s1,&DWP(32,"ebp"));
1055 &mov ($s2,&DWP(64,"ebp"));
1056 &mov ($s3,&DWP(96,"ebp"));
1057 &lea ("ebp",&DWP(128,"ebp"));
1058 &dec ($key);
1059 &jnz (&label("prefetch_td"));
1057 &sub ("ebp",3072);
1060 &sub ("ebp",2048+256);
1058
1059 &cmp ($acc,$_out);
1060 &je (&label("dec_in_place")); # in-place processing...
1061
1062 &mov ($key,$_ivp); # load ivp
1063 &mov ($_tmp,$key);
1064
1065 &align (4);

--- 50 unchanged lines hidden (view full) ---

1116 &lea ($key,$ivec);
1117 &mov (&DWP(0,$key),$s0); # dump output to stack
1118 &mov (&DWP(4,$key),$s1);
1119 &mov (&DWP(8,$key),$s2);
1120 &mov (&DWP(12,$key),$s3);
1121 &lea ($s2 eq "ecx" ? $s2 : "",&DWP(16,$acc));
1122 &mov ($acc eq "esi" ? $acc : "",$key);
1123 &mov ($key eq "edi" ? $key : "",$_out); # load out
1061
1062 &cmp ($acc,$_out);
1063 &je (&label("dec_in_place")); # in-place processing...
1064
1065 &mov ($key,$_ivp); # load ivp
1066 &mov ($_tmp,$key);
1067
1068 &align (4);

--- 50 unchanged lines hidden (view full) ---

1119 &lea ($key,$ivec);
1120 &mov (&DWP(0,$key),$s0); # dump output to stack
1121 &mov (&DWP(4,$key),$s1);
1122 &mov (&DWP(8,$key),$s2);
1123 &mov (&DWP(12,$key),$s3);
1124 &lea ($s2 eq "ecx" ? $s2 : "",&DWP(16,$acc));
1125 &mov ($acc eq "esi" ? $acc : "",$key);
1126 &mov ($key eq "edi" ? $key : "",$_out); # load out
1124 &data_word(0xF689A4F3); # rep movsb # copy output
1127 &data_word(0xA4F3F689); # rep movsb # copy output
1125 &mov ($key,$_inp); # use inp as temp ivp
1126 &jmp (&label("dec_end"));
1127
1128 &align (4);
1129 &set_label("dec_in_place");
1130 &set_label("dec_in_place_loop");
1131 &lea ($key,$ivec);
1132 &mov ($s0,&DWP(0,$acc)); # read input

--- 50 unchanged lines hidden (view full) ---

1183 &align (4);
1184 &set_label("dec_in_place_partial");
1185 # one can argue if this is actually required...
1186 &mov ($key eq "edi" ? $key : "",$_out);
1187 &lea ($acc eq "esi" ? $acc : "",$ivec);
1188 &lea ($key,&DWP(0,$key,$s2));
1189 &lea ($acc,&DWP(16,$acc,$s2));
1190 &neg ($s2 eq "ecx" ? $s2 : "");
1128 &mov ($key,$_inp); # use inp as temp ivp
1129 &jmp (&label("dec_end"));
1130
1131 &align (4);
1132 &set_label("dec_in_place");
1133 &set_label("dec_in_place_loop");
1134 &lea ($key,$ivec);
1135 &mov ($s0,&DWP(0,$acc)); # read input

--- 50 unchanged lines hidden (view full) ---

1186 &align (4);
1187 &set_label("dec_in_place_partial");
1188 # one can argue if this is actually required...
1189 &mov ($key eq "edi" ? $key : "",$_out);
1190 &lea ($acc eq "esi" ? $acc : "",$ivec);
1191 &lea ($key,&DWP(0,$key,$s2));
1192 &lea ($acc,&DWP(16,$acc,$s2));
1193 &neg ($s2 eq "ecx" ? $s2 : "");
1191 &data_word(0xF689A4F3); # rep movsb # restore tail
1194 &data_word(0xA4F3F689); # rep movsb # restore tail
1192
1193 &align (4);
1194 &set_label("dec_out");
1195
1196 &align (4);
1197 &set_label("dec_out");
1198 &cmp ($mark,0); # was the key schedule copied?
1195 &mov ("edi",$_key);
1196 &mov ("esp",$_esp);
1199 &mov ("edi",$_key);
1200 &mov ("esp",$_esp);
1197 if ($compromise) {
1198 &cmp (&wparam(2),$compromise);
1199 &jb (&label("skip_dzero"));
1200 }
1201 &je (&label("skip_dzero"));
1201 # zero copy of key schedule
1202 &mov ("ecx",240/4);
1203 &xor ("eax","eax");
1204 &align (4);
1202 # zero copy of key schedule
1203 &mov ("ecx",240/4);
1204 &xor ("eax","eax");
1205 &align (4);
1205 &data_word(0xF689ABF3); # rep stosd
1206 &set_label("skip_dzero") if ($compromise);
1206 &data_word(0xABF3F689); # rep stosd
1207 &set_label("skip_dzero")
1207 &popf ();
1208&function_end("AES_cbc_encrypt");
1209}
1210
1211#------------------------------------------------------------------#
1212
1213sub enckey()
1214{

--- 317 unchanged lines hidden ---
1208 &popf ();
1209&function_end("AES_cbc_encrypt");
1210}
1211
1212#------------------------------------------------------------------#
1213
1214sub enckey()
1215{

--- 317 unchanged lines hidden ---