#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# June 2011
#
# This is RC4+MD5 "stitch" implementation. The idea, as spelled in
# http://download.intel.com/design/intarch/papers/323686.pdf, is that
# since both algorithms exhibit instruction-level parallelism, ILP,
# below theoretical maximum, interleaving them would allow to utilize
# processor resources better and achieve better performance. RC4
# instruction sequence is virtually identical to rc4-x86_64.pl, which
# is heavily based on submission by Maxim Perminov, Maxim Locktyukhin
# and Jim Guilford of Intel. MD5 is fresh implementation aiming to
# minimize register usage, which was used as "main thread" with RC4
# weaved into it, one RC4 round per one MD5 round. In addition to the
# stitched subroutine the script can generate standalone replacement
# md5_block_asm_data_order and RC4. Below are performance numbers in
# cycles per processed byte, less is better, for these the standalone
# subroutines, sum of them, and stitched one:
#
#                 RC4      MD5    RC4+MD5   stitch   gain
# Opteron         6.5(*)   5.4    11.9      7.0      +70%(*)
# Core2           6.5      5.8    12.3      7.7      +60%
# Westmere        4.3      5.2    9.5       7.0      +36%
# Sandy Bridge    4.2      5.5    9.7       6.8      +43%
# Atom            9.3      6.5    15.8      11.1     +42%
#
# (*) rc4-x86_64.pl delivers 5.3 on Opteron, so real improvement
#     is +53%...
#
# Usage: perl rc4-md5-x86_64.pl [flavour] [output]
#
# A single argument containing a dot is taken as the output file name.
# The generated text is piped through x86_64-xlate.pl, which is looked
# up next to this script and then in ../../perlasm/.

use strict;

my ($rc4,$md5)=(1,1);   # what to generate?
my $D;                  # if set to "#", MD5 is stitched into RC4(),
$D="#" if (!$md5);      # but its result is discarded. Idea here is
                        # to be able to use 'openssl speed rc4' for
                        # benchmarking the stitched subroutine...

my $flavour = shift;
my $output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

my $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; my $dir=$1; my $xlate;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# $flavour and $output are intentionally left unquoted: either may be
# undefined, in which case it has to vanish from the command line rather
# than be passed to the translator as an empty argument.
open OUT,"| \"$^X\" \"$xlate\" $flavour $output"
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

my ($dat,$in0,$out,$ctx,$inp,$len, $func,$nargs);

if ($rc4 && !$md5) {
    ($dat,$len,$in0,$out) = ("%rdi","%rsi","%rdx","%rcx");
    $func="RC4";                        $nargs=4;
} elsif ($md5 && !$rc4) {
    ($ctx,$inp,$len) = ("%rdi","%rsi","%rdx");
    $func="md5_block_asm_data_order";   $nargs=3;
} else {
    ($dat,$in0,$out,$ctx,$inp,$len) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9");
    $func="rc4_md5_enc";                $nargs=6;
    # void rc4_md5_enc(
    #           RC4_KEY *key,           #
    #           const void *in0,        # RC4 input
    #           void *out,              # RC4 output
    #           MD5_CTX *ctx,           #
    #           const void *inp,        # MD5 input
    #           size_t len);            # number of 64-byte blocks
}

# Per-step additive constants, indexed by step number 0..63.
my @K=( 0xd76aa478,0xe8c7b756,0x242070db,0xc1bdceee,
        0xf57c0faf,0x4787c62a,0xa8304613,0xfd469501,
        0x698098d8,0x8b44f7af,0xffff5bb1,0x895cd7be,
        0x6b901122,0xfd987193,0xa679438e,0x49b40821,

        0xf61e2562,0xc040b340,0x265e5a51,0xe9b6c7aa,
        0xd62f105d,0x02441453,0xd8a1e681,0xe7d3fbc8,
        0x21e1cde6,0xc33707d6,0xf4d50d87,0x455a14ed,
        0xa9e3e905,0xfcefa3f8,0x676f02d9,0x8d2a4c8a,

        0xfffa3942,0x8771f681,0x6d9d6122,0xfde5380c,
        0xa4beea44,0x4bdecfa9,0xf6bb4b60,0xbebfbc70,
        0x289b7ec6,0xeaa127fa,0xd4ef3085,0x04881d05,
        0xd9d4d039,0xe6db99e5,0x1fa27cf8,0xc4ac5665,

        0xf4292244,0x432aff97,0xab9423a7,0xfc93a039,
        0x655b59c3,0x8f0ccc92,0xffeff47d,0x85845dd1,
        0x6fa87e4f,0xfe2ce6e0,0xa3014314,0x4e0811a1,
        0xf7537e82,0xbd3af235,0x2ad7d2bb,0xeb86d391 );

my @V=("%r8d","%r9d","%r10d","%r11d");  # MD5 registers
my $tmp="%r12d";

my @XX=("%rbp","%rsi");                 # RC4 registers
my @TX=("%rax","%rbx");
my $YY="%rcx";
my $TY="%rdx";

my $MOD=32;                             # 16, 32 or 64

# Accumulates the whole assembly text. Lines tagged "#md5#"/"#rc4#" are
# activated (tag removed) or left as comments by the post-processing at
# the bottom of the script; "%reg#b|w|d" is rewritten by reg_part().
my $code="";

$code.=<<___;
.text
.align	16

.globl	$func
.type	$func,\@function,$nargs
$func:
	cmp	\$0,$len
	je	.Labort
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	sub	\$40,%rsp
.Lbody:
___
if ($rc4) {
$code.=<<___;
$D#md5#	mov	$ctx,%r11		# reassign arguments
	mov	$len,%r12
	mov	$in0,%r13
	mov	$out,%r14
$D#md5#	mov	$inp,%r15
___
    $ctx="%r11" if ($md5);              # reassign arguments
    $len="%r12";
    $in0="%r13";
    $out="%r14";
    $inp="%r15" if ($md5);
    $inp=$in0   if (!$md5);
$code.=<<___;
	xor	$XX[0],$XX[0]
	xor	$YY,$YY

	lea	8($dat),$dat
	mov	-8($dat),$XX[0]#b
	mov	-4($dat),$YY#b

	inc	$XX[0]#b
	sub	$in0,$out
	movl	($dat,$XX[0],4),$TX[0]#d
___
$code.=<<___ if (!$md5);
	xor	$TX[1],$TX[1]
	test	\$-128,$len
	jz	.Loop1
	sub	$XX[0],$TX[1]
	and	\$`$MOD-1`,$TX[1]
	jz	.Loop${MOD}_is_hot
	sub	$TX[1],$len
.Loop${MOD}_warmup:
	add	$TX[0]#b,$YY#b
	movl	($dat,$YY,4),$TY#d
	movl	$TX[0]#d,($dat,$YY,4)
	movl	$TY#d,($dat,$XX[0],4)
	add	$TY#b,$TX[0]#b
	inc	$XX[0]#b
	movl	($dat,$TX[0],4),$TY#d
	movl	($dat,$XX[0],4),$TX[0]#d
	xorb	($in0),$TY#b
	movb	$TY#b,($out,$in0)
	lea	1($in0),$in0
	dec	$TX[1]
	jnz	.Loop${MOD}_warmup

	mov	$YY,$TX[1]
	xor	$YY,$YY
	mov	$TX[1]#b,$YY#b

.Loop${MOD}_is_hot:
	mov	$len,32(%rsp)		# save original $len
	shr	\$6,$len		# number of 64-byte blocks
___
    if ($D && !$md5) {                  # stitch in dummy MD5
        $md5=1;
        $ctx="%r11";
        $inp="%r15";
        $code.=<<___;
	mov	%rsp,$ctx
	mov	$in0,$inp
___
    }
}
$code.=<<___;
#rc4#	add	$TX[0]#b,$YY#b
#rc4#	lea	($dat,$XX[0],4),$XX[1]
	shl	\$6,$len
	add	$inp,$len		# pointer to the end of input
	mov	$len,16(%rsp)

#md5#	mov	$ctx,24(%rsp)		# save pointer to MD5_CTX
#md5#	mov	0*4($ctx),$V[0]		# load current hash value from MD5_CTX
#md5#	mov	1*4($ctx),$V[1]
#md5#	mov	2*4($ctx),$V[2]
#md5#	mov	3*4($ctx),$V[3]
	jmp	.Loop

.align	16
.Loop:
#md5#	mov	$V[0],0*4(%rsp)		# put aside current hash value
#md5#	mov	$V[1],1*4(%rsp)
#md5#	mov	$V[2],2*4(%rsp)
#md5#	mov	$V[3],$tmp		# forward reference
#md5#	mov	$V[3],3*4(%rsp)
___

# R0..R3 each append one stitched step to $code: the "#md5#" lines are
# one MD5 step on ($a,$b,$c,$d) and the "#rc4#" lines are one RC4 step.
# Arguments: step number $i (0..63) followed by the four MD5 registers
# in their current rotation. Key-stream bytes are gathered with pinsrw
# into %xmm0 (even steps) and %xmm1 (odd steps); on the 16th step of a
# group they are combined and xor-ed into the 16 input bytes loaded into
# %xmm2..%xmm5.

# Steps 0..15.
sub R0 {
    my ($i,$a,$b,$c,$d)=@_;
    my @rot0=(7,12,17,22);
    my $j=$i%16;
    my $k=$i%$MOD;
    my $xmm="%xmm".($j&1);
    $code.="	movdqu	($in0),%xmm2\n"		if ($rc4 && $j==15);
    $code.="	add	\$$MOD,$XX[0]#b\n"	if ($rc4 && $j==15 && $k==$MOD-1);
    $code.="	pxor	$xmm,$xmm\n"		if ($rc4 && $j<=1);
    $code.=<<___;
#rc4#	movl	($dat,$YY,4),$TY#d
#md5#	xor	$c,$tmp
#rc4#	movl	$TX[0]#d,($dat,$YY,4)
#md5#	and	$b,$tmp
#md5#	add	4*`$j`($inp),$a
#rc4#	add	$TY#b,$TX[0]#b
#rc4#	movl	`4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#d
#md5#	add	\$$K[$i],$a
#md5#	xor	$d,$tmp
#rc4#	movz	$TX[0]#b,$TX[0]#d
#rc4#	movl	$TY#d,4*$k($XX[1])
#md5#	add	$tmp,$a
#rc4#	add	$TX[1]#b,$YY#b
#md5#	rol	\$$rot0[$j%4],$a
#md5#	mov	`$j==15?"$b":"$c"`,$tmp		# forward reference
#rc4#	pinsrw	\$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n
#md5#	add	$b,$a
___
    $code.=<<___ if ($rc4 && $j==15 && $k==$MOD-1);
	mov	$YY,$XX[1]
	xor	$YY,$YY		# keyword to partial register
	mov	$XX[1]#b,$YY#b
	lea	($dat,$XX[0],4),$XX[1]
___
    $code.=<<___ if ($rc4 && $j==15);
	psllq	\$8,%xmm1
	pxor	%xmm0,%xmm2
	pxor	%xmm1,%xmm2
___
}

# Steps 16..31.
sub R1 {
    my ($i,$a,$b,$c,$d)=@_;
    my @rot1=(5,9,14,20);
    my $j=$i%16;
    my $k=$i%$MOD;
    my $xmm="%xmm".($j&1);
    $code.="	movdqu	16($in0),%xmm3\n"	if ($rc4 && $j==15);
    $code.="	add	\$$MOD,$XX[0]#b\n"	if ($rc4 && $j==15 && $k==$MOD-1);
    $code.="	pxor	$xmm,$xmm\n"		if ($rc4 && $j<=1);
    $code.=<<___;
#rc4#	movl	($dat,$YY,4),$TY#d
#md5#	xor	$b,$tmp
#rc4#	movl	$TX[0]#d,($dat,$YY,4)
#md5#	and	$d,$tmp
#md5#	add	4*`((1+5*$j)%16)`($inp),$a
#rc4#	add	$TY#b,$TX[0]#b
#rc4#	movl	`4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#d
#md5#	add	\$$K[$i],$a
#md5#	xor	$c,$tmp
#rc4#	movz	$TX[0]#b,$TX[0]#d
#rc4#	movl	$TY#d,4*$k($XX[1])
#md5#	add	$tmp,$a
#rc4#	add	$TX[1]#b,$YY#b
#md5#	rol	\$$rot1[$j%4],$a
#md5#	mov	`$j==15?"$c":"$b"`,$tmp		# forward reference
#rc4#	pinsrw	\$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n
#md5#	add	$b,$a
___
    $code.=<<___ if ($rc4 && $j==15 && $k==$MOD-1);
	mov	$YY,$XX[1]
	xor	$YY,$YY		# keyword to partial register
	mov	$XX[1]#b,$YY#b
	lea	($dat,$XX[0],4),$XX[1]
___
    $code.=<<___ if ($rc4 && $j==15);
	psllq	\$8,%xmm1
	pxor	%xmm0,%xmm3
	pxor	%xmm1,%xmm3
___
}

# Steps 32..47.
sub R2 {
    my ($i,$a,$b,$c,$d)=@_;
    my @rot2=(4,11,16,23);
    my $j=$i%16;
    my $k=$i%$MOD;
    my $xmm="%xmm".($j&1);
    $code.="	movdqu	32($in0),%xmm4\n"	if ($rc4 && $j==15);
    $code.="	add	\$$MOD,$XX[0]#b\n"	if ($rc4 && $j==15 && $k==$MOD-1);
    $code.="	pxor	$xmm,$xmm\n"		if ($rc4 && $j<=1);
    $code.=<<___;
#rc4#	movl	($dat,$YY,4),$TY#d
#md5#	xor	$c,$tmp
#rc4#	movl	$TX[0]#d,($dat,$YY,4)
#md5#	xor	$b,$tmp
#md5#	add	4*`((5+3*$j)%16)`($inp),$a
#rc4#	add	$TY#b,$TX[0]#b
#rc4#	movl	`4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#d
#md5#	add	\$$K[$i],$a
#rc4#	movz	$TX[0]#b,$TX[0]#d
#md5#	add	$tmp,$a
#rc4#	movl	$TY#d,4*$k($XX[1])
#rc4#	add	$TX[1]#b,$YY#b
#md5#	rol	\$$rot2[$j%4],$a
#md5#	mov	`$j==15?"\\\$-1":"$c"`,$tmp	# forward reference
#rc4#	pinsrw	\$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n
#md5#	add	$b,$a
___
    $code.=<<___ if ($rc4 && $j==15 && $k==$MOD-1);
	mov	$YY,$XX[1]
	xor	$YY,$YY		# keyword to partial register
	mov	$XX[1]#b,$YY#b
	lea	($dat,$XX[0],4),$XX[1]
___
    $code.=<<___ if ($rc4 && $j==15);
	psllq	\$8,%xmm1
	pxor	%xmm0,%xmm4
	pxor	%xmm1,%xmm4
___
}

# Steps 48..63. Unlike R0..R2, the 16th step always re-normalizes both
# RC4 index registers, regardless of $MOD.
sub R3 {
    my ($i,$a,$b,$c,$d)=@_;
    my @rot3=(6,10,15,21);
    my $j=$i%16;
    my $k=$i%$MOD;
    my $xmm="%xmm".($j&1);
    $code.="	movdqu	48($in0),%xmm5\n"	if ($rc4 && $j==15);
    $code.="	add	\$$MOD,$XX[0]#b\n"	if ($rc4 && $j==15 && $k==$MOD-1);
    $code.="	pxor	$xmm,$xmm\n"		if ($rc4 && $j<=1);
    $code.=<<___;
#rc4#	movl	($dat,$YY,4),$TY#d
#md5#	xor	$d,$tmp
#rc4#	movl	$TX[0]#d,($dat,$YY,4)
#md5#	or	$b,$tmp
#md5#	add	4*`((7*$j)%16)`($inp),$a
#rc4#	add	$TY#b,$TX[0]#b
#rc4#	movl	`4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#d
#md5#	add	\$$K[$i],$a
#rc4#	movz	$TX[0]#b,$TX[0]#d
#md5#	xor	$c,$tmp
#rc4#	movl	$TY#d,4*$k($XX[1])
#md5#	add	$tmp,$a
#rc4#	add	$TX[1]#b,$YY#b
#md5#	rol	\$$rot3[$j%4],$a
#md5#	mov	\$-1,$tmp			# forward reference
#rc4#	pinsrw	\$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n
#md5#	add	$b,$a
___
    $code.=<<___ if ($rc4 && $j==15);
	mov	$XX[0],$XX[1]
	xor	$XX[0],$XX[0]	# keyword to partial register
	mov	$XX[1]#b,$XX[0]#b
	mov	$YY,$XX[1]
	xor	$YY,$YY		# keyword to partial register
	mov	$XX[1]#b,$YY#b
	lea	($dat,$XX[0],4),$XX[1]
	psllq	\$8,%xmm1
	pxor	%xmm0,%xmm5
	pxor	%xmm1,%xmm5
___
}

# Unroll all 64 steps; after each one rotate the MD5 register roles and
# swap the two RC4 temporaries.
my $i=0;
for(;$i<16;$i++) { R0($i,@V); unshift(@V,pop(@V)); push(@TX,shift(@TX)); }
for(;$i<32;$i++) { R1($i,@V); unshift(@V,pop(@V)); push(@TX,shift(@TX)); }
for(;$i<48;$i++) { R2($i,@V); unshift(@V,pop(@V)); push(@TX,shift(@TX)); }
for(;$i<64;$i++) { R3($i,@V); unshift(@V,pop(@V)); push(@TX,shift(@TX)); }

$code.=<<___;
#md5#	add	0*4(%rsp),$V[0]		# accumulate hash value
#md5#	add	1*4(%rsp),$V[1]
#md5#	add	2*4(%rsp),$V[2]
#md5#	add	3*4(%rsp),$V[3]

#rc4#	movdqu	%xmm2,($out,$in0)	# write RC4 output
#rc4#	movdqu	%xmm3,16($out,$in0)
#rc4#	movdqu	%xmm4,32($out,$in0)
#rc4#	movdqu	%xmm5,48($out,$in0)
#md5#	lea	64($inp),$inp
#rc4#	lea	64($in0),$in0
	cmp	16(%rsp),$inp		# are we done?
	jb	.Loop

#md5#	mov	24(%rsp),$len		# restore pointer to MD5_CTX
#rc4#	sub	$TX[0]#b,$YY#b		# correct $YY
#md5#	mov	$V[0],0*4($len)		# write MD5_CTX
#md5#	mov	$V[1],1*4($len)
#md5#	mov	$V[2],2*4($len)
#md5#	mov	$V[3],3*4($len)
___
$code.=<<___ if ($rc4 && (!$md5 || $D));
	mov	32(%rsp),$len		# restore original $len
	and	\$63,$len		# remaining bytes
	jnz	.Loop1
	jmp	.Ldone

.align	16
.Loop1:
	add	$TX[0]#b,$YY#b
	movl	($dat,$YY,4),$TY#d
	movl	$TX[0]#d,($dat,$YY,4)
	movl	$TY#d,($dat,$XX[0],4)
	add	$TY#b,$TX[0]#b
	inc	$XX[0]#b
	movl	($dat,$TX[0],4),$TY#d
	movl	($dat,$XX[0],4),$TX[0]#d
	xorb	($in0),$TY#b
	movb	$TY#b,($out,$in0)
	lea	1($in0),$in0
	dec	$len
	jnz	.Loop1

.Ldone:
___
$code.=<<___;
#rc4#	sub	\$1,$XX[0]#b
#rc4#	movl	$XX[0]#d,-8($dat)
#rc4#	movl	$YY#d,-4($dat)

	mov	40(%rsp),%r15
	mov	48(%rsp),%r14
	mov	56(%rsp),%r13
	mov	64(%rsp),%r12
	mov	72(%rsp),%rbp
	mov	80(%rsp),%rbx
	lea	88(%rsp),%rsp
.Lepilogue:
.Labort:
	ret
.size	$func,.-$func
___

if ($rc4 && $D) {       # sole purpose of this section is to provide
                        # option to use the generated module as drop-in
                        # replacement for rc4-x86_64.pl for debugging
                        # and testing purposes...
my ($idx,$ido)=("%r8","%r9");
my ($dat,$len,$inp)=("%rdi","%rsi","%rdx");

$code.=<<___;
.globl	RC4_set_key
.type	RC4_set_key,\@function,3
.align	16
RC4_set_key:
	lea	8($dat),$dat
	lea	($inp,$len),$inp
	neg	$len
	mov	$len,%rcx
	xor	%eax,%eax
	xor	$ido,$ido
	xor	%r10,%r10
	xor	%r11,%r11
	jmp	.Lw1stloop

.align	16
.Lw1stloop:
	mov	%eax,($dat,%rax,4)
	add	\$1,%al
	jnc	.Lw1stloop

	xor	$ido,$ido
	xor	$idx,$idx
.align	16
.Lw2ndloop:
	mov	($dat,$ido,4),%r10d
	add	($inp,$len,1),$idx#b
	add	%r10b,$idx#b
	add	\$1,$len
	mov	($dat,$idx,4),%r11d
	cmovz	%rcx,$len
	mov	%r10d,($dat,$idx,4)
	mov	%r11d,($dat,$ido,4)
	add	\$1,$ido#b
	jnc	.Lw2ndloop

	xor	%eax,%eax
	mov	%eax,-8($dat)
	mov	%eax,-4($dat)
	ret
.size	RC4_set_key,.-RC4_set_key

.globl	RC4_options
.type	RC4_options,\@abi-omnipotent
.align	16
RC4_options:
	lea	.Lopts(%rip),%rax
	ret
.align	64
.Lopts:
.asciz	"rc4(64x,int)"
.align	64
.size	RC4_options,.-RC4_options
___
}
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
my $rec="%rcx";
my $frame="%rdx";
my $context="%r8";
my $disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lbody(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lbody
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	lea	.Lepilogue(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lin_prologue

	mov	40(%rax),%r15
	mov	48(%rax),%r14
	mov	56(%rax),%r13
	mov	64(%rax),%r12
	mov	72(%rax),%rbp
	mov	80(%rax),%rbx
	lea	88(%rax),%rax

	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lin_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_$func
	.rva	.LSEH_end_$func
	.rva	.LSEH_info_$func

.section	.xdata
.align	8
.LSEH_info_$func:
	.byte	9,0,0,0
	.rva	se_handler
___
}

# Translate the "%reg#b", "%reg#w" and "%reg#d" notation used above into
# the corresponding sub-register name.
#   $reg  - register name as written in the template, e.g. "%rax", "%r12"
#   $conv - "b", "w" or "d"
# Numbered registers simply get the suffix appended (%r12 -> %r12d);
# legacy names are rewritten (%rax -> %al / %ax / %eax). Returns the
# resulting register name.
sub reg_part {
    my ($reg,$conv)=@_;
    if ($reg =~ /%r[0-9]+/)     { $reg .= $conv; }
    elsif ($conv eq "b")        { $reg =~ s/%[er]([^x]+)x?/%$1l/; }
    elsif ($conv eq "w")        { $reg =~ s/%[er](.+)/%$1/; }
    elsif ($conv eq "d")        { $reg =~ s/%[er](.+)/%e$1/; }
    return $reg;
}

# Post-process the template: resolve sub-register notation, evaluate the
# `...` arithmetic, turn "pinsrw \$0," into movd, and activate the
# "#md5#"/"#rc4#" lines that belong to the selected configuration.
$code =~ s/(%[a-z0-9]+)#([bwd])/reg_part($1,$2)/gem;
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/pinsrw\s+\$0,/movd /gm;

$code =~ s/#md5#//gm if ($md5);
$code =~ s/#rc4#//gm if ($rc4);

print $code;

# Closing the pipe is where a failing translator (non-zero exit status)
# or a write error becomes visible, so do not ignore the result.
close STDOUT or die "error closing STDOUT: $! (child status $?)";