#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. Rights for redistribution and usage in source and binary
# forms are granted according to the OpenSSL license.
# ====================================================================
#
# 2.22x RC4 tune-up:-) It should be noted though that my hand [as in
# "hand-coded assembler"] doesn't stand for the whole improvement
# coefficient. It turned out that eliminating RC4_CHAR from config
# line results in ~40% improvement (yes, even for C implementation).
# Presumably it has everything to do with AMD cache architecture and
# RAW or whatever penalties. Once again! The module *requires* config
# line *without* RC4_CHAR! As for coding "secret," I bet on partial
# register arithmetics. For example instead of 'inc %r8; and $255,%r8'
# I simply 'inc %r8b'. Even though optimization manual discourages
# to operate on partial registers, it turned out to be the best bet.
# At least for AMD... How IA32E would perform remains to be seen...

# As was shown by Marc Bevand reordering of couple of load operations
# results in even higher performance gain of 3.3x:-) At least on
# Opteron... For reference, 1x in this case is RC4_CHAR C-code
# compiled with gcc 3.3.2, which performs at ~54MBps per 1GHz clock.
# Latter means that if you want to *estimate* what to expect from
# *your* CPU, then multiply 54 by 3.3 and clock frequency in GHz.

# Intel P4 EM64T core was found to run the AMD64 code really slow...
# The only way to achieve comparable performance on P4 is to keep
# RC4_CHAR. Kind of ironic, huh? As it's apparently impossible to
# compose blended code, which would perform even within 30% marginal
# on either AMD and Intel platforms, I implement both cases. See
# rc4_skey.c for further details... This applies to 0.9.8 and later.
# In 0.9.7 context RC4_CHAR codepath is never engaged and ~70 bytes
# of code remain redundant.

use strict;
use warnings;

# Usage: perl <this-script> [output-file]
#
# The generated assembly is written to output-file.  When the file name
# contains "win64a.s" or "win64a.asm" the code is emitted in MASM syntax
# with the Win64 argument registers; otherwise AT&T (gas) syntax is used.
# Without an argument the AT&T flavour is written to the current STDOUT.
my $output = shift;

# "[s|asm]" in the historical pattern was a character class, not an
# alternation, and the dot was unescaped; match the real suffixes instead.
my $win64a = ( defined $output && $output =~ m/win64a\.(?:s|asm)/ ) ? 1 : 0;

if ( defined $output ) {
    # Low-precedence "or" so that a failed open really dies ("||" would bind
    # to the file name string and never trigger).
    open STDOUT, '>', $output or die "can't open $output: $!";
}

# Registers holding the four RC4() arguments (key schedule, length, in, out).
my ( $dat, $len, $inp, $out );
if ($win64a) {
    $dat = "%rcx";    # arg1
    $len = "%rdx";    # arg2
    $inp = "%rsi";    # r8, arg3 moves here
    $out = "%rdi";    # r9, arg4 moves here
}
else {
    $dat = "%rdi";    # arg1
    $len = "%rsi";    # arg2
    $inp = "%rdx";    # arg3
    $out = "%rcx";    # arg4
}

# Scratch registers: the two RC4 state indices and the values loaded for them.
my $XX = "%r10";
my $TX = "%r8";
my $YY = "%r11";
my $TY = "%r9";

# PTR($operand) - translate a neutral memory operand such as
# "DWORD:-8[%rdi]" or "DWORD:[%rdi+%r10*4]" into the addressing syntax of
# the selected assembler.  The "SIZE:" prefix is left in place; it is
# rewritten or stripped by the post-processing at the end of the script.
# Returns the translated operand string.
#
# Deliberately declared without a prototype: the sub takes one argument
# and is invoked as &PTR(...) from the code templates below.
sub PTR {
    my ($operand) = @_;
    if ($win64a) {
        $operand =~ s/\[([\S]+)\+([\S]+)\]/[$2+$1]/g;      # [%rN+%rM*4]->[%rM*4+%rN]
        $operand =~ s/:([^\[]+)\[([^\]]+)\]/:[$2+$1]/g;    # :off[ea]->:[ea+off]
    }
    else {
        $operand =~ s/[\+\*]/,/g;                          # [%rN+%rM*4]->[%rN,%rM,4]
        $operand =~ s/\[([^\]]+)\]/($1)/g;                 # [%rN]->(%rN)
    }
    return $operand;
}

# _expand_snippet($perl) - evaluate one back-quoted Perl snippet taken from
# the code templates and return its value.  Dies instead of silently
# emitting an empty operand when the snippet fails to evaluate.
sub _expand_snippet {
    my ($snippet) = @_;
    my $value = eval $snippet;    ## no critic (ProhibitStringyEval) - snippets come from this file only
    die "can't evaluate `$snippet`: $@" if $@ || !defined $value;
    return $value;
}

# ---- function prologue -------------------------------------------------
my $code;
if ($win64a) {
    # Win64: %rsi/%rdi are callee-saved, so preserve them and move the
    # third and fourth arguments into the registers the body expects.
    $code = <<___;
_TEXT	SEGMENT
PUBLIC	RC4
ALIGN	16
RC4	PROC
	or	$len,$len
	jne	.Lentry
	repret
.Lentry:
	push	%rdi
	push	%rsi
	sub	\$40,%rsp
	mov	%r8,$inp
	mov	%r9,$out
___
}
else {
    $code = <<___;
.text

.globl	RC4
.type	RC4,\@function
.align	16
RC4:	or	$len,$len
	jne	.Lentry
	repret
.Lentry:
___
}

# ---- load state, pick code path, first byte of the 8-byte loop ---------
$code .= <<___;
	add	\$8,$dat
	movl	`&PTR("DWORD:-8[$dat]")`,$XX#d
	movl	`&PTR("DWORD:-4[$dat]")`,$YY#d
	cmpl	\$-1,`&PTR("DWORD:256[$dat]")`
	je	.LRC4_CHAR
	test	\$-8,$len
	jz	.Lloop1
.align	16
.Lloop8:
	inc	$XX#b
	movl	`&PTR("DWORD:[$dat+$XX*4]")`,$TX#d
	add	$TX#b,$YY#b
	movl	`&PTR("DWORD:[$dat+$YY*4]")`,$TY#d
	movl	$TX#d,`&PTR("DWORD:[$dat+$YY*4]")`
	movl	$TY#d,`&PTR("DWORD:[$dat+$XX*4]")`
	add	$TX#b,$TY#b
	inc	$XX#b
	movl	`&PTR("DWORD:[$dat+$XX*4]")`,$TX#d
	movb	`&PTR("BYTE:[$dat+$TY*4]")`,%al
___

# ---- six identical middle rounds of the unrolled 8-byte loop -----------
my $middle_round = <<___;
	add	$TX#b,$YY#b
	ror	\$8,%rax
	movl	`&PTR("DWORD:[$dat+$YY*4]")`,$TY#d
	movl	$TX#d,`&PTR("DWORD:[$dat+$YY*4]")`
	movl	$TY#d,`&PTR("DWORD:[$dat+$XX*4]")`
	add	$TX#b,$TY#b
	inc	$XX#b
	movl	`&PTR("DWORD:[$dat+$XX*4]")`,$TX#d
	movb	`&PTR("BYTE:[$dat+$TY*4]")`,%al
___
$code .= $middle_round x 6;

# ---- last round, xor 8 bytes at once, loop control, state write-back ---
$code .= <<___;
	add	$TX#b,$YY#b
	ror	\$8,%rax
	movl	`&PTR("DWORD:[$dat+$YY*4]")`,$TY#d
	movl	$TX#d,`&PTR("DWORD:[$dat+$YY*4]")`
	movl	$TY#d,`&PTR("DWORD:[$dat+$XX*4]")`
	sub	\$8,$len
	add	$TY#b,$TX#b
	movb	`&PTR("BYTE:[$dat+$TX*4]")`,%al
	ror	\$8,%rax
	add	\$8,$inp
	add	\$8,$out

	xor	`&PTR("QWORD:-8[$inp]")`,%rax
	mov	%rax,`&PTR("QWORD:-8[$out]")`

	test	\$-8,$len
	jnz	.Lloop8
	cmp	\$0,$len
	jne	.Lloop1
.Lexit:
	movl	$XX#d,`&PTR("DWORD:-8[$dat]")`
	movl	$YY#d,`&PTR("DWORD:-4[$dat]")`
___

# Win64 epilogue: undo the stack adjustment and restore %rsi/%rdi.
$code .= <<___ if $win64a;
	add	\$40,%rsp
	pop	%rsi
	pop	%rdi
___

# ---- byte-at-a-time loop and the RC4_CHAR code path --------------------
$code .= <<___;
	repret
.align	16
.Lloop1:
	movzb	`&PTR("BYTE:[$inp]")`,%eax
	inc	$XX#b
	movl	`&PTR("DWORD:[$dat+$XX*4]")`,$TX#d
	add	$TX#b,$YY#b
	movl	`&PTR("DWORD:[$dat+$YY*4]")`,$TY#d
	movl	$TX#d,`&PTR("DWORD:[$dat+$YY*4]")`
	movl	$TY#d,`&PTR("DWORD:[$dat+$XX*4]")`
	add	$TY#b,$TX#b
	movl	`&PTR("DWORD:[$dat+$TX*4]")`,$TY#d
	xor	$TY,%rax
	inc	$inp
	movb	%al,`&PTR("BYTE:[$out]")`
	inc	$out
	dec	$len
	jnz	.Lloop1
	jmp	.Lexit

.align	16
.LRC4_CHAR:
	inc	$XX#b
	movzb	`&PTR("BYTE:[$dat+$XX]")`,$TX#d
	add	$TX#b,$YY#b
	movzb	`&PTR("BYTE:[$dat+$YY]")`,$TY#d
	movb	$TX#b,`&PTR("BYTE:[$dat+$YY]")`
	movb	$TY#b,`&PTR("BYTE:[$dat+$XX]")`
	add	$TX#b,$TY#b
	movzb	`&PTR("BYTE:[$dat+$TY]")`,$TY#d
	xorb	`&PTR("BYTE:[$inp]")`,$TY#b
	movb	$TY#b,`&PTR("BYTE:[$out]")`
	inc	$inp
	inc	$out
	dec	$len
	jnz	.LRC4_CHAR
	jmp	.Lexit
___

# ---- function trailer --------------------------------------------------
if ($win64a) {
    $code .= <<___;
RC4	ENDP
_TEXT	ENDS
END
___
}
else {
    $code .= <<___;
.size	RC4,.-RC4
___
}

# ---- post-processing ---------------------------------------------------
# "%r10#d" -> "%r10d": the '#' only separates the interpolated register
# name from its size suffix inside the templates.
$code =~ s/#([bwd])/$1/gm;

# Replace every `...` snippet with the value of the Perl code inside it.
$code =~ s/\`([^\`]*)\`/_expand_snippet($1)/gem;

if ($win64a) {
    # AT&T -> MASM.  The order of these substitutions matters: sigils are
    # stripped before local labels gain their '$' prefix, and operands are
    # swapped before "repret" expands into a comma-separated DB directive.
    $code =~ s/\.align/ALIGN/gm;
    $code =~ s/[\$%]//gm;
    $code =~ s/\.L/\$L/gm;
    $code =~ s/([\w]+)([\s]+)([\S]+),([\S]+)/$1$2$4,$3/gm;    # src,dst -> dst,src
    $code =~ s/([QD]*WORD|BYTE):/$1 PTR/gm;
    $code =~ s/mov[bwlq]/mov/gm;
    $code =~ s/movzb/movzx/gm;
    $code =~ s/repret/DB\t0F3h,0C3h/gm;
    $code =~ s/cmpl/cmp/gm;
    $code =~ s/xorb/xor/gm;
}
else {
    # gas infers operand size from the mnemonic/registers; drop the prefix.
    $code =~ s/([QD]*WORD|BYTE)://gm;
    $code =~ s/repret/.byte\t0xF3,0xC3/gm;
}

print $code or die "can't write generated code: $!";

# Buffered write errors only surface at close, so check it when the output
# went to a file this script opened itself.
if ( defined $output ) {
    close STDOUT or die "can't close $output: $!";
}