#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. Rights for redistribution and usage in source and binary
# forms are granted according to the OpenSSL license.
# ====================================================================
#
# 2.22x RC4 tune-up:-) It should be noted though that my hand [as in
# "hand-coded assembler"] doesn't account for the whole improvement
# coefficient. It turned out that eliminating RC4_CHAR from the config
# line results in ~40% improvement (yes, even for the C implementation).
# Presumably it has everything to do with AMD cache architecture and
# RAW or whatever penalties. Once again: the module *requires* a config
# line *without* RC4_CHAR! As for the coding "secret," I bet on partial
# register arithmetic. For example, instead of 'inc %r8; and $255,%r8'
# I simply use 'inc %r8b'. Even though the optimization manual
# discourages operating on partial registers, it turned out to be the
# best bet. At least for AMD... How IA32E would perform remains to be
# seen...

# As was shown by Marc Bevand, reordering a couple of load operations
# results in an even higher performance gain of 3.3x:-) At least on
# Opteron... For reference, 1x in this case is RC4_CHAR C code
# compiled with gcc 3.3.2, which performs at ~54MBps per 1GHz clock.
# The latter means that if you want to *estimate* what to expect from
# *your* Opteron, multiply 54 by 3.3 and by the clock frequency in GHz.

# The Intel P4 EM64T core was found to run the AMD64 code really
# slowly... The only way to achieve comparable performance on P4 was
# to keep RC4_CHAR. Kind of ironic, huh? As it's apparently impossible
# to compose blended code which would perform within a 30% margin on
# both AMD and Intel platforms, I implement both cases. See rc4_skey.c
# for further details...

# The P4 EM64T core appears to be "allergic" to 64-bit inc/dec.
# Replacing those with add/sub results in a 50% performance
# improvement of the folded loop...

# As was shown by Zou Nanhai, loop unrolling can improve Intel EM64T
# performance by >30% [unlike the P4 32-bit case, that is]. But this
# is provided that loads are reordered even more aggressively! Both
# code paths, AMD64 and EM64T, reorder loads in essentially the same
# manner as my IA-64 implementation. On Opteron this resulted in a
# modest 5% improvement [I had to test it], while the final Intel P4
# performance now achieves a respectable 432MBps on a 2.8GHz processor.
# For reference: if executed on Xeon, the current RC4_CHAR code path is
# 2.7x faster than the RC4_INT one, while on Opteron it is only 25%
# slower than the RC4_INT one [meaning that if CPU µ-arch detection is
# not implemented, this final RC4_CHAR code path should be preferred,
# as it provides better *all-round* performance].
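
# Editor's orientation sketch (not part of the original commentary; the
# variable names below are illustrative only). The generated RC4 routine
# takes four arguments in the order key data, length, input, output
# [matching the arg1..arg4 assignments below], and the loops implement
# the standard RC4 byte update, roughly:
#
#	x = (x+1) & 0xff;	tx = S[x];
#	y = (y+tx) & 0xff;	ty = S[y];
#	S[y] = tx;		S[x] = ty;
#	*out++ = *in++ ^ S[(tx+ty) & 0xff];
#
# .Lloop1/.Lloop8 walk a 32-bit-per-entry (RC4_INT) table, while
# .Lcloop1/.Lcloop8 walk a byte-per-entry (RC4_CHAR) table. To
# regenerate the assembler, the script is run with a single argument
# that is forwarded to ../perlasm/x86_64-xlate.pl while the generated
# code is piped through that translator, e.g.
# "perl rc4-x86_64.pl rc4-x86_64.s" (the file name here is only an
# example).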

$output=shift;
open STDOUT,"| $^X ../perlasm/x86_64-xlate.pl $output";

$dat="%rdi";	    # arg1, key data
$len="%rsi";	    # arg2, number of bytes
$inp="%rdx";	    # arg3, input pointer
$out="%rcx";	    # arg4, output pointer

@XX=("%r8","%r10");	# RC4 "x" index [two registers, rotated below]
@TX=("%r9","%r11");	# S[x]
$YY="%r12";		# RC4 "y" index
$TY="%r13";		# S[y]

$code=<<___;
.text

.globl	RC4
.type	RC4,\@function,4
.align	16
RC4:	or	$len,$len
	jne	.Lentry
	ret
.Lentry:
	push	%r12
	push	%r13

	add	\$8,$dat
	movl	-8($dat),$XX[0]#d
	movl	-4($dat),$YY#d
	cmpl	\$-1,256($dat)		# -1 flags the RC4_CHAR table layout [see rc4_skey.c]
	je	.LRC4_CHAR
	inc	$XX[0]#b
	movl	($dat,$XX[0],4),$TX[0]#d
	test	\$-8,$len		# fewer than 8 bytes to process?
	jz	.Lloop1
	jmp	.Lloop8
.align	16
.Lloop8:
___
for ($i=0;$i<8;$i++) {
$code.=<<___;
	add	$TX[0]#b,$YY#b
	mov	$XX[0],$XX[1]
	movl	($dat,$YY,4),$TY#d
	ror	\$8,%rax			# ror is redundant when $i=0
	inc	$XX[1]#b
	movl	($dat,$XX[1],4),$TX[1]#d
	cmp	$XX[1],$YY
	movl	$TX[0]#d,($dat,$YY,4)
	cmove	$TX[0],$TX[1]
	movl	$TY#d,($dat,$XX[0],4)
	add	$TX[0]#b,$TY#b
	movb	($dat,$TY,4),%al
___
push(@TX,shift(@TX)); push(@XX,shift(@XX));	# "rotate" registers
}
$code.=<<___;
	ror	\$8,%rax
	sub	\$8,$len

	xor	($inp),%rax
	add	\$8,$inp
	mov	%rax,($out)
	add	\$8,$out

	test	\$-8,$len
	jnz	.Lloop8
	cmp	\$0,$len
	jne	.Lloop1
___
$code.=<<___;
.Lexit:
	sub	\$1,$XX[0]#b
	movl	$XX[0]#d,-8($dat)
	movl	$YY#d,-4($dat)

	pop	%r13
	pop	%r12
	ret
.align	16
.Lloop1:
	add	$TX[0]#b,$YY#b
	movl	($dat,$YY,4),$TY#d
	movl	$TX[0]#d,($dat,$YY,4)
	movl	$TY#d,($dat,$XX[0],4)
	add	$TY#b,$TX[0]#b
	inc	$XX[0]#b
	movl	($dat,$TX[0],4),$TY#d
	movl	($dat,$XX[0],4),$TX[0]#d
	xorb	($inp),$TY#b
	inc	$inp
	movb	$TY#b,($out)
	inc	$out
	dec	$len
	jnz	.Lloop1
	jmp	.Lexit

.align	16
.LRC4_CHAR:
	add	\$1,$XX[0]#b
	movzb	($dat,$XX[0]),$TX[0]#d
	test	\$-8,$len
	jz	.Lcloop1
	push	%rbx
	jmp	.Lcloop8
.align	16
.Lcloop8:
	mov	($inp),%eax
	mov	4($inp),%ebx
___
# unroll 2x4-wise, because 64-bit rotates kill Intel P4...
for ($i=0;$i<4;$i++) {
$code.=<<___;
	add	$TX[0]#b,$YY#b
	lea	1($XX[0]),$XX[1]
	movzb	($dat,$YY),$TY#d
	movzb	$XX[1]#b,$XX[1]#d
	movzb	($dat,$XX[1]),$TX[1]#d
	movb	$TX[0]#b,($dat,$YY)
	cmp	$XX[1],$YY
	movb	$TY#b,($dat,$XX[0])
	jne	.Lcmov$i			# Intel cmov is sloooow...
	mov	$TX[0],$TX[1]
.Lcmov$i:
	add	$TX[0]#b,$TY#b
	xor	($dat,$TY),%al
	ror	\$8,%eax
___
push(@TX,shift(@TX)); push(@XX,shift(@XX));	# "rotate" registers
}
for ($i=4;$i<8;$i++) {
$code.=<<___;
	add	$TX[0]#b,$YY#b
	lea	1($XX[0]),$XX[1]
	movzb	($dat,$YY),$TY#d
	movzb	$XX[1]#b,$XX[1]#d
	movzb	($dat,$XX[1]),$TX[1]#d
	movb	$TX[0]#b,($dat,$YY)
	cmp	$XX[1],$YY
	movb	$TY#b,($dat,$XX[0])
	jne	.Lcmov$i			# Intel cmov is sloooow...
	mov	$TX[0],$TX[1]
.Lcmov$i:
	add	$TX[0]#b,$TY#b
	xor	($dat,$TY),%bl
	ror	\$8,%ebx
___
push(@TX,shift(@TX)); push(@XX,shift(@XX));	# "rotate" registers
}
$code.=<<___;
	lea	-8($len),$len
	mov	%eax,($out)
	lea	8($inp),$inp
	mov	%ebx,4($out)
	lea	8($out),$out

	test	\$-8,$len
	jnz	.Lcloop8
	pop	%rbx
	cmp	\$0,$len
	jne	.Lcloop1
	jmp	.Lexit
___
$code.=<<___;
.align	16
.Lcloop1:
	add	$TX[0]#b,$YY#b
	movzb	($dat,$YY),$TY#d
	movb	$TX[0]#b,($dat,$YY)
	movb	$TY#b,($dat,$XX[0])
	add	$TX[0]#b,$TY#b
	add	\$1,$XX[0]#b
	movzb	($dat,$TY),$TY#d
	movzb	($dat,$XX[0]),$TX[0]#d
	xorb	($inp),$TY#b
	lea	1($inp),$inp
	movb	$TY#b,($out)
	lea	1($out),$out
	sub	\$1,$len
	jnz	.Lcloop1
	jmp	.Lexit
.size	RC4,.-RC4
___

# Expand the "#b"/"#w"/"#d" annotations used above into the
# corresponding partial-register names, e.g. "%r8#b" -> "%r8b".
$code =~ s/#([bwd])/$1/gm;

print $code;

close STDOUT;