#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# February 2009
#
# Performance is 2x of gcc 3.4.6 on z10. Coding "secret" is to
# "cluster" Address Generation Interlocks, so that one pipeline stall
# resolves several dependencies.

# November 2010.
#
# Adapt for -m31 build. If kernel supports what's called "highgprs"
# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
# instructions and achieve "64-bit" performance even in 31-bit legacy
# application context. The feature is not specific to any particular
# processor, as long as it's "z-CPU". Latter implies that the code
# remains z/Architecture specific. On z990 it was measured to perform
# 50% better than code generated by gcc 4.3.

# Usage: rc4-s390x.pl <flavour> [...] [<output-file>]
#
# This script only generates assembler text. It appends the text for
# RC4, private_RC4_set_key and RC4_options to $code and prints it to
# <output-file>, or to the inherited STDOUT when no output file name is
# found among the arguments.

use strict;
use warnings;

# First argument selects the ABI flavour. A flavour containing "31" or
# "32" gets 4-byte save slots and the 32-bit stm/lm forms; anything
# else gets 8-byte slots and the "g" (64-bit) forms.
my $flavour = shift;
$flavour = '' unless defined $flavour;

my ($SIZE_T, $g);
if ($flavour =~ /3[12]/) {
	$SIZE_T = 4;
	$g      = "";
} else {
	$SIZE_T = 8;
	$g      = "g";
}

# Skip remaining arguments until one looks like a plain file name
# ("name.ext"); that one becomes the output file.
my $output;
while (($output = shift) && ($output !~ /^\w[\w\-]*\.\w+$/)) {}

# Only redirect STDOUT when an output name was actually found.
# Re-opening STDOUT on an empty name would fail after implicitly
# closing the handle, silently discarding everything printed below.
if (defined $output && $output ne '') {
	open STDOUT, '>', $output
	    or die "can't open $output for writing: $!";
}

my $rp = '%r14';	# register used by "br" to return from RC4/set_key
my $sp = '%r15';	# base register of the register save area

my $code = <<___;
.text

___

# void RC4(RC4_KEY *key,size_t len,const void *inp,void *out)
{
# Register assignment. The generated code reads one byte each from
# 0(key) and 1(key) into XX[0] and YY, and indexes a byte table that
# starts at 2(key).
my $acc = '%r0';
my $cnt = '%r1';
my $key = '%r2';
my $len = '%r3';
my $inp = '%r4';
my $out = '%r5';

my @XX = ('%r6', '%r7');
my @TX = ('%r8', '%r9');
my $YY = '%r10';
my $TY = '%r11';

$code .= <<___;
.globl	RC4
.type	RC4,\@function
.align	64
RC4:
	stm${g}	%r6,%r11,6*$SIZE_T($sp)
___
# Emitted only for the 31/32-bit flavours, ahead of the 64-bit
# shift/test of len below.
$code .= <<___ if ($flavour =~ /3[12]/);
	llgfr	$len,$len
___
$code .= <<___;
	llgc	$XX[0],0($key)
	llgc	$YY,1($key)
	la	$XX[0],1($XX[0])
	nill	$XX[0],0xff
	srlg	$cnt,$len,3
	ltgr	$cnt,$cnt
	llgc	$TX[0],2($XX[0],$key)
	jz	.Lshort
	j	.Loop8

.align	64
.Loop8:
___
# Unroll eight steps of the main loop. Each step's output byte is
# merged into $acc one step later (the last one after the loop), so
# $acc holds eight bytes when it is XORed with the input and stored.
for my $i (0 .. 7) {
$code .= <<___;
	la	$YY,0($YY,$TX[0])	# $i
	nill	$YY,255
	la	$XX[1],1($XX[0])
	nill	$XX[1],255
___
$code .= <<___ if ($i == 1);
	llgc	$acc,2($TY,$key)
___
$code .= <<___ if ($i > 1);
	sllg	$acc,$acc,8
	ic	$acc,2($TY,$key)
___
$code .= <<___;
	llgc	$TY,2($YY,$key)
	stc	$TX[0],2($YY,$key)
	llgc	$TX[1],2($XX[1],$key)
	stc	$TY,2($XX[0],$key)
	cr	$XX[1],$YY
	jne	.Lcmov$i
	la	$TX[1],0($TX[0])
.Lcmov$i:
	la	$TY,0($TY,$TX[0])
	nill	$TY,255
___
# "Rotate" the register names so the next step uses the values just
# loaded. Eight rotations of the two-element lists restore the
# original order for the code after the loop.
push(@TX, shift(@TX));
push(@XX, shift(@XX));
}

$code .= <<___;
	lg	$TX[1],0($inp)
	sllg	$acc,$acc,8
	la	$inp,8($inp)
	ic	$acc,2($TY,$key)
	xgr	$acc,$TX[1]
	stg	$acc,0($out)
	la	$out,8($out)
	brctg	$cnt,.Loop8

.Lshort:
	lghi	$acc,7
	ngr	$len,$acc
	jz	.Lexit
	j	.Loop1

.align	16
.Loop1:
	la	$YY,0($YY,$TX[0])
	nill	$YY,255
	llgc	$TY,2($YY,$key)
	stc	$TX[0],2($YY,$key)
	stc	$TY,2($XX[0],$key)
	ar	$TY,$TX[0]
	ahi	$XX[0],1
	nill	$TY,255
	nill	$XX[0],255
	llgc	$acc,0($inp)
	la	$inp,1($inp)
	llgc	$TY,2($TY,$key)
	llgc	$TX[0],2($XX[0],$key)
	xr	$acc,$TY
	stc	$acc,0($out)
	la	$out,1($out)
	brct	$len,.Loop1

.Lexit:
	ahi	$XX[0],-1
	stc	$XX[0],0($key)
	stc	$YY,1($key)
	lm${g}	%r6,%r11,6*$SIZE_T($sp)
	br	$rp
.size	RC4,.-RC4
.string	"RC4 for s390x, CRYPTOGAMS by <appro\@openssl.org>"

___
}

# void RC4_set_key(RC4_KEY *key,unsigned int len,const void *inp)
{
# Register assignment for the key schedule. The generated code stores
# a zero halfword at 0(key) and then fills the 256 bytes from 2(key)
# with the values 0..255 (.L1stloop) before mixing in the key bytes
# read from inp (.L2ndloop).
my $cnt  = '%r0';
my $idx  = '%r1';
my $key  = '%r2';
my $len  = '%r3';
my $inp  = '%r4';
my $acc  = '%r5';
my $dat  = '%r6';
my $ikey = '%r7';
my $iinp = '%r8';

$code .= <<___;
.globl	private_RC4_set_key
.type	private_RC4_set_key,\@function
.align	64
private_RC4_set_key:
	stm${g}	%r6,%r8,6*$SIZE_T($sp)
	lhi	$cnt,256
	la	$idx,0(%r0)
	sth	$idx,0($key)
.align	4
.L1stloop:
	stc	$idx,2($idx,$key)
	la	$idx,1($idx)
	brct	$cnt,.L1stloop

	lghi	$ikey,-256
	lr	$cnt,$len
	la	$iinp,0(%r0)
	la	$idx,0(%r0)
.align	16
.L2ndloop:
	llgc	$acc,2+256($ikey,$key)
	llgc	$dat,0($iinp,$inp)
	la	$idx,0($idx,$acc)
	la	$ikey,1($ikey)
	la	$idx,0($idx,$dat)
	nill	$idx,255
	la	$iinp,1($iinp)
	tml	$ikey,255
	llgc	$dat,2($idx,$key)
	stc	$dat,2+256-1($ikey,$key)
	stc	$acc,2($idx,$key)
	jz	.Ldone
	brct	$cnt,.L2ndloop
	lr	$cnt,$len
	la	$iinp,0(%r0)
	j	.L2ndloop
.Ldone:
	lm${g}	%r6,%r8,6*$SIZE_T($sp)
	br	$rp
.size	private_RC4_set_key,.-private_RC4_set_key

___
}

# const char *RC4_options()
# Loads the address of the constant string below into %r2 and returns.
$code .= <<___;
.globl	RC4_options
.type	RC4_options,\@function
.align	16
RC4_options:
	larl	%r2,.Loptions
	br	%r14
.size	RC4_options,.-RC4_options
.section	.rodata
.Loptions:
.align	8
.string	"rc4(8x,char)"
___

print $code;
# Closing forces a flush; buffered write errors only surface here.
close STDOUT or die "error closing STDOUT: $!";