#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# February 2009
#
# Performance is 2x of gcc 3.4.6 on z10. The coding "secret" is to
# "cluster" Address Generation Interlocks, so that one pipeline stall
# resolves several dependencies.

# November 2010.
#
# Adapt for -m31 build. If kernel supports what's called "highgprs"
# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
# instructions and achieve "64-bit" performance even in 31-bit legacy
# application context. The feature is not specific to any particular
# processor, as long as it's "z-CPU". The latter implies that the code
# remains z/Architecture specific. On z990 it was measured to perform
# 50% better than code generated by gcc 4.3.

# Command line: <flavour> [<other args> ...] [<output file>]
#
# The variables set here are deliberately package globals: the code
# generation sections further down interpolate them into the assembly.
$flavour = shift;

# A flavour containing "31" or "32" selects the 31-bit ABI: 4-byte
# size_t and the plain stm/lm instructions. Anything else selects the
# 64-bit ABI: 8-byte size_t and the "g" forms (stmg/lmg).
if ($flavour =~ /3[12]/) {
	$SIZE_T=4;
	$g="";
} else {
	$SIZE_T=8;
	$g="g";
}

# Skip remaining arguments until one looks like a file name
# ("name.ext"); $output ends up false if there is none.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}

# Redirect STDOUT only when an output name was actually given, and fail
# loudly if it cannot be opened instead of silently writing elsewhere.
if (defined($output) && $output ne "") {
	open(STDOUT, ">", $output) or die "can't open $output: $!";
}

$rp="%r14";	# return address register, used by "br $rp"
$sp="%r15";	# stack pointer, base of the register save area
$code=<<___;
.text

___
45238384Sjkim
# void RC4(RC4_KEY *key,size_t len,const void *inp,void *out)
#
# Appends the assembly for RC4() to $code. The emitted routine reads the
# two state indices from bytes 0 and 1 of the key structure, uses the
# byte-wide table that starts at offset 2, processes the input eight
# bytes at a time in .Loop8 (len>>3 iterations), handles the remaining
# len&7 bytes one at a time in .Loop1, and stores the indices back on
# exit.
{
# Register roles used in the templates below.
my ($acc,$cnt)           = ("%r0","%r1");
my ($key,$len,$inp,$out) = ("%r2","%r3","%r4","%r5");

# Two-element sets holding the current/next index and table byte; they
# are swapped after every unrolled round.
my @XX = ("%r6","%r7");
my @TX = ("%r8","%r9");
my ($YY,$TY) = ("%r10","%r11");

# Entry point: save the callee-saved registers that are used below.
$code .= <<___;
.globl	RC4
.type	RC4,\@function
.align	64
RC4:
	stm${g}	%r6,%r11,6*$SIZE_T($sp)
___

# 31/32-bit flavours: widen the 32-bit len before the 64-bit arithmetic.
if ($flavour =~ /3[12]/) {
	$code .= <<___;
	llgfr	$len,$len
___
}

# Load both indices, pre-increment the first one modulo 256 and fetch
# the first table byte; branch to .Lshort when len < 8.
$code .= <<___;
	llgc	$XX[0],0($key)
	llgc	$YY,1($key)
	la	$XX[0],1($XX[0])
	nill	$XX[0],0xff
	srlg	$cnt,$len,3
	ltgr	$cnt,$cnt
	llgc	$TX[0],2($XX[0],$key)
	jz	.Lshort
	j	.Loop8

.align	64
.Loop8:
___

# Eight unrolled rounds. The output byte looked up for round N is
# fetched during round N+1 (and after the loop for the last round), so
# round 0 emits no fetch, round 1 starts $acc and later rounds shift
# $acc left by 8 bits before inserting the next byte.
for my $i (0 .. 7) {
	$code .= <<___;
	la	$YY,0($YY,$TX[0])	# $i
	nill	$YY,255
	la	$XX[1],1($XX[0])
	nill	$XX[1],255
___
	if ($i == 1) {
		$code .= <<___;
	llgc	$acc,2($TY,$key)
___
	}
	elsif ($i > 1) {
		$code .= <<___;
	sllg	$acc,$acc,8
	ic	$acc,2($TY,$key)
___
	}

	# Swap the two table bytes. The next round's byte is loaded before
	# the second store, so when the next index equals $YY the value
	# just written is taken instead (the .Lcmov fix-up).
	$code .= <<___;
	llgc	$TY,2($YY,$key)
	stc	$TX[0],2($YY,$key)
	llgc	$TX[1],2($XX[1],$key)
	stc	$TY,2($XX[0],$key)
	cr	$XX[1],$YY
	jne	.Lcmov$i
	la	$TX[1],0($TX[0])
.Lcmov$i:
	la	$TY,0($TY,$TX[0])
	nill	$TY,255
___

	# "Rotate" registers: the next round's registers become current.
	@TX = reverse @TX;
	@XX = reverse @XX;
}

# After an even number of swaps @XX/@TX are back in their initial order.
# Finish the 8-byte word, XOR it with 8 input bytes and store the
# result; then the byte-at-a-time tail and the epilogue.
$code .= <<___;
	lg	$TX[1],0($inp)
	sllg	$acc,$acc,8
	la	$inp,8($inp)
	ic	$acc,2($TY,$key)
	xgr	$acc,$TX[1]
	stg	$acc,0($out)
	la	$out,8($out)
	brctg	$cnt,.Loop8

.Lshort:
	lghi	$acc,7
	ngr	$len,$acc
	jz	.Lexit
	j	.Loop1

.align	16
.Loop1:
	la	$YY,0($YY,$TX[0])
	nill	$YY,255
	llgc	$TY,2($YY,$key)
	stc	$TX[0],2($YY,$key)
	stc	$TY,2($XX[0],$key)
	ar	$TY,$TX[0]
	ahi	$XX[0],1
	nill	$TY,255
	nill	$XX[0],255
	llgc	$acc,0($inp)
	la	$inp,1($inp)
	llgc	$TY,2($TY,$key)
	llgc	$TX[0],2($XX[0],$key)
	xr	$acc,$TY
	stc	$acc,0($out)
	la	$out,1($out)
	brct	$len,.Loop1

.Lexit:
	ahi	$XX[0],-1
	stc	$XX[0],0($key)
	stc	$YY,1($key)
	lm${g}	%r6,%r11,6*$SIZE_T($sp)
	br	$rp
.size	RC4,.-RC4
.string	"RC4 for s390x, CRYPTOGAMS by <appro\@openssl.org>"

___
}
160238384Sjkim
# void RC4_set_key(RC4_KEY *key,unsigned int len,const void *inp)
#
# Appends the key-schedule routine to $code; the symbol emitted is
# private_RC4_set_key. The routine clears the two state-index bytes,
# fills the 256-entry byte table at offset 2 with 0..255 (.L1stloop),
# then makes one pass over the table swapping entries under control of
# the key bytes (.L2ndloop), restarting from the first key byte each
# time len bytes have been consumed.
{
# Register roles used in the template below.
my ($cnt,$idx)       = ("%r0","%r1");
my ($key,$len,$inp)  = ("%r2","%r3","%r4");
my ($acc,$dat)       = ("%r5","%r6");
my ($ikey,$iinp)     = ("%r7","%r8");	# table position (-256..0), key byte offset

$code .= <<___;
.globl	private_RC4_set_key
.type	private_RC4_set_key,\@function
.align	64
private_RC4_set_key:
	stm${g}	%r6,%r8,6*$SIZE_T($sp)
	lhi	$cnt,256
	la	$idx,0(%r0)
	sth	$idx,0($key)
.align	4
.L1stloop:
	stc	$idx,2($idx,$key)
	la	$idx,1($idx)
	brct	$cnt,.L1stloop

	lghi	$ikey,-256
	lr	$cnt,$len
	la	$iinp,0(%r0)
	la	$idx,0(%r0)
.align	16
.L2ndloop:
	llgc	$acc,2+256($ikey,$key)
	llgc	$dat,0($iinp,$inp)
	la	$idx,0($idx,$acc)
	la	$ikey,1($ikey)
	la	$idx,0($idx,$dat)
	nill	$idx,255
	la	$iinp,1($iinp)
	tml	$ikey,255
	llgc	$dat,2($idx,$key)
	stc	$dat,2+256-1($ikey,$key)
	stc	$acc,2($idx,$key)
	jz	.Ldone
	brct	$cnt,.L2ndloop
	lr	$cnt,$len
	la	$iinp,0(%r0)
	j	.L2ndloop
.Ldone:
	lm${g}	%r6,%r8,6*$SIZE_T($sp)
	br	$rp
.size	private_RC4_set_key,.-private_RC4_set_key

___
}
217238384Sjkim
# const char *RC4_options()
#
# Appends RC4_options(), which returns the address of the constant
# string "rc4(8x,char)" placed in .rodata.
# NOTE(review): .Loptions is defined before the ".align 8" directive, so
# it labels the string only if .rodata is already 8-byte aligned at that
# point; left as-is to keep the generated assembly unchanged.
$code.=<<___;
.globl	RC4_options
.type	RC4_options,\@function
.align	16
RC4_options:
	larl	%r2,.Loptions
	br	%r14
.size	RC4_options,.-RC4_options
.section	.rodata
.Loptions:
.align	8
.string	"rc4(8x,char)"
___

# Emit the accumulated assembly. Output is buffered, so a write failure
# (disk full, closed pipe) may only be reported by close; check it so a
# truncated output file does not look like success.
print $code;
close STDOUT or die "error closing STDOUT: $!";	# force flush