sha1-s390x.pl revision 298998
10SN/A#!/usr/bin/env perl
21169Sjoehw
30SN/A# ====================================================================
40SN/A# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5623SN/A# project. The module is, however, dual licensed under OpenSSL and
6623SN/A# CRYPTOGAMS licenses depending on where you obtain it. For further
7623SN/A# details see http://www.openssl.org/~appro/cryptogams/.
8623SN/A# ====================================================================
9623SN/A
10623SN/A# SHA1 block procedure for s390x.
110SN/A
120SN/A# April 2007.
130SN/A#
# Performance is >30% better than gcc 3.3 generated code. But the real
# twist is that SHA1 hardware support is detected and utilized, in
# which case performance can improve by a further >4.5x for larger chunks.
170SN/A
180SN/A# January 2009.
190SN/A#
# Optimize Xupdate to reduce the number of memory references and
# reschedule instructions to favour the dual-issue z10 pipeline. On z10,
# hardware is "only" ~2.3x faster than software.
230SN/A
240SN/A# November 2010.
250SN/A#
# Adapt for -m31 build. If the kernel supports what's called the
# "highgprs" feature on Linux [see /proc/cpuinfo], it's possible to use
# 64-bit instructions and achieve "64-bit" performance even in a 31-bit
# legacy application context. The feature is not specific to any
# particular processor, as long as it's a "z-CPU". The latter implies
# that the code remains z/Architecture specific.
320SN/A
# ---------------------------------------------------------------------
# Generator configuration: command-line handling, register allocation
# and stack-frame layout. Everything below is a file-level global that
# the code-emitting subs and heredocs interpolate.
# ---------------------------------------------------------------------

$kimdfunc=1;	# magic function code for kimd instruction; a false
		# value makes the generator leave out the kimd fast path

# First command-line argument selects the target flavour. Anything
# matching /3[12]/ gets 4-byte pointers and unsuffixed mnemonics; every
# other value gets 8-byte pointers and the "g" suffix that is spliced
# into mnemonics such as st${g}, stm${g}, l${g}, lm${g} and brct${g}.
$flavour = shift;

if ($flavour =~ /3[12]/) {
	$SIZE_T=4;
	$g="";
} else {
	$SIZE_T=8;
	$g="g";
}

# Consume the remaining arguments until one looks like a bare file name
# ("name.ext", no directory part); that argument is the output file.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}

# Redirect STDOUT only when an output name was actually found, and fail
# loudly when it cannot be opened. An unconditional, unchecked
# two-argument open would silently leave STDOUT closed in either case
# and the generated assembly would be lost.
if ($output) {
	open(STDOUT,'>',$output) or die "can't open $output: $!";
}

# Round-constant registers. $K names whichever of the two is in use for
# the rounds currently being generated; it starts out as $K_00_39.
$K_00_39="%r0";
$K=$K_00_39;
$K_40_79="%r1";

# Argument registers. $prefetch deliberately shares %r2 with $ctx: the
# software path stores $ctx to the stack before %r2 is reused.
$ctx="%r2";
$prefetch="%r2";
$inp="%r3";
$len="%r4";

# Working variables A..E; @V is rotated by the round loops so that each
# round sees them in the right roles.
$A="%r5";
$B="%r6";
$C="%r7";
$D="%r8";
$E="%r9";
@V=($A,$B,$C,$D,$E);

# Scratch registers and the three-register window used by Xupdate.
$t0="%r10";
$t1="%r11";
@X=("%r12","%r13","%r14");
$sp="%r15";

# Frame layout: $stdframe is 16 pointer-sized slots plus 4*8 bytes
# (presumably the ABI's standard frame -- confirm against the s390x
# ELF ABI); the extra 16*4 bytes hold the 16-word X[] buffer that
# Xupdate addresses at $stdframe+4*n off $sp.
$stdframe=16*$SIZE_T+4*8;
$frame=$stdframe+16*4;
660SN/A
# Xupdate($i) -- append the message-schedule code for round $i to $code.
#
# Code is emitted on even rounds only, using 64-bit loads/stores, so one
# call covers two consecutive schedule words. Rounds below 16 load the
# words from the input block; later rounds derive them from the X[]
# buffer kept on the stack at $stdframe($sp). Words are written back to
# that buffer for rounds up to 70. On every even round the register
# window @X is rotated by one position (last element moves to front).
sub Xupdate {
	my $round = shift;

	# Round 15 is odd, so all it contributes is the warm-up that
	# primes $prefetch and $X[0] for the first real update (round 16).
	if ($round == 15) {
		$code .= <<___;
	lg	$prefetch,$stdframe($sp)	### Xupdate(16) warm-up
	lr	$X[0],$X[2]
___
	}

	# Xupdate is vectorized and executed every 2nd cycle
	return if ($round & 1);

	if ($round < 16) {
		# Plain load of two input words; $X[1] gets the halves swapped.
		$code .= <<___;
	lg	$X[0],`$round*4`($inp)	### Xload($round)
	rllg	$X[1],$X[0],32
___
	} else {
		# XOR of the prefetched and stored words, then a 1-bit rotate
		# applied to each 32-bit half in turn.
		$code .= <<___;
	xgr	$X[0],$prefetch		### Xupdate($round)
	lg	$prefetch,`$stdframe+4*(($round+2)%16)`($sp)
	xg	$X[0],`$stdframe+4*(($round+8)%16)`($sp)
	xgr	$X[0],$prefetch
	rll	$X[0],$X[0],1
	rllg	$X[1],$X[0],32
	rll	$X[1],$X[1],1
	rllg	$X[0],$X[1],32
	lr	$X[2],$X[1]		# feedback
___
	}

	# No store is emitted for rounds after 70.
	if ($round <= 70) {
		$code .= <<___;
	stg	$X[0],`$stdframe+4*($round%16)`($sp)
___
	}

	unshift(@X,pop(@X));
}
950SN/A
# BODY_00_19($i,$a,$b,$c,$d,$e) -- append the code for one of rounds
# 0..19 to $code. The five register names are the working variables in
# their roles for this round. Emitted computation:
#	e += K + rol(a,5) + X + (((d ^ c) & b) ^ d);  b = rol(b,30)
sub BODY_00_19 {
	my ($round,$va,$vb,$vc,$vd,$ve) = @_;

	# Must be sampled before Xupdate(), which may rotate @X.
	my $w = $X[1];

	Xupdate($round);
	$code .= <<___;
	alr	$ve,$K		### $round
	rll	$t1,$va,5
	lr	$t0,$vd
	xr	$t0,$vc
	alr	$ve,$t1
	nr	$t0,$vb
	alr	$ve,$w
	xr	$t0,$vd
	rll	$vb,$vb,30
	alr	$ve,$t0
___
}
1140SN/A
# BODY_20_39($i,$a,$b,$c,$d,$e) -- append the code for one parity round
# to $code (the caller uses it for rounds 20..39 and again for 60..79).
# Emitted computation:
#	e += K + rol(a,5) + X + (b ^ c ^ d);  b = rol(b,30)
sub BODY_20_39 {
	my ($round,$va,$vb,$vc,$vd,$ve) = @_;

	# Must be sampled before Xupdate(), which may rotate @X.
	my $w = $X[1];

	Xupdate($round);
	$code .= <<___;
	alr	$ve,$K		### $round
	rll	$t1,$va,5
	lr	$t0,$vb
	alr	$ve,$t1
	xr	$t0,$vc
	alr	$ve,$w
	xr	$t0,$vd
	rll	$vb,$vb,30
	alr	$ve,$t0
___
}
132
# BODY_40_59($i,$a,$b,$c,$d,$e) -- append the code for one of rounds
# 40..59 to $code. Emitted computation:
#	e += K + rol(a,5) + X + (((b | c) & d) | (b & c));  b = rol(b,30)
# $t1 first holds rol(a,5) and is then reused for the (b & c) term.
sub BODY_40_59 {
	my ($round,$va,$vb,$vc,$vd,$ve) = @_;

	# Must be sampled before Xupdate(), which may rotate @X.
	my $w = $X[1];

	Xupdate($round);
	$code .= <<___;
	alr	$ve,$K		### $round
	rll	$t1,$va,5
	lr	$t0,$vb
	alr	$ve,$t1
	or	$t0,$vc
	lr	$t1,$vb
	nr	$t0,$vd
	nr	$t1,$vc
	alr	$ve,$w
	or	$t0,$t1
	rll	$vb,$vb,30
	alr	$ve,$t0
___
}
153
# Constant table and function entry point. Ktable holds the four SHA-1
# round constants followed by 48 bytes of padding.
$code .= <<___;
.text
.align	64
.type	Ktable,\@object
Ktable: .long	0x5a827999,0x6ed9eba1,0x8f1bbcdc,0xca62c1d6
	.skip	48	#.long	0,0,0,0,0,0,0,0,0,0,0,0
.size	Ktable,.-Ktable
.globl	sha1_block_data_order
.type	sha1_block_data_order,\@function
sha1_block_data_order:
___

# Hardware fast path, generated only when $kimdfunc is set. It tests
# the capability words in OPENSSL_s390xcap_P and falls through to
# .Lsoftware when either check fails; otherwise it sets up the kimd
# operands (function code in %r0, $ctx in %r1, $inp in %r2, $len
# shifted left by 6 in %r3) and re-issues kimd until the condition
# code no longer reports partial completion.
if ($kimdfunc) {
	$code .= <<___;
	larl	%r1,OPENSSL_s390xcap_P
	lg	%r0,0(%r1)
	tmhl	%r0,0x4000	# check for message-security assist
	jz	.Lsoftware
	lg	%r0,16(%r1)	# check kimd capabilities
	tmhh	%r0,`0x8000>>$kimdfunc`
	jz	.Lsoftware
	lghi	%r0,$kimdfunc
	lgr	%r1,$ctx
	lgr	%r2,$inp
	sllg	%r3,$len,6
	.long	0xb93e0002	# kimd %r0,%r2
	brc	1,.-4		# pay attention to "partial completion"
	br	%r14
.align	16
.Lsoftware:
___
}
# Software path, prologue: save $ctx and %r6-%r15 in the caller's
# frame, allocate $frame bytes, store the old stack pointer at the new
# frame base, load the five state words from $ctx and the two
# round-constant registers from Ktable.
$code .= <<___;
	lghi	%r1,-$frame
	st${g}	$ctx,`2*$SIZE_T`($sp)
	stm${g}	%r6,%r15,`6*$SIZE_T`($sp)
	lgr	%r0,$sp
	la	$sp,0(%r1,$sp)
	st${g}	%r0,0($sp)

	larl	$t0,Ktable
	llgf	$A,0($ctx)
	llgf	$B,4($ctx)
	llgf	$C,8($ctx)
	llgf	$D,12($ctx)
	llgf	$E,16($ctx)

	lg	$K_00_39,0($t0)
	lg	$K_40_79,8($t0)

.Lloop:
	rllg	$K_00_39,$K_00_39,32
___

# 80 unrolled rounds. After each round @V is rotated (last element to
# the front) so the next round sees the working variables in their new
# roles. Each constant register is rotated by 32 bits ahead of every
# group of 20 rounds that uses it.
foreach my $round (0..19) {
	BODY_00_19($round,@V);
	unshift(@V,pop(@V));
}

$code .= <<___;
	rllg	$K_00_39,$K_00_39,32
___
foreach my $round (20..39) {
	BODY_20_39($round,@V);
	unshift(@V,pop(@V));
}

# Rounds 40..79 take their constant from the second register.
$K = $K_40_79;
$code .= <<___;
	rllg	$K_40_79,$K_40_79,32
___
foreach my $round (40..59) {
	BODY_40_59($round,@V);
	unshift(@V,pop(@V));
}

$code .= <<___;
	rllg	$K_40_79,$K_40_79,32
___
foreach my $round (60..79) {
	BODY_20_39($round,@V);
	unshift(@V,pop(@V));
}

# Per-block epilogue: reload $ctx from its save slot (%r2 was reused as
# $prefetch), advance $inp by 64, add the previous state into A..E and
# store it back, then loop while $len blocks remain. Finally restore
# %r6-%r15 and return.
$code .= <<___;

	l${g}	$ctx,`$frame+2*$SIZE_T`($sp)
	la	$inp,64($inp)
	al	$A,0($ctx)
	al	$B,4($ctx)
	al	$C,8($ctx)
	al	$D,12($ctx)
	al	$E,16($ctx)
	st	$A,0($ctx)
	st	$B,4($ctx)
	st	$C,8($ctx)
	st	$D,12($ctx)
	st	$E,16($ctx)
	brct${g} $len,.Lloop

	lm${g}	%r6,%r15,`$frame+6*$SIZE_T`($sp)
	br	%r14
.size	sha1_block_data_order,.-sha1_block_data_order
.string	"SHA1 block transform for s390x, CRYPTOGAMS by <appro\@openssl.org>"
.comm	OPENSSL_s390xcap_P,80,8
___
239
# Replace every `expr` placeholder in the generated text with the value
# of evaluating expr as Perl (offsets such as `$frame+2*$SIZE_T` were
# interpolated to plain arithmetic when the heredocs were built).
$code =~ s/\`([^\`]*)\`/eval $1/gem;

print $code;

# Output is buffered, so a write failure (full disk, I/O error) may only
# be reported here. Treat it as fatal rather than leaving a truncated
# assembly file behind with a successful exit status.
close STDOUT or die "error closing STDOUT: $!";
244