#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# I let hardware handle unaligned input(*), except on page boundaries
# (see below for details). Otherwise straightforward implementation
# with X vector in register bank.
#
# (*) this means that this module is inappropriate for PPC403? Does
#     anybody know if pre-POWER3 can sustain unaligned load?

# 			-m64	-m32
# ----------------------------------
# PPC970,gcc-4.0.0	+76%	+59%
# Power6,xlc-7		+68%	+33%

$flavour = shift;

if ($flavour =~ /64/) {
	$SIZE_T	=8;
	$LRSAVE	=2*$SIZE_T;
	$UCMP	="cmpld";
	$STU	="stdu";
	$POP	="ld";
	$PUSH	="std";
} elsif ($flavour =~ /32/) {
	$SIZE_T	=4;
	$LRSAVE	=$SIZE_T;
	$UCMP	="cmplw";
	$STU	="stwu";
	$POP	="lwz";
	$PUSH	="stw";
} else { die "nonsense $flavour"; }

# Define endianness based on flavour,
# e.g.: linux64le
$LITTLE_ENDIAN = ($flavour=~/le$/) ? $SIZE_T : 0;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

open STDOUT,"| $^X $xlate $flavour ".shift or die "can't call $xlate: $!";

$FRAME=24*$SIZE_T+64;
$LOCALS=6*$SIZE_T;
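
# The register save slots used by the prologue/epilogue below follow the
# pattern FRAME-SIZE_T*(32-n) for GPR n (r15..r31); the helper below is a
# hypothetical cross-check only, never called by the generator.  The input
# pointer is additionally spilled at FRAME-SIZE_T*18 on the unaligned path,
# and the 64-byte copy buffer for page-crossing blocks sits at LOCALS.
sub _save_slot { my $gpr = shift; return $FRAME - $SIZE_T*(32 - $gpr); }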

$K  ="r0";
$sp ="r1";
$toc="r2";
$ctx="r3";
$inp="r4";
$num="r5";
$t0 ="r15";
$t1 ="r6";

$A  ="r7";
$B  ="r8";
$C  ="r9";
$D  ="r10";
$E  ="r11";
$T  ="r12";

@V=($A,$B,$C,$D,$E,$T);
@X=("r16","r17","r18","r19","r20","r21","r22","r23",
    "r24","r25","r26","r27","r28","r29","r30","r31");

sub loadbe {
my ($dst, $src, $temp_reg) = @_;
$code.=<<___ if (!$LITTLE_ENDIAN);
	lwz	$dst,$src
___
$code.=<<___ if ($LITTLE_ENDIAN);
	lwz	$temp_reg,$src
	rotlwi	$dst,$temp_reg,8
	rlwimi	$dst,$temp_reg,24,0,7
	rlwimi	$dst,$temp_reg,24,16,23
___
}
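
# A small Perl model of the little-endian path above (illustrative only,
# never called by the generator): rotlwi by 8 moves byte 1 into the top
# position, and the two rlwimi instructions then overwrite bytes 0 and 2
# with bytes 3 and 1 respectively, i.e. the sequence amounts to a full
# 32-bit byte swap of the loaded word.
sub _rotl32_ref { my ($x,$n)=@_; return (($x<<$n)|($x>>(32-$n))) & 0xffffffff; }

sub _loadbe_model {
	my $src = shift;
	my $dst = _rotl32_ref($src,8);				# rotlwi dst,src,8
	my $r24 = _rotl32_ref($src,24);
	$dst = ($dst & 0x00ffffff) | ($r24 & 0xff000000);	# rlwimi dst,src,24,0,7
	$dst = ($dst & 0xffff00ff) | ($r24 & 0x0000ff00);	# rlwimi dst,src,24,16,23
	return $dst;						# 0xaabbccdd -> 0xddccbbaa
}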

sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e,$f)=@_;
my $j=$i+1;

	# Since the last value of $f is discarded, we can use
	# it as a temp reg to swap byte-order when needed.
	loadbe("@X[$i]","`$i*4`($inp)",$f) if ($i==0);
	loadbe("@X[$j]","`$j*4`($inp)",$f) if ($i<15);
$code.=<<___ if ($i<15);
	add	$f,$K,$e
	rotlwi	$e,$a,5
	add	$f,$f,@X[$i]
	and	$t0,$c,$b
	add	$f,$f,$e
	andc	$t1,$d,$b
	rotlwi	$b,$b,30
	or	$t0,$t0,$t1
	add	$f,$f,$t0
___
$code.=<<___ if ($i>=15);
	add	$f,$K,$e
	rotlwi	$e,$a,5
	xor	@X[$j%16],@X[$j%16],@X[($j+2)%16]
	add	$f,$f,@X[$i%16]
	and	$t0,$c,$b
	xor	@X[$j%16],@X[$j%16],@X[($j+8)%16]
	add	$f,$f,$e
	andc	$t1,$d,$b
	rotlwi	$b,$b,30
	or	$t0,$t0,$t1
	xor	@X[$j%16],@X[$j%16],@X[($j+13)%16]
	add	$f,$f,$t0
	rotlwi	@X[$j%16],@X[$j%16],1
___
}
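
# A Perl restatement of one round as emitted above (illustrative only,
# reusing _rotl32_ref from the loadbe model): Ch(b,c,d) = (b&c)|(~b&d) is
# built from and/andc/or, and from round 15 on the next schedule word
# W[j] = rotl32(W[j-16]^W[j-14]^W[j-8]^W[j-3],1) is updated in the same
# 16-entry circular buffer that the xor/rotlwi sequence maintains.
sub _round_00_19_ref {
	my ($W,$i,$a,$b,$c,$d,$e) = @_;		# $W: ref to 16-word circular buffer
	my $f = (_rotl32_ref($a,5) + (($b&$c)|(~$b&$d)) + $e
		 + $W->[$i%16] + 0x5a827999) & 0xffffffff;
	$W->[($i+1)%16] = _rotl32_ref($W->[($i+1)%16] ^ $W->[($i+3)%16]
				    ^ $W->[($i+9)%16] ^ $W->[($i+14)%16], 1)
		if ($i >= 15);
	return ($f,$a,_rotl32_ref($b,30),$c,$d);	# next round's (a,b,c,d,e)
}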

sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e,$f)=@_;
my $j=$i+1;
$code.=<<___ if ($i<79);
	add	$f,$K,$e
	xor	$t0,$b,$d
	rotlwi	$e,$a,5
	xor	@X[$j%16],@X[$j%16],@X[($j+2)%16]
	add	$f,$f,@X[$i%16]
	xor	$t0,$t0,$c
	xor	@X[$j%16],@X[$j%16],@X[($j+8)%16]
	add	$f,$f,$t0
	rotlwi	$b,$b,30
	xor	@X[$j%16],@X[$j%16],@X[($j+13)%16]
	add	$f,$f,$e
	rotlwi	@X[$j%16],@X[$j%16],1
___
$code.=<<___ if ($i==79);
	add	$f,$K,$e
	xor	$t0,$b,$d
	rotlwi	$e,$a,5
	lwz	r16,0($ctx)
	add	$f,$f,@X[$i%16]
	xor	$t0,$t0,$c
	lwz	r17,4($ctx)
	add	$f,$f,$t0
	rotlwi	$b,$b,30
	lwz	r18,8($ctx)
	lwz	r19,12($ctx)
	add	$f,$f,$e
	lwz	r20,16($ctx)
___
}
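
# The same kind of sketch for the parity rounds (again never called, and
# again reusing _rotl32_ref): rounds 20..39 use K = 0x6ed9eba1 and rounds
# 60..79 reuse the identical F with K = 0xca62c1d6; round 79 above merely
# skips the schedule update and interleaves the context re-loads instead.
sub _round_20_39_ref {
	my ($W,$i,$Kconst,$a,$b,$c,$d,$e) = @_;
	my $f = (_rotl32_ref($a,5) + ($b^$c^$d) + $e
		 + $W->[$i%16] + $Kconst) & 0xffffffff;
	$W->[($i+1)%16] = _rotl32_ref($W->[($i+1)%16] ^ $W->[($i+3)%16]
				    ^ $W->[($i+9)%16] ^ $W->[($i+14)%16], 1);
	return ($f,$a,_rotl32_ref($b,30),$c,$d);
}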

sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e,$f)=@_;
my $j=$i+1;
$code.=<<___;
	add	$f,$K,$e
	rotlwi	$e,$a,5
	xor	@X[$j%16],@X[$j%16],@X[($j+2)%16]
	add	$f,$f,@X[$i%16]
	and	$t0,$b,$c
	xor	@X[$j%16],@X[$j%16],@X[($j+8)%16]
	add	$f,$f,$e
	or	$t1,$b,$c
	rotlwi	$b,$b,30
	xor	@X[$j%16],@X[$j%16],@X[($j+13)%16]
	and	$t1,$t1,$d
	or	$t0,$t0,$t1
	rotlwi	@X[$j%16],@X[$j%16],1
	add	$f,$f,$t0
___
}
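
# And one more such sketch for rounds 40..59: Maj(b,c,d) is formed as
# (b&c)|((b|c)&d), mirroring the and/or/and/or sequence above, with
# K = 0x8f1bbcdc.  Purely illustrative, never invoked.
sub _round_40_59_ref {
	my ($W,$i,$a,$b,$c,$d,$e) = @_;
	my $f = (_rotl32_ref($a,5) + (($b&$c)|(($b|$c)&$d)) + $e
		 + $W->[$i%16] + 0x8f1bbcdc) & 0xffffffff;
	$W->[($i+1)%16] = _rotl32_ref($W->[($i+1)%16] ^ $W->[($i+3)%16]
				    ^ $W->[($i+9)%16] ^ $W->[($i+14)%16], 1);
	return ($f,$a,_rotl32_ref($b,30),$c,$d);
}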

$code=<<___;
.machine	"any"
.text

.globl	.sha1_block_data_order
.align	4
.sha1_block_data_order:
	$STU	$sp,-$FRAME($sp)
	mflr	r0
	$PUSH	r15,`$FRAME-$SIZE_T*17`($sp)
	$PUSH	r16,`$FRAME-$SIZE_T*16`($sp)
	$PUSH	r17,`$FRAME-$SIZE_T*15`($sp)
	$PUSH	r18,`$FRAME-$SIZE_T*14`($sp)
	$PUSH	r19,`$FRAME-$SIZE_T*13`($sp)
	$PUSH	r20,`$FRAME-$SIZE_T*12`($sp)
	$PUSH	r21,`$FRAME-$SIZE_T*11`($sp)
	$PUSH	r22,`$FRAME-$SIZE_T*10`($sp)
	$PUSH	r23,`$FRAME-$SIZE_T*9`($sp)
	$PUSH	r24,`$FRAME-$SIZE_T*8`($sp)
	$PUSH	r25,`$FRAME-$SIZE_T*7`($sp)
	$PUSH	r26,`$FRAME-$SIZE_T*6`($sp)
	$PUSH	r27,`$FRAME-$SIZE_T*5`($sp)
	$PUSH	r28,`$FRAME-$SIZE_T*4`($sp)
	$PUSH	r29,`$FRAME-$SIZE_T*3`($sp)
	$PUSH	r30,`$FRAME-$SIZE_T*2`($sp)
	$PUSH	r31,`$FRAME-$SIZE_T*1`($sp)
	$PUSH	r0,`$FRAME+$LRSAVE`($sp)
	lwz	$A,0($ctx)
	lwz	$B,4($ctx)
	lwz	$C,8($ctx)
	lwz	$D,12($ctx)
	lwz	$E,16($ctx)
	andi.	r0,$inp,3
	bne	Lunaligned
Laligned:
	mtctr	$num
	bl	Lsha1_block_private
	b	Ldone

; The PowerPC specification allows an implementation to be ill-behaved
; upon an unaligned access that crosses a page boundary, so "better
; safe than sorry" dictates treating that case specially. Rather than
; looking for the particular offending word, I look for a 64-byte
; input block that crosses the boundary; once found, that block is
; copied to aligned storage and hashed separately...
.align	4
Lunaligned:
	subfic	$t1,$inp,4096
	andi.	$t1,$t1,4095	; distance to closest page boundary
	srwi.	$t1,$t1,6	; t1/=64
	beq	Lcross_page
	$UCMP	$num,$t1
	ble	Laligned	; didn't cross the page boundary
	mtctr	$t1
	subfc	$num,$t1,$num
	bl	Lsha1_block_private
Lcross_page:
	li	$t1,16
	mtctr	$t1
	addi	r20,$sp,$LOCALS	; spot within the frame
Lmemcpy:
	lbz	r16,0($inp)
	lbz	r17,1($inp)
	lbz	r18,2($inp)
	lbz	r19,3($inp)
	addi	$inp,$inp,4
	stb	r16,0(r20)
	stb	r17,1(r20)
	stb	r18,2(r20)
	stb	r19,3(r20)
	addi	r20,r20,4
	bdnz	Lmemcpy

	$PUSH	$inp,`$FRAME-$SIZE_T*18`($sp)
	li	$t1,1
	addi	$inp,$sp,$LOCALS
	mtctr	$t1
	bl	Lsha1_block_private
	$POP	$inp,`$FRAME-$SIZE_T*18`($sp)
	addic.	$num,$num,-1
	bne	Lunaligned

Ldone:
	$POP	r0,`$FRAME+$LRSAVE`($sp)
	$POP	r15,`$FRAME-$SIZE_T*17`($sp)
	$POP	r16,`$FRAME-$SIZE_T*16`($sp)
	$POP	r17,`$FRAME-$SIZE_T*15`($sp)
	$POP	r18,`$FRAME-$SIZE_T*14`($sp)
	$POP	r19,`$FRAME-$SIZE_T*13`($sp)
	$POP	r20,`$FRAME-$SIZE_T*12`($sp)
	$POP	r21,`$FRAME-$SIZE_T*11`($sp)
	$POP	r22,`$FRAME-$SIZE_T*10`($sp)
	$POP	r23,`$FRAME-$SIZE_T*9`($sp)
	$POP	r24,`$FRAME-$SIZE_T*8`($sp)
	$POP	r25,`$FRAME-$SIZE_T*7`($sp)
	$POP	r26,`$FRAME-$SIZE_T*6`($sp)
	$POP	r27,`$FRAME-$SIZE_T*5`($sp)
	$POP	r28,`$FRAME-$SIZE_T*4`($sp)
	$POP	r29,`$FRAME-$SIZE_T*3`($sp)
	$POP	r30,`$FRAME-$SIZE_T*2`($sp)
	$POP	r31,`$FRAME-$SIZE_T*1`($sp)
	mtlr	r0
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,4,1,0x80,18,3,0
	.long	0
___
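
# A minimal Perl restatement of the Lunaligned arithmetic above (a sketch
# only; $addr stands for the numeric input address): how many complete
# 64-byte blocks fit before the next 4096-byte page boundary.  Zero means
# the very next block would straddle the boundary, so it is copied
# byte-by-byte into the on-stack buffer by Lmemcpy and hashed from there.
sub _blocks_before_page_boundary {
	my $addr = shift;
	return ((4096 - ($addr & 4095)) & 4095) >> 6;	# subfic/andi./srwi. above
}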

# This is the private block function, which uses a tailored calling
# convention: upon entry the SHA_CTX words are pre-loaded into the
# registers above and the counter register holds the number of
# 64-byte blocks to digest...
$code.=<<___;
.align	4
Lsha1_block_private:
___
$code.=<<___;	# load K_00_19
	lis	$K,0x5a82
	ori	$K,$K,0x7999
___
for($i=0;$i<20;$i++)	{ &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;	# load K_20_39
	lis	$K,0x6ed9
	ori	$K,$K,0xeba1
___
for(;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;	# load K_40_59
	lis	$K,0x8f1b
	ori	$K,$K,0xbcdc
___
for(;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;	# load K_60_79
	lis	$K,0xca62
	ori	$K,$K,0xc1d6
___
for(;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
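
# Cross-check of the constants materialized above (a note only, not fed
# into the output): each lis/ori pair composes a 32-bit value as
# (high16 << 16) | low16, yielding the four standard SHA-1 round constants
# 0x5a827999, 0x6ed9eba1, 0x8f1bbcdc and 0xca62c1d6.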
$code.=<<___;
	add	r16,r16,$E
	add	r17,r17,$T
	add	r18,r18,$A
	add	r19,r19,$B
	add	r20,r20,$C
	stw	r16,0($ctx)
	mr	$A,r16
	stw	r17,4($ctx)
	mr	$B,r17
	stw	r18,8($ctx)
	mr	$C,r18
	stw	r19,12($ctx)
	mr	$D,r19
	stw	r20,16($ctx)
	mr	$E,r20
	addi	$inp,$inp,`16*4`
	bdnz	Lsha1_block_private
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0
.size	.sha1_block_data_order,.-.sha1_block_data_order
___
$code.=<<___;
.asciz	"SHA1 block transform for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>"
___

$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;