1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# SHA256/512 block procedure for PA-RISC.
11
12# June 2009.
13#
14# SHA256 performance is >75% better than gcc 3.2 generated code on
15# PA-7100LC. Compared to code generated by vendor compiler this
16# implementation is almost 70% faster in 64-bit build, but delivers
17# virtually same performance in 32-bit build on PA-8600.
18#
19# SHA512 performance is >2.9x better than gcc 3.2 generated code on
20# PA-7100LC, PA-RISC 1.1 processor. Then implementation detects if the
21# code is executed on PA-RISC 2.0 processor and switches to 64-bit
22# code path delivering adequate performance even in "blended" 32-bit
23# build. Though 64-bit code is not any faster than code generated by
24# vendor compiler on PA-8600...
25#
26# Special thanks to polarhome.com for providing HP-UX account.
27
28$flavour = shift;
29$output = shift;
30open STDOUT,">$output";
31
32if ($flavour =~ /64/) {
33	$LEVEL		="2.0W";
34	$SIZE_T		=8;
35	$FRAME_MARKER	=80;
36	$SAVED_RP	=16;
37	$PUSH		="std";
38	$PUSHMA		="std,ma";
39	$POP		="ldd";
40	$POPMB		="ldd,mb";
41} else {
42	$LEVEL		="1.0";
43	$SIZE_T		=4;
44	$FRAME_MARKER	=48;
45	$SAVED_RP	=20;
46	$PUSH		="stw";
47	$PUSHMA		="stwm";
48	$POP		="ldw";
49	$POPMB		="ldwm";
50}
51
52if ($output =~ /512/) {
53	$func="sha512_block_data_order";
54	$SZ=8;
55	@Sigma0=(28,34,39);
56	@Sigma1=(14,18,41);
57	@sigma0=(1,  8, 7);
58	@sigma1=(19,61, 6);
59	$rounds=80;
60	$LAST10BITS=0x017;
61	$LD="ldd";
62	$LDM="ldd,ma";
63	$ST="std";
64} else {
65	$func="sha256_block_data_order";
66	$SZ=4;
67	@Sigma0=( 2,13,22);
68	@Sigma1=( 6,11,25);
69	@sigma0=( 7,18, 3);
70	@sigma1=(17,19,10);
71	$rounds=64;
72	$LAST10BITS=0x0f2;
73	$LD="ldw";
74	$LDM="ldwm";
75	$ST="stw";
76}
77
78$FRAME=16*$SIZE_T+$FRAME_MARKER;# 16 saved regs + frame marker
79				#                 [+ argument transfer]
80$XOFF=16*$SZ+32;		# local variables
81$FRAME+=$XOFF;
82$XOFF+=$FRAME_MARKER;		# distance between %sp and local variables
83
84$ctx="%r26";	# zapped by $a0
85$inp="%r25";	# zapped by $a1
86$num="%r24";	# zapped by $t0
87
88$a0 ="%r26";
89$a1 ="%r25";
90$t0 ="%r24";
91$t1 ="%r29";
92$Tbl="%r31";
93
94@V=($A,$B,$C,$D,$E,$F,$G,$H)=("%r17","%r18","%r19","%r20","%r21","%r22","%r23","%r28");
95
96@X=("%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8",
97    "%r9", "%r10","%r11","%r12","%r13","%r14","%r15","%r16",$inp);
98
99sub ROUND_00_15 {
100my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
101$code.=<<___;
102	_ror	$e,$Sigma1[0],$a0
103	and	$f,$e,$t0
104	_ror	$e,$Sigma1[1],$a1
105	addl	$t1,$h,$h
106	andcm	$g,$e,$t1
107	xor	$a1,$a0,$a0
108	_ror	$a1,`$Sigma1[2]-$Sigma1[1]`,$a1
109	or	$t0,$t1,$t1		; Ch(e,f,g)
110	addl	@X[$i%16],$h,$h
111	xor	$a0,$a1,$a1		; Sigma1(e)
112	addl	$t1,$h,$h
113	_ror	$a,$Sigma0[0],$a0
114	addl	$a1,$h,$h
115
116	_ror	$a,$Sigma0[1],$a1
117	and	$a,$b,$t0
118	and	$a,$c,$t1
119	xor	$a1,$a0,$a0
120	_ror	$a1,`$Sigma0[2]-$Sigma0[1]`,$a1
121	xor	$t1,$t0,$t0
122	and	$b,$c,$t1
123	xor	$a0,$a1,$a1		; Sigma0(a)
124	addl	$h,$d,$d
125	xor	$t1,$t0,$t0		; Maj(a,b,c)
126	`"$LDM	$SZ($Tbl),$t1" if ($i<15)`
127	addl	$a1,$h,$h
128	addl	$t0,$h,$h
129
130___
131}
132
133sub ROUND_16_xx {
134my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
135$i-=16;
136$code.=<<___;
137	_ror	@X[($i+1)%16],$sigma0[0],$a0
138	_ror	@X[($i+1)%16],$sigma0[1],$a1
139	addl	@X[($i+9)%16],@X[$i],@X[$i]
140	_ror	@X[($i+14)%16],$sigma1[0],$t0
141	_ror	@X[($i+14)%16],$sigma1[1],$t1
142	xor	$a1,$a0,$a0
143	_shr	@X[($i+1)%16],$sigma0[2],$a1
144	xor	$t1,$t0,$t0
145	_shr	@X[($i+14)%16],$sigma1[2],$t1
146	xor	$a1,$a0,$a0		; sigma0(X[(i+1)&0x0f])
147	xor	$t1,$t0,$t0		; sigma1(X[(i+14)&0x0f])
148	$LDM	$SZ($Tbl),$t1
149	addl	$a0,@X[$i],@X[$i]
150	addl	$t0,@X[$i],@X[$i]
151___
152$code.=<<___ if ($i==15);
153	extru	$t1,31,10,$a1
154	comiclr,<> $LAST10BITS,$a1,%r0
155	ldo	1($Tbl),$Tbl		; signal end of $Tbl
156___
157&ROUND_00_15($i+16,$a,$b,$c,$d,$e,$f,$g,$h);
158}
159
160$code=<<___;
161	.LEVEL	$LEVEL
162	.text
163
164	.section .rodata
165	.ALIGN	64
166L\$table
167___
168$code.=<<___ if ($SZ==8);
169	.WORD	0x428a2f98,0xd728ae22,0x71374491,0x23ef65cd
170	.WORD	0xb5c0fbcf,0xec4d3b2f,0xe9b5dba5,0x8189dbbc
171	.WORD	0x3956c25b,0xf348b538,0x59f111f1,0xb605d019
172	.WORD	0x923f82a4,0xaf194f9b,0xab1c5ed5,0xda6d8118
173	.WORD	0xd807aa98,0xa3030242,0x12835b01,0x45706fbe
174	.WORD	0x243185be,0x4ee4b28c,0x550c7dc3,0xd5ffb4e2
175	.WORD	0x72be5d74,0xf27b896f,0x80deb1fe,0x3b1696b1
176	.WORD	0x9bdc06a7,0x25c71235,0xc19bf174,0xcf692694
177	.WORD	0xe49b69c1,0x9ef14ad2,0xefbe4786,0x384f25e3
178	.WORD	0x0fc19dc6,0x8b8cd5b5,0x240ca1cc,0x77ac9c65
179	.WORD	0x2de92c6f,0x592b0275,0x4a7484aa,0x6ea6e483
180	.WORD	0x5cb0a9dc,0xbd41fbd4,0x76f988da,0x831153b5
181	.WORD	0x983e5152,0xee66dfab,0xa831c66d,0x2db43210
182	.WORD	0xb00327c8,0x98fb213f,0xbf597fc7,0xbeef0ee4
183	.WORD	0xc6e00bf3,0x3da88fc2,0xd5a79147,0x930aa725
184	.WORD	0x06ca6351,0xe003826f,0x14292967,0x0a0e6e70
185	.WORD	0x27b70a85,0x46d22ffc,0x2e1b2138,0x5c26c926
186	.WORD	0x4d2c6dfc,0x5ac42aed,0x53380d13,0x9d95b3df
187	.WORD	0x650a7354,0x8baf63de,0x766a0abb,0x3c77b2a8
188	.WORD	0x81c2c92e,0x47edaee6,0x92722c85,0x1482353b
189	.WORD	0xa2bfe8a1,0x4cf10364,0xa81a664b,0xbc423001
190	.WORD	0xc24b8b70,0xd0f89791,0xc76c51a3,0x0654be30
191	.WORD	0xd192e819,0xd6ef5218,0xd6990624,0x5565a910
192	.WORD	0xf40e3585,0x5771202a,0x106aa070,0x32bbd1b8
193	.WORD	0x19a4c116,0xb8d2d0c8,0x1e376c08,0x5141ab53
194	.WORD	0x2748774c,0xdf8eeb99,0x34b0bcb5,0xe19b48a8
195	.WORD	0x391c0cb3,0xc5c95a63,0x4ed8aa4a,0xe3418acb
196	.WORD	0x5b9cca4f,0x7763e373,0x682e6ff3,0xd6b2b8a3
197	.WORD	0x748f82ee,0x5defb2fc,0x78a5636f,0x43172f60
198	.WORD	0x84c87814,0xa1f0ab72,0x8cc70208,0x1a6439ec
199	.WORD	0x90befffa,0x23631e28,0xa4506ceb,0xde82bde9
200	.WORD	0xbef9a3f7,0xb2c67915,0xc67178f2,0xe372532b
201	.WORD	0xca273ece,0xea26619c,0xd186b8c7,0x21c0c207
202	.WORD	0xeada7dd6,0xcde0eb1e,0xf57d4f7f,0xee6ed178
203	.WORD	0x06f067aa,0x72176fba,0x0a637dc5,0xa2c898a6
204	.WORD	0x113f9804,0xbef90dae,0x1b710b35,0x131c471b
205	.WORD	0x28db77f5,0x23047d84,0x32caab7b,0x40c72493
206	.WORD	0x3c9ebe0a,0x15c9bebc,0x431d67c4,0x9c100d4c
207	.WORD	0x4cc5d4be,0xcb3e42b6,0x597f299c,0xfc657e2a
208	.WORD	0x5fcb6fab,0x3ad6faec,0x6c44198c,0x4a475817
209___
210$code.=<<___ if ($SZ==4);
211	.WORD	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
212	.WORD	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
213	.WORD	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
214	.WORD	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
215	.WORD	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
216	.WORD	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
217	.WORD	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
218	.WORD	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
219	.WORD	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
220	.WORD	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
221	.WORD	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
222	.WORD	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
223	.WORD	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
224	.WORD	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
225	.WORD	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
226	.WORD	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
227___
228$code.=<<___;
229	.previous
230
231	.EXPORT	$func,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
232	.ALIGN	64
233$func
234	.PROC
235	.CALLINFO	FRAME=`$FRAME-16*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=18
236	.ENTRY
237	$PUSH	%r2,-$SAVED_RP(%sp)	; standard prologue
238	$PUSHMA	%r3,$FRAME(%sp)
239	$PUSH	%r4,`-$FRAME+1*$SIZE_T`(%sp)
240	$PUSH	%r5,`-$FRAME+2*$SIZE_T`(%sp)
241	$PUSH	%r6,`-$FRAME+3*$SIZE_T`(%sp)
242	$PUSH	%r7,`-$FRAME+4*$SIZE_T`(%sp)
243	$PUSH	%r8,`-$FRAME+5*$SIZE_T`(%sp)
244	$PUSH	%r9,`-$FRAME+6*$SIZE_T`(%sp)
245	$PUSH	%r10,`-$FRAME+7*$SIZE_T`(%sp)
246	$PUSH	%r11,`-$FRAME+8*$SIZE_T`(%sp)
247	$PUSH	%r12,`-$FRAME+9*$SIZE_T`(%sp)
248	$PUSH	%r13,`-$FRAME+10*$SIZE_T`(%sp)
249	$PUSH	%r14,`-$FRAME+11*$SIZE_T`(%sp)
250	$PUSH	%r15,`-$FRAME+12*$SIZE_T`(%sp)
251	$PUSH	%r16,`-$FRAME+13*$SIZE_T`(%sp)
252	$PUSH	%r17,`-$FRAME+14*$SIZE_T`(%sp)
253	$PUSH	%r18,`-$FRAME+15*$SIZE_T`(%sp)
254
255	_shl	$num,`log(16*$SZ)/log(2)`,$num
256	addl	$inp,$num,$num		; $num to point at the end of $inp
257
258	$PUSH	$num,`-$FRAME_MARKER-4*$SIZE_T`(%sp)	; save arguments
259	$PUSH	$inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp)
260	$PUSH	$ctx,`-$FRAME_MARKER-2*$SIZE_T`(%sp)
261
262#ifdef __PIC__
263	addil	LT'L\$table, %r19
264	ldw	RT'L\$table(%r1), $Tbl
265#else
266	ldil	L'L\$table, %t1
267	ldo	R'L\$table(%t1), $Tbl
268#endif
269___
270$code.=<<___ if ($SZ==8 && $SIZE_T==4);
271#ifndef __OpenBSD__
272___
273$code.=<<___ if ($SZ==8 && $SIZE_T==4);
274	ldi	31,$t1
275	mtctl	$t1,%cr11
276	extrd,u,*= $t1,%sar,1,$t1	; executes on PA-RISC 1.0
277	b	L\$parisc1
278	nop
279___
280$code.=<<___;
281	$LD	`0*$SZ`($ctx),$A	; load context
282	$LD	`1*$SZ`($ctx),$B
283	$LD	`2*$SZ`($ctx),$C
284	$LD	`3*$SZ`($ctx),$D
285	$LD	`4*$SZ`($ctx),$E
286	$LD	`5*$SZ`($ctx),$F
287	$LD	`6*$SZ`($ctx),$G
288	$LD	`7*$SZ`($ctx),$H
289
290	extru	$inp,31,`log($SZ)/log(2)`,$t0
291	sh3addl	$t0,%r0,$t0
292	subi	`8*$SZ`,$t0,$t0
293	mtctl	$t0,%cr11		; load %sar with align factor
294
295L\$oop
296	ldi	`$SZ-1`,$t0
297	$LDM	$SZ($Tbl),$t1
298	andcm	$inp,$t0,$t0		; align $inp
299___
300	for ($i=0;$i<15;$i++) {		# load input block
301	$code.="\t$LD	`$SZ*$i`($t0),@X[$i]\n";		}
302$code.=<<___;
303	cmpb,*=	$inp,$t0,L\$aligned
304	$LD	`$SZ*15`($t0),@X[15]
305	$LD	`$SZ*16`($t0),@X[16]
306___
307	for ($i=0;$i<16;$i++) {		# align data
308	$code.="\t_align	@X[$i],@X[$i+1],@X[$i]\n";	}
309$code.=<<___;
310L\$aligned
311	nop	; otherwise /usr/ccs/bin/as is confused by below .WORD
312___
313
314for($i=0;$i<16;$i++)	{ &ROUND_00_15($i,@V); unshift(@V,pop(@V)); }
315$code.=<<___;
316L\$rounds
317	nop	; otherwise /usr/ccs/bin/as is confused by below .WORD
318___
319for(;$i<32;$i++)	{ &ROUND_16_xx($i,@V); unshift(@V,pop(@V)); }
320$code.=<<___;
321	bb,>=	$Tbl,31,L\$rounds	; end of $Tbl signalled?
322	nop
323
324	$POP	`-$FRAME_MARKER-2*$SIZE_T`(%sp),$ctx	; restore arguments
325	$POP	`-$FRAME_MARKER-3*$SIZE_T`(%sp),$inp
326	$POP	`-$FRAME_MARKER-4*$SIZE_T`(%sp),$num
327	ldo	`-$rounds*$SZ-1`($Tbl),$Tbl		; rewind $Tbl
328
329	$LD	`0*$SZ`($ctx),@X[0]	; load context
330	$LD	`1*$SZ`($ctx),@X[1]
331	$LD	`2*$SZ`($ctx),@X[2]
332	$LD	`3*$SZ`($ctx),@X[3]
333	$LD	`4*$SZ`($ctx),@X[4]
334	$LD	`5*$SZ`($ctx),@X[5]
335	addl	@X[0],$A,$A
336	$LD	`6*$SZ`($ctx),@X[6]
337	addl	@X[1],$B,$B
338	$LD	`7*$SZ`($ctx),@X[7]
339	ldo	`16*$SZ`($inp),$inp	; advance $inp
340
341	$ST	$A,`0*$SZ`($ctx)	; save context
342	addl	@X[2],$C,$C
343	$ST	$B,`1*$SZ`($ctx)
344	addl	@X[3],$D,$D
345	$ST	$C,`2*$SZ`($ctx)
346	addl	@X[4],$E,$E
347	$ST	$D,`3*$SZ`($ctx)
348	addl	@X[5],$F,$F
349	$ST	$E,`4*$SZ`($ctx)
350	addl	@X[6],$G,$G
351	$ST	$F,`5*$SZ`($ctx)
352	addl	@X[7],$H,$H
353	$ST	$G,`6*$SZ`($ctx)
354	$ST	$H,`7*$SZ`($ctx)
355
356	cmpb,*<>,n $inp,$num,L\$oop
357	$PUSH	$inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp)	; save $inp
358___
359if ($SZ==8 && $SIZE_T==4)	# SHA512 for 32-bit PA-RISC 1.0
360{{
361$code.=<<___;
362	b	L\$done
363	nop
364
365	.ALIGN	64
366L\$parisc1
367___
368$code.=<<___ if ($SZ==8 && $SIZE_T==4);
369#endif
370___
371
372@V=(  $Ahi,  $Alo,  $Bhi,  $Blo,  $Chi,  $Clo,  $Dhi,  $Dlo,
373      $Ehi,  $Elo,  $Fhi,  $Flo,  $Ghi,  $Glo,  $Hhi,  $Hlo) =
374   ( "%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8",
375     "%r9","%r10","%r11","%r12","%r13","%r14","%r15","%r16");
376$a0 ="%r17";
377$a1 ="%r18";
378$a2 ="%r19";
379$a3 ="%r20";
380$t0 ="%r21";
381$t1 ="%r22";
382$t2 ="%r28";
383$t3 ="%r29";
384$Tbl="%r31";
385
386@X=("%r23","%r24","%r25","%r26");	# zaps $num,$inp,$ctx
387
388sub ROUND_00_15_pa1 {
389my ($i,$ahi,$alo,$bhi,$blo,$chi,$clo,$dhi,$dlo,
390       $ehi,$elo,$fhi,$flo,$ghi,$glo,$hhi,$hlo,$flag)=@_;
391my ($Xhi,$Xlo,$Xnhi,$Xnlo) = @X;
392
393$code.=<<___ if (!$flag);
394	ldw	`-$XOFF+8*(($i+1)%16)`(%sp),$Xnhi
395	ldw	`-$XOFF+8*(($i+1)%16)+4`(%sp),$Xnlo	; load X[i+1]
396___
397$code.=<<___;
398	shd	$ehi,$elo,$Sigma1[0],$t0
399	 add	$Xlo,$hlo,$hlo
400	shd	$elo,$ehi,$Sigma1[0],$t1
401	 addc	$Xhi,$hhi,$hhi		; h += X[i]
402	shd	$ehi,$elo,$Sigma1[1],$t2
403	 ldwm	8($Tbl),$Xhi
404	shd	$elo,$ehi,$Sigma1[1],$t3
405	 ldw	-4($Tbl),$Xlo		; load K[i]
406	xor	$t2,$t0,$t0
407	xor	$t3,$t1,$t1
408	 and	$flo,$elo,$a0
409	 and	$fhi,$ehi,$a1
410	shd	$ehi,$elo,$Sigma1[2],$t2
411	 andcm	$glo,$elo,$a2
412	shd	$elo,$ehi,$Sigma1[2],$t3
413	 andcm	$ghi,$ehi,$a3
414	xor	$t2,$t0,$t0
415	xor	$t3,$t1,$t1		; Sigma1(e)
416	add	$Xlo,$hlo,$hlo
417	 xor	$a2,$a0,$a0
418	addc	$Xhi,$hhi,$hhi		; h += K[i]
419	 xor	$a3,$a1,$a1		; Ch(e,f,g)
420
421	 add	$t0,$hlo,$hlo
422	shd	$ahi,$alo,$Sigma0[0],$t0
423	 addc	$t1,$hhi,$hhi		; h += Sigma1(e)
424	shd	$alo,$ahi,$Sigma0[0],$t1
425	 add	$a0,$hlo,$hlo
426	shd	$ahi,$alo,$Sigma0[1],$t2
427	 addc	$a1,$hhi,$hhi		; h += Ch(e,f,g)
428	shd	$alo,$ahi,$Sigma0[1],$t3
429
430	xor	$t2,$t0,$t0
431	xor	$t3,$t1,$t1
432	shd	$ahi,$alo,$Sigma0[2],$t2
433	and	$alo,$blo,$a0
434	shd	$alo,$ahi,$Sigma0[2],$t3
435	and	$ahi,$bhi,$a1
436	xor	$t2,$t0,$t0
437	xor	$t3,$t1,$t1		; Sigma0(a)
438
439	and	$alo,$clo,$a2
440	and	$ahi,$chi,$a3
441	xor	$a2,$a0,$a0
442	 add	$hlo,$dlo,$dlo
443	xor	$a3,$a1,$a1
444	 addc	$hhi,$dhi,$dhi		; d += h
445	and	$blo,$clo,$a2
446	 add	$t0,$hlo,$hlo
447	and	$bhi,$chi,$a3
448	 addc	$t1,$hhi,$hhi		; h += Sigma0(a)
449	xor	$a2,$a0,$a0
450	 add	$a0,$hlo,$hlo
451	xor	$a3,$a1,$a1		; Maj(a,b,c)
452	 addc	$a1,$hhi,$hhi		; h += Maj(a,b,c)
453
454___
455$code.=<<___ if ($i==15 && $flag);
456	extru	$Xlo,31,10,$Xlo
457	comiclr,= $LAST10BITS,$Xlo,%r0
458	b	L\$rounds_pa1
459	nop
460___
461push(@X,shift(@X)); push(@X,shift(@X));
462}
463
464sub ROUND_16_xx_pa1 {
465my ($Xhi,$Xlo,$Xnhi,$Xnlo) = @X;
466my ($i)=shift;
467$i-=16;
468$code.=<<___;
469	ldw	`-$XOFF+8*(($i+1)%16)`(%sp),$Xnhi
470	ldw	`-$XOFF+8*(($i+1)%16)+4`(%sp),$Xnlo	; load X[i+1]
471	ldw	`-$XOFF+8*(($i+9)%16)`(%sp),$a1
472	ldw	`-$XOFF+8*(($i+9)%16)+4`(%sp),$a0	; load X[i+9]
473	ldw	`-$XOFF+8*(($i+14)%16)`(%sp),$a3
474	ldw	`-$XOFF+8*(($i+14)%16)+4`(%sp),$a2	; load X[i+14]
475	shd	$Xnhi,$Xnlo,$sigma0[0],$t0
476	shd	$Xnlo,$Xnhi,$sigma0[0],$t1
477	 add	$a0,$Xlo,$Xlo
478	shd	$Xnhi,$Xnlo,$sigma0[1],$t2
479	 addc	$a1,$Xhi,$Xhi
480	shd	$Xnlo,$Xnhi,$sigma0[1],$t3
481	xor	$t2,$t0,$t0
482	shd	$Xnhi,$Xnlo,$sigma0[2],$t2
483	xor	$t3,$t1,$t1
484	extru	$Xnhi,`31-$sigma0[2]`,`32-$sigma0[2]`,$t3
485	xor	$t2,$t0,$t0
486	 shd	$a3,$a2,$sigma1[0],$a0
487	xor	$t3,$t1,$t1		; sigma0(X[i+1)&0x0f])
488	 shd	$a2,$a3,$sigma1[0],$a1
489	add	$t0,$Xlo,$Xlo
490	 shd	$a3,$a2,$sigma1[1],$t2
491	addc	$t1,$Xhi,$Xhi
492	 shd	$a2,$a3,$sigma1[1],$t3
493	xor	$t2,$a0,$a0
494	shd	$a3,$a2,$sigma1[2],$t2
495	xor	$t3,$a1,$a1
496	extru	$a3,`31-$sigma1[2]`,`32-$sigma1[2]`,$t3
497	xor	$t2,$a0,$a0
498	xor	$t3,$a1,$a1		; sigma0(X[i+14)&0x0f])
499	add	$a0,$Xlo,$Xlo
500	addc	$a1,$Xhi,$Xhi
501
502	stw	$Xhi,`-$XOFF+8*($i%16)`(%sp)
503	stw	$Xlo,`-$XOFF+8*($i%16)+4`(%sp)
504___
505&ROUND_00_15_pa1($i,@_,1);
506}
507$code.=<<___;
508	ldw	`0*4`($ctx),$Ahi		; load context
509	ldw	`1*4`($ctx),$Alo
510	ldw	`2*4`($ctx),$Bhi
511	ldw	`3*4`($ctx),$Blo
512	ldw	`4*4`($ctx),$Chi
513	ldw	`5*4`($ctx),$Clo
514	ldw	`6*4`($ctx),$Dhi
515	ldw	`7*4`($ctx),$Dlo
516	ldw	`8*4`($ctx),$Ehi
517	ldw	`9*4`($ctx),$Elo
518	ldw	`10*4`($ctx),$Fhi
519	ldw	`11*4`($ctx),$Flo
520	ldw	`12*4`($ctx),$Ghi
521	ldw	`13*4`($ctx),$Glo
522	ldw	`14*4`($ctx),$Hhi
523	ldw	`15*4`($ctx),$Hlo
524
525	extru	$inp,31,2,$t0
526	sh3addl	$t0,%r0,$t0
527	subi	32,$t0,$t0
528	mtctl	$t0,%cr11		; load %sar with align factor
529
530L\$oop_pa1
531	extru	$inp,31,2,$a3
532	comib,=	0,$a3,L\$aligned_pa1
533	sub	$inp,$a3,$inp
534
535	ldw	`0*4`($inp),$X[0]
536	ldw	`1*4`($inp),$X[1]
537	ldw	`2*4`($inp),$t2
538	ldw	`3*4`($inp),$t3
539	ldw	`4*4`($inp),$a0
540	ldw	`5*4`($inp),$a1
541	ldw	`6*4`($inp),$a2
542	ldw	`7*4`($inp),$a3
543	vshd	$X[0],$X[1],$X[0]
544	vshd	$X[1],$t2,$X[1]
545	stw	$X[0],`-$XOFF+0*4`(%sp)
546	ldw	`8*4`($inp),$t0
547	vshd	$t2,$t3,$t2
548	stw	$X[1],`-$XOFF+1*4`(%sp)
549	ldw	`9*4`($inp),$t1
550	vshd	$t3,$a0,$t3
551___
552{
553my @t=($t2,$t3,$a0,$a1,$a2,$a3,$t0,$t1);
554for ($i=2;$i<=(128/4-8);$i++) {
555$code.=<<___;
556	stw	$t[0],`-$XOFF+$i*4`(%sp)
557	ldw	`(8+$i)*4`($inp),$t[0]
558	vshd	$t[1],$t[2],$t[1]
559___
560push(@t,shift(@t));
561}
562for (;$i<(128/4-1);$i++) {
563$code.=<<___;
564	stw	$t[0],`-$XOFF+$i*4`(%sp)
565	vshd	$t[1],$t[2],$t[1]
566___
567push(@t,shift(@t));
568}
569$code.=<<___;
570	b	L\$collected_pa1
571	stw	$t[0],`-$XOFF+$i*4`(%sp)
572
573___
574}
575$code.=<<___;
576L\$aligned_pa1
577	ldw	`0*4`($inp),$X[0]
578	ldw	`1*4`($inp),$X[1]
579	ldw	`2*4`($inp),$t2
580	ldw	`3*4`($inp),$t3
581	ldw	`4*4`($inp),$a0
582	ldw	`5*4`($inp),$a1
583	ldw	`6*4`($inp),$a2
584	ldw	`7*4`($inp),$a3
585	stw	$X[0],`-$XOFF+0*4`(%sp)
586	ldw	`8*4`($inp),$t0
587	stw	$X[1],`-$XOFF+1*4`(%sp)
588	ldw	`9*4`($inp),$t1
589___
590{
591my @t=($t2,$t3,$a0,$a1,$a2,$a3,$t0,$t1);
592for ($i=2;$i<(128/4-8);$i++) {
593$code.=<<___;
594	stw	$t[0],`-$XOFF+$i*4`(%sp)
595	ldw	`(8+$i)*4`($inp),$t[0]
596___
597push(@t,shift(@t));
598}
599for (;$i<128/4;$i++) {
600$code.=<<___;
601	stw	$t[0],`-$XOFF+$i*4`(%sp)
602___
603push(@t,shift(@t));
604}
605$code.="L\$collected_pa1\n";
606}
607
608for($i=0;$i<16;$i++)	{ &ROUND_00_15_pa1($i,@V); unshift(@V,pop(@V)); unshift(@V,pop(@V)); }
609$code.="L\$rounds_pa1\n";
610for(;$i<32;$i++)	{ &ROUND_16_xx_pa1($i,@V); unshift(@V,pop(@V)); unshift(@V,pop(@V)); }
611
612$code.=<<___;
613	$POP	`-$FRAME_MARKER-2*$SIZE_T`(%sp),$ctx	; restore arguments
614	$POP	`-$FRAME_MARKER-3*$SIZE_T`(%sp),$inp
615	$POP	`-$FRAME_MARKER-4*$SIZE_T`(%sp),$num
616	ldo	`-$rounds*$SZ`($Tbl),$Tbl		; rewind $Tbl
617
618	ldw	`0*4`($ctx),$t1		; update context
619	ldw	`1*4`($ctx),$t0
620	ldw	`2*4`($ctx),$t3
621	ldw	`3*4`($ctx),$t2
622	ldw	`4*4`($ctx),$a1
623	ldw	`5*4`($ctx),$a0
624	ldw	`6*4`($ctx),$a3
625	add	$t0,$Alo,$Alo
626	ldw	`7*4`($ctx),$a2
627	addc	$t1,$Ahi,$Ahi
628	ldw	`8*4`($ctx),$t1
629	add	$t2,$Blo,$Blo
630	ldw	`9*4`($ctx),$t0
631	addc	$t3,$Bhi,$Bhi
632	ldw	`10*4`($ctx),$t3
633	add	$a0,$Clo,$Clo
634	ldw	`11*4`($ctx),$t2
635	addc	$a1,$Chi,$Chi
636	ldw	`12*4`($ctx),$a1
637	add	$a2,$Dlo,$Dlo
638	ldw	`13*4`($ctx),$a0
639	addc	$a3,$Dhi,$Dhi
640	ldw	`14*4`($ctx),$a3
641	add	$t0,$Elo,$Elo
642	ldw	`15*4`($ctx),$a2
643	addc	$t1,$Ehi,$Ehi
644	stw	$Ahi,`0*4`($ctx)
645	add	$t2,$Flo,$Flo
646	stw	$Alo,`1*4`($ctx)
647	addc	$t3,$Fhi,$Fhi
648	stw	$Bhi,`2*4`($ctx)
649	add	$a0,$Glo,$Glo
650	stw	$Blo,`3*4`($ctx)
651	addc	$a1,$Ghi,$Ghi
652	stw	$Chi,`4*4`($ctx)
653	add	$a2,$Hlo,$Hlo
654	stw	$Clo,`5*4`($ctx)
655	addc	$a3,$Hhi,$Hhi
656	stw	$Dhi,`6*4`($ctx)
657	ldo	`16*$SZ`($inp),$inp	; advance $inp
658	stw	$Dlo,`7*4`($ctx)
659	stw	$Ehi,`8*4`($ctx)
660	stw	$Elo,`9*4`($ctx)
661	stw	$Fhi,`10*4`($ctx)
662	stw	$Flo,`11*4`($ctx)
663	stw	$Ghi,`12*4`($ctx)
664	stw	$Glo,`13*4`($ctx)
665	stw	$Hhi,`14*4`($ctx)
666	comb,=	$inp,$num,L\$done
667	stw	$Hlo,`15*4`($ctx)
668	b	L\$oop_pa1
669	$PUSH	$inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp)	; save $inp
670L\$done
671___
672}}
673$code.=<<___;
674	$POP	`-$FRAME-$SAVED_RP`(%sp),%r2		; standard epilogue
675	$POP	`-$FRAME+1*$SIZE_T`(%sp),%r4
676	$POP	`-$FRAME+2*$SIZE_T`(%sp),%r5
677	$POP	`-$FRAME+3*$SIZE_T`(%sp),%r6
678	$POP	`-$FRAME+4*$SIZE_T`(%sp),%r7
679	$POP	`-$FRAME+5*$SIZE_T`(%sp),%r8
680	$POP	`-$FRAME+6*$SIZE_T`(%sp),%r9
681	$POP	`-$FRAME+7*$SIZE_T`(%sp),%r10
682	$POP	`-$FRAME+8*$SIZE_T`(%sp),%r11
683	$POP	`-$FRAME+9*$SIZE_T`(%sp),%r12
684	$POP	`-$FRAME+10*$SIZE_T`(%sp),%r13
685	$POP	`-$FRAME+11*$SIZE_T`(%sp),%r14
686	$POP	`-$FRAME+12*$SIZE_T`(%sp),%r15
687	$POP	`-$FRAME+13*$SIZE_T`(%sp),%r16
688	$POP	`-$FRAME+14*$SIZE_T`(%sp),%r17
689	$POP	`-$FRAME+15*$SIZE_T`(%sp),%r18
690	bv	(%r2)
691	.EXIT
692	$POPMB	-$FRAME(%sp),%r3
693	.PROCEND
694___
695
696# Explicitly encode PA-RISC 2.0 instructions used in this module, so
697# that it can be compiled with .LEVEL 1.0. It should be noted that I
698# wouldn't have to do this, if GNU assembler understood .ALLOW 2.0
699# directive...
700
701my $ldd = sub {
702  my ($mod,$args) = @_;
703  my $orig = "ldd$mod\t$args";
704
705    if ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 3 suffices
706    {	my $opcode=(0x14<<26)|($2<<21)|($3<<16)|(($1&0x1FF8)<<1)|(($1>>13)&1);
707	$opcode|=(1<<3) if ($mod =~ /^,m/);
708	$opcode|=(1<<2) if ($mod =~ /^,mb/);
709	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
710    }
711    else { "\t".$orig; }
712};
713
714my $std = sub {
715  my ($mod,$args) = @_;
716  my $orig = "std$mod\t$args";
717
718    if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 3 suffices
719    {	my $opcode=(0x1c<<26)|($3<<21)|($1<<16)|(($2&0x1FF8)<<1)|(($2>>13)&1);
720	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
721    }
722    else { "\t".$orig; }
723};
724
725my $extrd = sub {
726  my ($mod,$args) = @_;
727  my $orig = "extrd$mod\t$args";
728
729    # I only have ",u" completer, it's implicitly encoded...
730    if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/)	# format 15
731    {	my $opcode=(0x36<<26)|($1<<21)|($4<<16);
732	my $len=32-$3;
733	$opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5);		# encode pos
734	$opcode |= (($len&0x20)<<7)|($len&0x1f);		# encode len
735	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
736    }
737    elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/)	# format 12
738    {	my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9);
739	my $len=32-$2;
740	$opcode |= (($len&0x20)<<3)|($len&0x1f);		# encode len
741	$opcode |= (1<<13) if ($mod =~ /,\**=/);
742	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
743    }
744    else { "\t".$orig; }
745};
746
747my $shrpd = sub {
748  my ($mod,$args) = @_;
749  my $orig = "shrpd$mod\t$args";
750
751    if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/)	# format 14
752    {	my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4;
753	my $cpos=63-$3;
754	$opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5);		# encode sa
755	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
756    }
757    elsif ($args =~ /%r([0-9]+),%r([0-9]+),%sar,%r([0-9]+)/)	# format 11
758    {	sprintf "\t.WORD\t0x%08x\t; %s",
759		(0x34<<26)|($2<<21)|($1<<16)|(1<<9)|$3,$orig;
760    }
761    else { "\t".$orig; }
762};
763
764sub assemble {
765  my ($mnemonic,$mod,$args)=@_;
766  my $opcode = eval("\$$mnemonic");
767
768    ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args";
769}
770
771foreach (split("\n",$code)) {
772	s/\`([^\`]*)\`/eval $1/ge;
773
774	s/shd\s+(%r[0-9]+),(%r[0-9]+),([0-9]+)/
775		$3>31 ? sprintf("shd\t%$2,%$1,%d",$3-32)	# rotation for >=32
776		:       sprintf("shd\t%$1,%$2,%d",$3)/e			or
777	# translate made up instructons: _ror, _shr, _align, _shl
778	s/_ror(\s+)(%r[0-9]+),/
779		($SZ==4 ? "shd" : "shrpd")."$1$2,$2,"/e			or
780
781	s/_shr(\s+%r[0-9]+),([0-9]+),/
782		$SZ==4 ? sprintf("extru%s,%d,%d,",$1,31-$2,32-$2)
783		:        sprintf("extrd,u%s,%d,%d,",$1,63-$2,64-$2)/e	or
784
785	s/_align(\s+%r[0-9]+,%r[0-9]+),/
786		($SZ==4 ? "vshd$1," : "shrpd$1,%sar,")/e		or
787
788	s/_shl(\s+%r[0-9]+),([0-9]+),/
789		$SIZE_T==4 ? sprintf("zdep%s,%d,%d,",$1,31-$2,32-$2)
790		:            sprintf("depd,z%s,%d,%d,",$1,63-$2,64-$2)/e;
791
792	s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e if ($SIZE_T==4);
793
794	s/cmpb,\*/comb,/ if ($SIZE_T==4);
795
796	s/\bbv\b/bve/    if ($SIZE_T==8);
797
798	print $_,"\n";
799}
800
801close STDOUT;
802