ghash-parisc.pl revision 279264
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# April 2010
11#
12# The module implements "4-bit" GCM GHASH function and underlying
13# single multiplication operation in GF(2^128). "4-bit" means that it
14# uses 256 bytes per-key table [+128 bytes shared table]. On PA-7100LC
15# it processes one byte in 19.6 cycles, which is more than twice as
16# fast as code generated by gcc 3.2. PA-RISC 2.0 loop is scheduled for
17# 8 cycles, but measured performance on PA-8600 system is ~9 cycles per
18# processed byte. This is ~2.2x faster than 64-bit code generated by
19# vendor compiler (which used to be very hard to beat:-).
20#
21# Special thanks to polarhome.com for providing HP-UX account.
22
# Command line: <flavour> [<output file>].
$flavour = shift;
$output = shift;
# Redirect STDOUT only when an output file was actually named, and stop
# on failure instead of carrying on with an unchecked handle.  The
# three-argument form keeps characters in the file name from being
# interpreted as part of the open mode.
if (defined($output) && length($output)) {
	open(STDOUT,">",$output) or die "can't open $output: $!";
}

# ABI parameters; a flavour containing "64" selects the 64-bit set.
#   $LEVEL         operand of the .LEVEL directive
#   $SIZE_T        size in bytes of one saved-register stack slot
#   $FRAME_MARKER  value reported as FRAME= in .CALLINFO
#   $SAVED_RP      %r2 is stored at -$SAVED_RP(%sp) on entry
#   $PUSH/$PUSHMA  store mnemonics used by the prologues
#   $POP/$POPMB    load mnemonics used by the epilogues
#   $NREGS         ENTRY_GR value reported by gcm_gmult_4bit
if ($flavour =~ /64/) {
	$LEVEL		="2.0W";
	$SIZE_T		=8;
	$FRAME_MARKER	=80;
	$SAVED_RP	=16;
	$PUSH		="std";
	$PUSHMA		="std,ma";
	$POP		="ldd";
	$POPMB		="ldd,mb";
	$NREGS		=6;
} else {
	$LEVEL		="1.0";	#"\n\t.ALLOW\t2.0";
	$SIZE_T		=4;
	$FRAME_MARKER	=48;
	$SAVED_RP	=20;
	$PUSH		="stw";
	$PUSHMA		="stwm";
	$POP		="ldw";
	$POPMB		="ldwm";
	$NREGS		=11;
}

# Stack adjustment made by the prologues: ten $SIZE_T-sized slots on top
# of the frame marker.
$FRAME=10*$SIZE_T+$FRAME_MARKER;# NREGS saved regs + frame marker
				#                 [+ argument transfer]
51
################# volatile registers
# Argument block: Xi, Htable, input pointer and length.
($Xi,$Htbl,$inp,$len)=("%r26","%r25","%r24","%r23");
# Working variables; $Hhh deliberately shares the register of $Htbl.
$Hhh=$Htbl;
($Hll,$Zhh,$Zll,$cnt)=("%r22","%r21","%r20","%r19");
($rem_4bit,$rem,$mask0xf0)=("%r28","%r29","%r31");

################# preserved registers
($Thh,$Tll,$nlo,$nhi,$byte)=("%r1","%r2","%r3","%r4","%r5");
# The extra halves below exist for the 32-bit flavour only; they stay
# undefined otherwise.
($Zhl,$Zlh,$Hhl,$Hlh,$Thl,$Tlh)=map { "%r$_" } (6..11)
	if ($SIZE_T==4);
# Used in PA-RISC 2.0 code; same register as $Zhl, which only the
# 1.0 code uses.
$rem2="%r6";
81
# gcm_gmult_4bit: assembler preamble, symbol export (two argument words)
# and the first part of the prologue.  %r2 is stored at -$SAVED_RP(%sp),
# %r3 is stored with the modifying $PUSHMA form using displacement
# $FRAME, and %r4-%r6 go into the following $SIZE_T-sized slots.
82$code.=<<___;
83	.LEVEL	$LEVEL
84	.SPACE	\$TEXT\$
85	.SUBSPA	\$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
86
87	.EXPORT	gcm_gmult_4bit,ENTRY,ARGW0=GR,ARGW1=GR
88	.ALIGN	64
89gcm_gmult_4bit
90	.PROC
91	.CALLINFO	FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=$NREGS
92	.ENTRY
93	$PUSH	%r2,-$SAVED_RP(%sp)	; standard prologue
94	$PUSHMA	%r3,$FRAME(%sp)
95	$PUSH	%r4,`-$FRAME+1*$SIZE_T`(%sp)
96	$PUSH	%r5,`-$FRAME+2*$SIZE_T`(%sp)
97	$PUSH	%r6,`-$FRAME+3*$SIZE_T`(%sp)
98___
# 32-bit flavour only: also store %r7-%r11, which the register map
# assigns to $Zlh, $Hhl, $Hlh, $Thl and $Tlh when $SIZE_T is 4.
99$code.=<<___ if ($SIZE_T==4);
100	$PUSH	%r7,`-$FRAME+4*$SIZE_T`(%sp)
101	$PUSH	%r8,`-$FRAME+5*$SIZE_T`(%sp)
102	$PUSH	%r9,`-$FRAME+6*$SIZE_T`(%sp)
103	$PUSH	%r10,`-$FRAME+7*$SIZE_T`(%sp)
104	$PUSH	%r11,`-$FRAME+8*$SIZE_T`(%sp)
105___
# Position-independent address of the L$rem_4bit table: blr leaves an
# address in $rem_4bit, andcm with the constant 3 clears its two low
# bits, and ldo adds the assembly-time distance between L$pic_gmult and
# L$rem_4bit.  Also loads the 0xf0 nibble mask.
# NOTE(review): addl turns $len into $inp+$len, but this routine is
# exported with only two argument words and never reads $len afterwards,
# so the sum looks unused here - confirm.
106$code.=<<___;
107	blr	%r0,$rem_4bit
108	ldi	3,$rem
109L\$pic_gmult
110	andcm	$rem_4bit,$rem,$rem_4bit
111	addl	$inp,$len,$len
112	ldo	L\$rem_4bit-L\$pic_gmult($rem_4bit),$rem_4bit
113	ldi	0xf0,$mask0xf0
114___
# 32-bit flavour only: run-time choice between the PA-RISC 2.0 code that
# follows and the 1.0 code at L$parisc1_gmult.
# NOTE(review): the extrd with the "*=" condition presumably nullifies
# the branch on 2.0 processors while acting differently on 1.0 ones (see
# the inline remark) - confirm against the architecture manual.
115$code.=<<___ if ($SIZE_T==4);
116	ldi	31,$rem
117	mtctl	$rem,%cr11
118	extrd,u,*= $rem,%sar,1,$rem	; executes on PA-RISC 1.0
119	b	L\$parisc1_gmult
120	nop
121___
122
# PA-RISC 2.0 path of gcm_gmult_4bit (emitted for both flavours), using
# the 64-bit ldd/shrpd/extrd/depd/std instructions on the pair
# $Zhh:$Zll.
#  - Bytes of Xi are fetched from offset 15 downwards: 15 and 14 up
#    front, the rest via ldbx with $cnt, which starts at 13 and is
#    decremented by addib.
#  - Each byte yields two table offsets: $nhi (byte masked with 0xf0)
#    and $nlo (low four bits deposited four positions up by depd,z).
#    They index the table at $Htbl through $Hhh (+0) and $Hll (+8).
#  - Before every 4-bit right shift of $Zhh:$Zll the low four bits of
#    $Zll are turned into an offset in $rem; the 8-byte L$rem_4bit entry
#    loaded from that offset is xor-ed into $Zhh.
#  - The result is written back to Xi with two std instructions.
123$code.=<<___;
124	ldb	15($Xi),$nlo
125	ldo	8($Htbl),$Hll
126
127	and	$mask0xf0,$nlo,$nhi
128	depd,z	$nlo,59,4,$nlo
129
130	ldd	$nlo($Hll),$Zll
131	ldd	$nlo($Hhh),$Zhh
132
133	depd,z	$Zll,60,4,$rem
134	shrpd	$Zhh,$Zll,4,$Zll
135	extrd,u	$Zhh,59,60,$Zhh
136	ldb	14($Xi),$nlo
137
138	ldd	$nhi($Hll),$Tll
139	ldd	$nhi($Hhh),$Thh
140	and	$mask0xf0,$nlo,$nhi
141	depd,z	$nlo,59,4,$nlo
142
143	xor	$Tll,$Zll,$Zll
144	xor	$Thh,$Zhh,$Zhh
145	ldd	$rem($rem_4bit),$rem
146	b	L\$oop_gmult_pa2
147	ldi	13,$cnt
148
149	.ALIGN	8
150L\$oop_gmult_pa2
151	xor	$rem,$Zhh,$Zhh		; moved here to work around gas bug
152	depd,z	$Zll,60,4,$rem
153
154	shrpd	$Zhh,$Zll,4,$Zll
155	extrd,u	$Zhh,59,60,$Zhh
156	ldd	$nlo($Hll),$Tll
157	ldd	$nlo($Hhh),$Thh
158
159	xor	$Tll,$Zll,$Zll
160	xor	$Thh,$Zhh,$Zhh
161	ldd	$rem($rem_4bit),$rem
162
163	xor	$rem,$Zhh,$Zhh
164	depd,z	$Zll,60,4,$rem
165	ldbx	$cnt($Xi),$nlo
166
167	shrpd	$Zhh,$Zll,4,$Zll
168	extrd,u	$Zhh,59,60,$Zhh
169	ldd	$nhi($Hll),$Tll
170	ldd	$nhi($Hhh),$Thh
171
172	and	$mask0xf0,$nlo,$nhi
173	depd,z	$nlo,59,4,$nlo
174	ldd	$rem($rem_4bit),$rem
175
176	xor	$Tll,$Zll,$Zll
177	addib,uv -1,$cnt,L\$oop_gmult_pa2
178	xor	$Thh,$Zhh,$Zhh
179
180	xor	$rem,$Zhh,$Zhh
181	depd,z	$Zll,60,4,$rem
182
183	shrpd	$Zhh,$Zll,4,$Zll
184	extrd,u	$Zhh,59,60,$Zhh
185	ldd	$nlo($Hll),$Tll
186	ldd	$nlo($Hhh),$Thh
187
188	xor	$Tll,$Zll,$Zll
189	xor	$Thh,$Zhh,$Zhh
190	ldd	$rem($rem_4bit),$rem
191
192	xor	$rem,$Zhh,$Zhh
193	depd,z	$Zll,60,4,$rem
194
195	shrpd	$Zhh,$Zll,4,$Zll
196	extrd,u	$Zhh,59,60,$Zhh
197	ldd	$nhi($Hll),$Tll
198	ldd	$nhi($Hhh),$Thh
199
200	xor	$Tll,$Zll,$Zll
201	xor	$Thh,$Zhh,$Zhh
202	ldd	$rem($rem_4bit),$rem
203
204	xor	$rem,$Zhh,$Zhh
205	std	$Zll,8($Xi)
206	std	$Zhh,0($Xi)
207___
208
# 32-bit flavour only.  First the 2.0 path above is terminated with a
# branch to L$done_gmult, then comes L$parisc1_gmult, the PA-RISC 1.0
# variant of the same computation built from 32-bit instructions:
#  - Z is kept in four words $Zhh, $Zhl, $Zlh, $Zll, shifted right by
#    four bits as a chain of shrpw plus a final extru on $Zhh.
#  - Table words are loaded with ldwx through $Hhh (+0), $Hhl (+4),
#    $Hlh (+8) and $Hll (+12) relative to $Htbl.
#  - $rem (low four bits of $Zll deposited by zdep) indexes L$rem_4bit
#    with ldwx and the loaded word is xor-ed into $Zhh.
#  - The result is written back to Xi with four stw instructions.
209$code.=<<___ if ($SIZE_T==4);
210	b	L\$done_gmult
211	nop
212
213L\$parisc1_gmult
214	ldb	15($Xi),$nlo
215	ldo	12($Htbl),$Hll
216	ldo	8($Htbl),$Hlh
217	ldo	4($Htbl),$Hhl
218
219	and	$mask0xf0,$nlo,$nhi
220	zdep	$nlo,27,4,$nlo
221
222	ldwx	$nlo($Hll),$Zll
223	ldwx	$nlo($Hlh),$Zlh
224	ldwx	$nlo($Hhl),$Zhl
225	ldwx	$nlo($Hhh),$Zhh
226	zdep	$Zll,28,4,$rem
227	ldb	14($Xi),$nlo
228	ldwx	$rem($rem_4bit),$rem
229	shrpw	$Zlh,$Zll,4,$Zll
230	ldwx	$nhi($Hll),$Tll
231	shrpw	$Zhl,$Zlh,4,$Zlh
232	ldwx	$nhi($Hlh),$Tlh
233	shrpw	$Zhh,$Zhl,4,$Zhl
234	ldwx	$nhi($Hhl),$Thl
235	extru	$Zhh,27,28,$Zhh
236	ldwx	$nhi($Hhh),$Thh
237	xor	$rem,$Zhh,$Zhh
238	and	$mask0xf0,$nlo,$nhi
239	zdep	$nlo,27,4,$nlo
240
241	xor	$Tll,$Zll,$Zll
242	ldwx	$nlo($Hll),$Tll
243	xor	$Tlh,$Zlh,$Zlh
244	ldwx	$nlo($Hlh),$Tlh
245	xor	$Thl,$Zhl,$Zhl
246	b	L\$oop_gmult_pa1
247	ldi	13,$cnt
248
249	.ALIGN	8
250L\$oop_gmult_pa1
251	zdep	$Zll,28,4,$rem
252	ldwx	$nlo($Hhl),$Thl
253	xor	$Thh,$Zhh,$Zhh
254	ldwx	$rem($rem_4bit),$rem
255	shrpw	$Zlh,$Zll,4,$Zll
256	ldwx	$nlo($Hhh),$Thh
257	shrpw	$Zhl,$Zlh,4,$Zlh
258	ldbx	$cnt($Xi),$nlo
259	xor	$Tll,$Zll,$Zll
260	ldwx	$nhi($Hll),$Tll
261	shrpw	$Zhh,$Zhl,4,$Zhl
262	xor	$Tlh,$Zlh,$Zlh
263	ldwx	$nhi($Hlh),$Tlh
264	extru	$Zhh,27,28,$Zhh
265	xor	$Thl,$Zhl,$Zhl
266	ldwx	$nhi($Hhl),$Thl
267	xor	$rem,$Zhh,$Zhh
268	zdep	$Zll,28,4,$rem
269	xor	$Thh,$Zhh,$Zhh
270	ldwx	$nhi($Hhh),$Thh
271	shrpw	$Zlh,$Zll,4,$Zll
272	ldwx	$rem($rem_4bit),$rem
273	shrpw	$Zhl,$Zlh,4,$Zlh
274	shrpw	$Zhh,$Zhl,4,$Zhl
275	and	$mask0xf0,$nlo,$nhi
276	extru	$Zhh,27,28,$Zhh
277	zdep	$nlo,27,4,$nlo
278	xor	$Tll,$Zll,$Zll
279	ldwx	$nlo($Hll),$Tll
280	xor	$Tlh,$Zlh,$Zlh
281	ldwx	$nlo($Hlh),$Tlh
282	xor	$rem,$Zhh,$Zhh
283	addib,uv -1,$cnt,L\$oop_gmult_pa1
284	xor	$Thl,$Zhl,$Zhl
285
286	zdep	$Zll,28,4,$rem
287	ldwx	$nlo($Hhl),$Thl
288	xor	$Thh,$Zhh,$Zhh
289	ldwx	$rem($rem_4bit),$rem
290	shrpw	$Zlh,$Zll,4,$Zll
291	ldwx	$nlo($Hhh),$Thh
292	shrpw	$Zhl,$Zlh,4,$Zlh
293	xor	$Tll,$Zll,$Zll
294	ldwx	$nhi($Hll),$Tll
295	shrpw	$Zhh,$Zhl,4,$Zhl
296	xor	$Tlh,$Zlh,$Zlh
297	ldwx	$nhi($Hlh),$Tlh
298	extru	$Zhh,27,28,$Zhh
299	xor	$rem,$Zhh,$Zhh
300	xor	$Thl,$Zhl,$Zhl
301	ldwx	$nhi($Hhl),$Thl
302	xor	$Thh,$Zhh,$Zhh
303	ldwx	$nhi($Hhh),$Thh
304	zdep	$Zll,28,4,$rem
305	ldwx	$rem($rem_4bit),$rem
306	shrpw	$Zlh,$Zll,4,$Zll
307	shrpw	$Zhl,$Zlh,4,$Zlh
308	shrpw	$Zhh,$Zhl,4,$Zhl
309	extru	$Zhh,27,28,$Zhh
310	xor	$Tll,$Zll,$Zll
311	xor	$Tlh,$Zlh,$Zlh
312	xor	$rem,$Zhh,$Zhh
313	stw	$Zll,12($Xi)
314	xor	$Thl,$Zhl,$Zhl
315	stw	$Zlh,8($Xi)
316	xor	$Thh,$Zhh,$Zhh
317	stw	$Zhl,4($Xi)
318	stw	$Zhh,0($Xi)
319___
# Shared exit of both gcm_gmult_4bit code paths: reload %r2 and
# %r4-%r6 from the slots the prologue stored them in.
320$code.=<<___;
321L\$done_gmult
322	$POP	`-$FRAME-$SAVED_RP`(%sp),%r2		; standard epilogue
323	$POP	`-$FRAME+1*$SIZE_T`(%sp),%r4
324	$POP	`-$FRAME+2*$SIZE_T`(%sp),%r5
325	$POP	`-$FRAME+3*$SIZE_T`(%sp),%r6
326___
# 32-bit flavour only: reload %r7-%r11 as well.
327$code.=<<___ if ($SIZE_T==4);
328	$POP	`-$FRAME+4*$SIZE_T`(%sp),%r7
329	$POP	`-$FRAME+5*$SIZE_T`(%sp),%r8
330	$POP	`-$FRAME+6*$SIZE_T`(%sp),%r9
331	$POP	`-$FRAME+7*$SIZE_T`(%sp),%r10
332	$POP	`-$FRAME+8*$SIZE_T`(%sp),%r11
333___
# Return from gcm_gmult_4bit (the output loop at the end of this file
# rewrites bv to bve for the 64-bit flavour), reloading %r3 with the
# modifying $POPMB form, followed by the entry of gcm_ghash_4bit: export
# with four argument words and the same prologue as gcm_gmult_4bit.
# NOTE(review): .CALLINFO hard-codes ENTRY_GR=11 here whereas
# gcm_gmult_4bit reports ENTRY_GR=$NREGS, and the 64-bit flavour stores
# only %r3-%r6 - confirm whether the difference is intended.
334$code.=<<___;
335	bv	(%r2)
336	.EXIT
337	$POPMB	-$FRAME(%sp),%r3
338	.PROCEND
339
340	.EXPORT	gcm_ghash_4bit,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
341	.ALIGN	64
342gcm_ghash_4bit
343	.PROC
344	.CALLINFO	FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=11
345	.ENTRY
346	$PUSH	%r2,-$SAVED_RP(%sp)	; standard prologue
347	$PUSHMA	%r3,$FRAME(%sp)
348	$PUSH	%r4,`-$FRAME+1*$SIZE_T`(%sp)
349	$PUSH	%r5,`-$FRAME+2*$SIZE_T`(%sp)
350	$PUSH	%r6,`-$FRAME+3*$SIZE_T`(%sp)
351___
# 32-bit flavour only: also store %r7-%r11.
352$code.=<<___ if ($SIZE_T==4);
353	$PUSH	%r7,`-$FRAME+4*$SIZE_T`(%sp)
354	$PUSH	%r8,`-$FRAME+5*$SIZE_T`(%sp)
355	$PUSH	%r9,`-$FRAME+6*$SIZE_T`(%sp)
356	$PUSH	%r10,`-$FRAME+7*$SIZE_T`(%sp)
357	$PUSH	%r11,`-$FRAME+8*$SIZE_T`(%sp)
358___
# Position-independent address of L$rem_4bit, computed relative to
# L$pic_ghash in the same way as in gcm_gmult_4bit.  addl replaces $len
# by $inp+$len, the end-of-input pointer that the outer loops compare
# $inp against; 0xf0 is the nibble mask.
359$code.=<<___;
360	blr	%r0,$rem_4bit
361	ldi	3,$rem
362L\$pic_ghash
363	andcm	$rem_4bit,$rem,$rem_4bit
364	addl	$inp,$len,$len
365	ldo	L\$rem_4bit-L\$pic_ghash($rem_4bit),$rem_4bit
366	ldi	0xf0,$mask0xf0
367___
# 32-bit flavour only: run-time choice between the PA-RISC 2.0 code that
# follows and the 1.0 code at L$parisc1_ghash (same instruction sequence
# as in gcm_gmult_4bit).
368$code.=<<___ if ($SIZE_T==4);
369	ldi	31,$rem
370	mtctl	$rem,%cr11
371	extrd,u,*= $rem,%sar,1,$rem	; executes on PA-RISC 1.0
372	b	L\$parisc1_ghash
373	nop
374___
375
# PA-RISC 2.0 path of gcm_ghash_4bit (emitted for both flavours).
#  - L$outer_ghash_pa2 runs once per 16 input bytes: $inp is advanced
#    by 16 at the bottom and compared with $len, which the prologue
#    turned into the end-of-input pointer.
#  - Every byte of Xi is xor-ed with the input byte at the same offset
#    ($byte, or $nhi for byte 15) before being split into the $nlo/$nhi
#    table offsets; the table lookups, 4-bit shifts and L$rem_4bit
#    reductions follow the gcm_gmult_4bit pattern, with $rem2 holding a
#    second reduction offset alongside $rem.
#  - The result is stored to Xi with two std instructions.
# NOTE(review): "copy $Zll,$nlo" after the closing cmpb supplies $nlo
# for the next block instead of re-reading byte 15 of Xi; this appears
# to rely on the later masking/deposit keeping only the low byte -
# confirm.
376$code.=<<___;
377	ldb	15($Xi),$nlo
378	ldo	8($Htbl),$Hll
379
380L\$outer_ghash_pa2
381	ldb	15($inp),$nhi
382	xor	$nhi,$nlo,$nlo
383	and	$mask0xf0,$nlo,$nhi
384	depd,z	$nlo,59,4,$nlo
385
386	ldd	$nlo($Hll),$Zll
387	ldd	$nlo($Hhh),$Zhh
388
389	depd,z	$Zll,60,4,$rem
390	shrpd	$Zhh,$Zll,4,$Zll
391	extrd,u	$Zhh,59,60,$Zhh
392	ldb	14($Xi),$nlo
393	ldb	14($inp),$byte
394
395	ldd	$nhi($Hll),$Tll
396	ldd	$nhi($Hhh),$Thh
397	xor	$byte,$nlo,$nlo
398	and	$mask0xf0,$nlo,$nhi
399	depd,z	$nlo,59,4,$nlo
400
401	xor	$Tll,$Zll,$Zll
402	xor	$Thh,$Zhh,$Zhh
403	ldd	$rem($rem_4bit),$rem
404	b	L\$oop_ghash_pa2
405	ldi	13,$cnt
406
407	.ALIGN	8
408L\$oop_ghash_pa2
409	xor	$rem,$Zhh,$Zhh		; moved here to work around gas bug
410	depd,z	$Zll,60,4,$rem2
411
412	shrpd	$Zhh,$Zll,4,$Zll
413	extrd,u	$Zhh,59,60,$Zhh
414	ldd	$nlo($Hll),$Tll
415	ldd	$nlo($Hhh),$Thh
416
417	xor	$Tll,$Zll,$Zll
418	xor	$Thh,$Zhh,$Zhh
419	ldbx	$cnt($Xi),$nlo
420	ldbx	$cnt($inp),$byte
421
422	depd,z	$Zll,60,4,$rem
423	shrpd	$Zhh,$Zll,4,$Zll
424	ldd	$rem2($rem_4bit),$rem2
425
426	xor	$rem2,$Zhh,$Zhh
427	xor	$byte,$nlo,$nlo
428	ldd	$nhi($Hll),$Tll
429	ldd	$nhi($Hhh),$Thh
430
431	and	$mask0xf0,$nlo,$nhi
432	depd,z	$nlo,59,4,$nlo
433
434	extrd,u	$Zhh,59,60,$Zhh
435	xor	$Tll,$Zll,$Zll
436
437	ldd	$rem($rem_4bit),$rem
438	addib,uv -1,$cnt,L\$oop_ghash_pa2
439	xor	$Thh,$Zhh,$Zhh
440
441	xor	$rem,$Zhh,$Zhh
442	depd,z	$Zll,60,4,$rem2
443
444	shrpd	$Zhh,$Zll,4,$Zll
445	extrd,u	$Zhh,59,60,$Zhh
446	ldd	$nlo($Hll),$Tll
447	ldd	$nlo($Hhh),$Thh
448
449	xor	$Tll,$Zll,$Zll
450	xor	$Thh,$Zhh,$Zhh
451
452	depd,z	$Zll,60,4,$rem
453	shrpd	$Zhh,$Zll,4,$Zll
454	ldd	$rem2($rem_4bit),$rem2
455
456	xor	$rem2,$Zhh,$Zhh
457	ldd	$nhi($Hll),$Tll
458	ldd	$nhi($Hhh),$Thh
459
460	extrd,u	$Zhh,59,60,$Zhh
461	xor	$Tll,$Zll,$Zll
462	xor	$Thh,$Zhh,$Zhh
463	ldd	$rem($rem_4bit),$rem
464
465	xor	$rem,$Zhh,$Zhh
466	std	$Zll,8($Xi)
467	ldo	16($inp),$inp
468	std	$Zhh,0($Xi)
469	cmpb,*<> $inp,$len,L\$outer_ghash_pa2
470	copy	$Zll,$nlo
471___
472
# 32-bit flavour only.  The 2.0 path above is terminated with a branch
# to L$done_ghash; L$parisc1_ghash is the PA-RISC 1.0 variant built from
# 32-bit instructions.
#  - L$outer_ghash_pa1 runs once per 16 input bytes; $inp is advanced by
#    16 at the bottom and compared (comb) with the end pointer in $len.
#  - Each byte of Xi is xor-ed with the input byte at the same offset
#    ($byte) before being split into the $nlo/$nhi table offsets.
#  - Z lives in the four words $Zhh, $Zhl, $Zlh, $Zll; table words come
#    from $Hhh (+0), $Hhl (+4), $Hlh (+8) and $Hll (+12) relative to
#    $Htbl, and the L$rem_4bit word selected by $rem is xor-ed into
#    $Zhh.
#  - The result is stored to Xi with four stw instructions, and
#    "copy $Zll,$nlo" supplies $nlo for the next block.
473$code.=<<___ if ($SIZE_T==4);
474	b	L\$done_ghash
475	nop
476
477L\$parisc1_ghash
478	ldb	15($Xi),$nlo
479	ldo	12($Htbl),$Hll
480	ldo	8($Htbl),$Hlh
481	ldo	4($Htbl),$Hhl
482
483L\$outer_ghash_pa1
484	ldb	15($inp),$byte
485	xor	$byte,$nlo,$nlo
486	and	$mask0xf0,$nlo,$nhi
487	zdep	$nlo,27,4,$nlo
488
489	ldwx	$nlo($Hll),$Zll
490	ldwx	$nlo($Hlh),$Zlh
491	ldwx	$nlo($Hhl),$Zhl
492	ldwx	$nlo($Hhh),$Zhh
493	zdep	$Zll,28,4,$rem
494	ldb	14($Xi),$nlo
495	ldb	14($inp),$byte
496	ldwx	$rem($rem_4bit),$rem
497	shrpw	$Zlh,$Zll,4,$Zll
498	ldwx	$nhi($Hll),$Tll
499	shrpw	$Zhl,$Zlh,4,$Zlh
500	ldwx	$nhi($Hlh),$Tlh
501	shrpw	$Zhh,$Zhl,4,$Zhl
502	ldwx	$nhi($Hhl),$Thl
503	extru	$Zhh,27,28,$Zhh
504	ldwx	$nhi($Hhh),$Thh
505	xor	$byte,$nlo,$nlo
506	xor	$rem,$Zhh,$Zhh
507	and	$mask0xf0,$nlo,$nhi
508	zdep	$nlo,27,4,$nlo
509
510	xor	$Tll,$Zll,$Zll
511	ldwx	$nlo($Hll),$Tll
512	xor	$Tlh,$Zlh,$Zlh
513	ldwx	$nlo($Hlh),$Tlh
514	xor	$Thl,$Zhl,$Zhl
515	b	L\$oop_ghash_pa1
516	ldi	13,$cnt
517
518	.ALIGN	8
519L\$oop_ghash_pa1
520	zdep	$Zll,28,4,$rem
521	ldwx	$nlo($Hhl),$Thl
522	xor	$Thh,$Zhh,$Zhh
523	ldwx	$rem($rem_4bit),$rem
524	shrpw	$Zlh,$Zll,4,$Zll
525	ldwx	$nlo($Hhh),$Thh
526	shrpw	$Zhl,$Zlh,4,$Zlh
527	ldbx	$cnt($Xi),$nlo
528	xor	$Tll,$Zll,$Zll
529	ldwx	$nhi($Hll),$Tll
530	shrpw	$Zhh,$Zhl,4,$Zhl
531	ldbx	$cnt($inp),$byte
532	xor	$Tlh,$Zlh,$Zlh
533	ldwx	$nhi($Hlh),$Tlh
534	extru	$Zhh,27,28,$Zhh
535	xor	$Thl,$Zhl,$Zhl
536	ldwx	$nhi($Hhl),$Thl
537	xor	$rem,$Zhh,$Zhh
538	zdep	$Zll,28,4,$rem
539	xor	$Thh,$Zhh,$Zhh
540	ldwx	$nhi($Hhh),$Thh
541	shrpw	$Zlh,$Zll,4,$Zll
542	ldwx	$rem($rem_4bit),$rem
543	shrpw	$Zhl,$Zlh,4,$Zlh
544	xor	$byte,$nlo,$nlo
545	shrpw	$Zhh,$Zhl,4,$Zhl
546	and	$mask0xf0,$nlo,$nhi
547	extru	$Zhh,27,28,$Zhh
548	zdep	$nlo,27,4,$nlo
549	xor	$Tll,$Zll,$Zll
550	ldwx	$nlo($Hll),$Tll
551	xor	$Tlh,$Zlh,$Zlh
552	ldwx	$nlo($Hlh),$Tlh
553	xor	$rem,$Zhh,$Zhh
554	addib,uv -1,$cnt,L\$oop_ghash_pa1
555	xor	$Thl,$Zhl,$Zhl
556
557	zdep	$Zll,28,4,$rem
558	ldwx	$nlo($Hhl),$Thl
559	xor	$Thh,$Zhh,$Zhh
560	ldwx	$rem($rem_4bit),$rem
561	shrpw	$Zlh,$Zll,4,$Zll
562	ldwx	$nlo($Hhh),$Thh
563	shrpw	$Zhl,$Zlh,4,$Zlh
564	xor	$Tll,$Zll,$Zll
565	ldwx	$nhi($Hll),$Tll
566	shrpw	$Zhh,$Zhl,4,$Zhl
567	xor	$Tlh,$Zlh,$Zlh
568	ldwx	$nhi($Hlh),$Tlh
569	extru	$Zhh,27,28,$Zhh
570	xor	$rem,$Zhh,$Zhh
571	xor	$Thl,$Zhl,$Zhl
572	ldwx	$nhi($Hhl),$Thl
573	xor	$Thh,$Zhh,$Zhh
574	ldwx	$nhi($Hhh),$Thh
575	zdep	$Zll,28,4,$rem
576	ldwx	$rem($rem_4bit),$rem
577	shrpw	$Zlh,$Zll,4,$Zll
578	shrpw	$Zhl,$Zlh,4,$Zlh
579	shrpw	$Zhh,$Zhl,4,$Zhl
580	extru	$Zhh,27,28,$Zhh
581	xor	$Tll,$Zll,$Zll
582	xor	$Tlh,$Zlh,$Zlh
583	xor	$rem,$Zhh,$Zhh
584	stw	$Zll,12($Xi)
585	xor	$Thl,$Zhl,$Zhl
586	stw	$Zlh,8($Xi)
587	xor	$Thh,$Zhh,$Zhh
588	stw	$Zhl,4($Xi)
589	ldo	16($inp),$inp
590	stw	$Zhh,0($Xi)
591	comb,<>	$inp,$len,L\$outer_ghash_pa1
592	copy	$Zll,$nlo
593___
# Shared exit of both gcm_ghash_4bit code paths: reload %r2 and
# %r4-%r6 from the slots the prologue stored them in.
594$code.=<<___;
595L\$done_ghash
596	$POP	`-$FRAME-$SAVED_RP`(%sp),%r2		; standard epilogue
597	$POP	`-$FRAME+1*$SIZE_T`(%sp),%r4
598	$POP	`-$FRAME+2*$SIZE_T`(%sp),%r5
599	$POP	`-$FRAME+3*$SIZE_T`(%sp),%r6
600___
# 32-bit flavour only: reload %r7-%r11 as well.
601$code.=<<___ if ($SIZE_T==4);
602	$POP	`-$FRAME+4*$SIZE_T`(%sp),%r7
603	$POP	`-$FRAME+5*$SIZE_T`(%sp),%r8
604	$POP	`-$FRAME+6*$SIZE_T`(%sp),%r9
605	$POP	`-$FRAME+7*$SIZE_T`(%sp),%r10
606	$POP	`-$FRAME+8*$SIZE_T`(%sp),%r11
607___
# Return from gcm_ghash_4bit, then the shared L$rem_4bit table: sixteen
# 8-byte entries, each a 16-bit constant shifted left by 16 in the first
# word followed by a zero word (the back-quoted shifts are evaluated by
# the output loop at the end of this file), and an identification
# string.
# NOTE(review): "GRYPTOGAMS" in the .STRINGZ looks like a misspelling of
# CRYPTOGAMS; it is left alone because changing it alters the emitted
# object.
608$code.=<<___;
609	bv	(%r2)
610	.EXIT
611	$POPMB	-$FRAME(%sp),%r3
612	.PROCEND
613
614	.ALIGN	64
615L\$rem_4bit
616	.WORD	`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
617	.WORD	`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
618	.WORD	`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
619	.WORD	`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
620	.STRINGZ "GHASH for PA-RISC, GRYPTOGAMS by <appro\@openssl.org>"
621	.ALIGN	64
622___
623
624# Explicitly encode PA-RISC 2.0 instructions used in this module, so
625# that it can be compiled with .LEVEL 1.0. It should be noted that I
626# wouldn't have to do this, if GNU assembler understood .ALLOW 2.0
627# directive...
628
# Hand-encode the "ldd" instruction.
#   $mod  - completer text that followed the mnemonic ("" or e.g. ",mb")
#   $args - operand text with register names already substituted
# Returns a ".WORD" line carrying the encoded instruction, with the
# original text as a trailing comment.  Operands that cannot be encoded
# here are returned as the plain textual instruction, so the assembler
# gets to accept or reject them itself.
my $ldd = sub {
  my ($mod,$args) = @_;
  my $orig = "ldd$mod\t$args";

    if ($args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/)		# format 4
    {	# indexed form: index(base),target
	my ($index,$base,$target)=($1,$2,$3);
	my $opcode=(0x03<<26)|($base<<21)|($index<<16)|(3<<6)|$target;
	return sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    }
    elsif ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/)	# format 5
    {	# short-displacement form: disp(base),target
	my ($disp,$base,$target)=($1,$2,$3);

	# Only five bits (four value bits plus sign) are available for
	# the displacement; anything else used to be truncated silently
	# into a wrong address, so leave such operands untouched.
	return "\t".$orig if ($disp < -16 || $disp > 15);

	my $opcode=(0x03<<26)|($base<<21)|(1<<12)|(3<<6)|$target;
	$opcode|=(($disp&0xF)<<17)|(($disp&0x10)<<12);		# encode offset
	$opcode|=(1<<5)  if ($mod =~ /^,m/);
	$opcode|=(1<<13) if ($mod =~ /^,mb/);
	return sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    }

    return "\t".$orig;
};
646
# Hand-encode the "std" instruction in its long-displacement form.
#   $mod  - completer text that followed the mnemonic
#   $args - operand text with register names already substituted
# Returns a ".WORD" line carrying the encoded instruction, with the
# original text as a trailing comment.  The plain textual instruction is
# returned instead when the operands cannot be represented: a completer
# is present (the encoding below has no way to express one), or the
# displacement is not a multiple of 8 within -8192..8184.  Previously
# such operands were encoded anyway, silently yielding a different
# store.
my $std = sub {
  my ($mod,$args) = @_;
  my $orig = "std$mod\t$args";

    if ($mod eq "" &&
	$args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 3 suffices
    {	my ($src,$disp,$base)=($1,$2,$3);

	if ($disp >= -8192 && $disp <= 8184 && ($disp&7) == 0)
	{   # displacement bits 3..12 move up by one, the sign bit
	    # lands in bit 0
	    my $opcode=(0x1c<<26)|($base<<21)|($src<<16)|
			(($disp&0x1FF8)<<1)|(($disp>>13)&1);
	    return sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
	}
    }

    return "\t".$orig;
};
657
# Hand-encode the "extrd" instruction.
#   $mod  - completer text that followed the mnemonic
#   $args - operand text with register names already substituted
# Returns a ".WORD" line carrying the encoded instruction, with the
# original text as a trailing comment, or the plain textual instruction
# when the operands match neither supported form.
my $extrd = sub {
  my ($mod,$args) = @_;
  my $orig = "extrd$mod\t$args";
  my ($word,@f);

    # The ",u" completer is the only one in use and is implied by the
    # encodings below.
    if (@f = $args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/)	# format 15
    {	# fixed position: source,pos,len,target
	my ($src,$pos,$width,$dst)=@f;
	my $clen=32-$width;
	$word =(0x36<<26)|($src<<21)|($dst<<16);
	$word|=(($pos&0x20)<<6)|(($pos&0x1f)<<5);		# encode pos
	$word|=(($clen&0x20)<<7)|($clen&0x1f);			# encode len
    }
    elsif (@f = $args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/)	# format 12
    {	# position taken from %sar: source,%sar,len,target
	my ($src,$width,$dst)=@f;
	my $clen=32-$width;
	$word =(0x34<<26)|($src<<21)|($dst<<16)|(2<<11)|(1<<9);
	$word|=(($clen&0x20)<<3)|($clen&0x1f);			# encode len
	$word|=(1<<13) if ($mod =~ /,\**=/);
    }

    return defined($word) ? sprintf("\t.WORD\t0x%08x\t; %s",$word,$orig)
			  : "\t".$orig;
};
679
# Hand-encode the "shrpd" instruction.
#   $mod  - completer text that followed the mnemonic
#   $args - operand text with register names already substituted
# Returns a ".WORD" line carrying the encoded instruction, with the
# original text as a trailing comment, or the plain textual instruction
# when the operands match neither supported form.
my $shrpd = sub {
  my ($mod,$args) = @_;
  my $orig = "shrpd$mod\t$args";
  my ($word,@f);

    if (@f = $args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/)	# format 14
    {	# fixed shift amount: r1,r2,sa,target
	my ($r1,$r2,$sa,$dst)=@f;
	my $cpos=63-$sa;
	$word =(0x34<<26)|($r2<<21)|($r1<<16)|(1<<10)|$dst;
	$word|=(($cpos&0x20)<<6)|(($cpos&0x1f)<<5);		# encode sa
    }
    elsif (@f = $args =~ /%r([0-9]+),%r([0-9]+),%sar,%r([0-9]+)/)	# format 11
    {	# shift amount taken from %sar: r1,r2,%sar,target
	my ($r1,$r2,$dst)=@f;
	$word =(0x34<<26)|($r2<<21)|($r1<<16)|(1<<9)|$dst;
    }

    return defined($word) ? sprintf("\t.WORD\t0x%08x\t; %s",$word,$orig)
			  : "\t".$orig;
};
696
# Hand-encode the "depd" instruction.
#   $mod  - completer text that followed the mnemonic
#   $args - operand text with register names already substituted
# Returns a ".WORD" line carrying the encoded instruction, with the
# original text as a trailing comment, or the plain textual instruction
# when the operands do not match the supported form.
my $depd = sub {
  my ($mod,$args) = @_;
  my $orig = "depd$mod\t$args";

    # The ",z" completer is the only one in use and is implied by the
    # encoding below.
    my @f = $args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/;	# format 16
    return "\t".$orig unless (@f);

    # fixed position: source,pos,len,target
    my ($src,$pos,$width,$dst)=@f;
    my $cpos=63-$pos;
    my $clen=32-$width;
    my $word=(0x3c<<26)|($dst<<21)|($src<<16);
    $word|=(($cpos&0x20)<<6)|(($cpos&0x1f)<<5);			# encode pos
    $word|=(($clen&0x20)<<7)|($clen&0x1f);			# encode len
    return sprintf "\t.WORD\t0x%08x\t; %s",$word,$orig;
};
712
# Turn one instruction into output text.
#   $mnemonic - instruction name (lower-case letters only, as matched by
#               the caller)
#   $mod      - completer text that followed the mnemonic
#   $args     - operand text
# If a scalar variable named after the mnemonic holds a code reference
# (such as the encoders stored in $ldd or $std), that encoder produces
# the line; otherwise the instruction is returned as
# "\t<mnemonic><mod>\t<args>".
sub assemble {
  my ($mnemonic,$mod,$args)=@_;

    # Look the encoder up by name; the string eval resolves "$<mnemonic>".
    my $encoder = eval("\$$mnemonic");

    return &$encoder($mod,$args) if (ref($encoder) eq 'CODE');
    return "\t$mnemonic$mod\t$args";
}
719
# Post-process the accumulated template line by line and print it:
#  - every back-quoted span is replaced by the value of evaluating its
#    contents as Perl;
#  - 32-bit flavour: each instruction line goes through assemble(), so
#    that instructions with an encoder are emitted as .WORD data, then
#    "cmpb,*" becomes "comb," and the first remaining ",*" loses its
#    "*";
#  - 64-bit flavour: "bv" becomes "bve".
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;
	if ($SIZE_T==4) {
		s/^\s+([a-z]+)([\S]*)\s+([\S]*)/assemble($1,$2,$3)/e;
		s/cmpb,\*/comb,/;
		s/,\*/,/;
	}
	s/\bbv\b/bve/	if ($SIZE_T==8);
	print $_,"\n";
}

# Buffered write errors (a full disk, for instance) only surface when
# the handle is closed, so a failing close must not pass unnoticed.
close STDOUT or die "error closing STDOUT: $!";
732