1238384Sjkim#!/usr/bin/env perl
2238384Sjkim#
3238384Sjkim# ====================================================================
4238384Sjkim# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5238384Sjkim# project. The module is, however, dual licensed under OpenSSL and
6238384Sjkim# CRYPTOGAMS licenses depending on where you obtain it. For further
7238384Sjkim# details see http://www.openssl.org/~appro/cryptogams/.
8238384Sjkim# ====================================================================
9238384Sjkim#
10238384Sjkim# April 2010
11238384Sjkim#
12238384Sjkim# The module implements "4-bit" GCM GHASH function and underlying
13238384Sjkim# single multiplication operation in GF(2^128). "4-bit" means that it
14238384Sjkim# uses 256 bytes per-key table [+128 bytes shared table]. On PA-7100LC
15238384Sjkim# it processes one byte in 19.6 cycles, which is more than twice as
16238384Sjkim# fast as code generated by gcc 3.2. PA-RISC 2.0 loop is scheduled for
17238384Sjkim# 8 cycles, but measured performance on PA-8600 system is ~9 cycles per
18238384Sjkim# processed byte. This is ~2.2x faster than 64-bit code generated by
19238384Sjkim# vendor compiler (which used to be very hard to beat:-).
20238384Sjkim#
21238384Sjkim# Special thanks to polarhome.com for providing HP-UX account.
22238384Sjkim
# Command line: <flavour> [<output file>].
# $flavour selects the ABI (anything containing "64" means 64-bit),
# $output names the file that receives the generated assembly.
$flavour = shift;
$output = shift;
# Redirect STDOUT only when an output file was actually named, and fail
# loudly if it cannot be created: an unchecked open would otherwise leave
# STDOUT unusable and the generated code would be silently discarded.
# The three-argument form keeps the file name from being parsed as a mode.
if (defined($output) && $output ne "") {
	open STDOUT,">",$output or die "can't open $output: $!";
}
26238384Sjkim
# ABI-dependent parameters. A flavour containing "64" selects the 64-bit
# settings (.LEVEL 2.0W, 8-byte slots, std/ldd for saving registers);
# everything else gets the 32-bit settings (.LEVEL 1.0, 4-byte slots,
# stw/ldw). An alternative 32-bit $LEVEL value that appends
# "\n\t.ALLOW\t2.0" is deliberately not used.
($LEVEL,$SIZE_T,$FRAME_MARKER,$SAVED_RP,$NREGS) =
	($flavour =~ /64/) ? ("2.0W",8,80,16,6)
			   : ("1.0", 4,48,20,11);
# store / store-and-modify / load / load-and-modify mnemonics used by the
# prologues and epilogues
($PUSH,$PUSHMA,$POP,$POPMB) =
	($SIZE_T==8) ? ("std","std,ma","ldd","ldd,mb")
		     : ("stw","stwm",  "ldw","ldwm");

# room for 10 register slots plus the frame marker
#                 [+ argument transfer]
$FRAME=$FRAME_MARKER+10*$SIZE_T;
51238384Sjkim
################# volatile registers
# argument block: Xi, Htable, input pointer, length
($Xi,$Htbl,$inp,$len)=("%r26","%r25","%r24","%r23");
# variables; $Hhh is just another name for the Htable pointer
$Hhh=$Htbl;
($Hll,$Zhh,$Zll,$cnt)=("%r22","%r21","%r20","%r19");
($rem_4bit,$rem,$mask0xf0)=("%r28","%r29","%r31");

################# preserved registers
($Thh,$Tll,$nlo,$nhi,$byte)=("%r1","%r2","%r3","%r4","%r5");
# extra word-sized halves needed only by the 32-bit flavour
($Zhl,$Zlh,$Hhl,$Hlh,$Thl,$Tlh)=("%r6","%r7","%r8","%r9","%r10","%r11")
	if ($SIZE_T==4);
# shares %r6 with $Zhl; $rem2 appears only in the PA-RISC 2.0 ghash loop,
# $Zhl only in the PA-RISC 1.x code
$rem2="%r6";	# used in PA-RISC 2.0 code
81238384Sjkim
# gcm_gmult_4bit: section setup, entry point and prologue. The routine is
# exported with two general-register arguments ($Xi and $Htbl). The
# prologue stores %r2 at -$SAVED_RP(%sp), pushes %r3 while moving %sp up
# by $FRAME, and saves %r4-%r6 in the following slots.
82238384Sjkim$code.=<<___;
83238384Sjkim	.LEVEL	$LEVEL
84238384Sjkim	.SPACE	\$TEXT\$
85238384Sjkim	.SUBSPA	\$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
86238384Sjkim
87238384Sjkim	.EXPORT	gcm_gmult_4bit,ENTRY,ARGW0=GR,ARGW1=GR
88238384Sjkim	.ALIGN	64
89238384Sjkimgcm_gmult_4bit
90238384Sjkim	.PROC
91238384Sjkim	.CALLINFO	FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=$NREGS
92238384Sjkim	.ENTRY
93238384Sjkim	$PUSH	%r2,-$SAVED_RP(%sp)	; standard prologue
94238384Sjkim	$PUSHMA	%r3,$FRAME(%sp)
95238384Sjkim	$PUSH	%r4,`-$FRAME+1*$SIZE_T`(%sp)
96238384Sjkim	$PUSH	%r5,`-$FRAME+2*$SIZE_T`(%sp)
97238384Sjkim	$PUSH	%r6,`-$FRAME+3*$SIZE_T`(%sp)
98238384Sjkim___
# The 32-bit flavour also uses %r7-%r11 (see the register map), so those
# are saved as well.
99238384Sjkim$code.=<<___ if ($SIZE_T==4);
100238384Sjkim	$PUSH	%r7,`-$FRAME+4*$SIZE_T`(%sp)
101238384Sjkim	$PUSH	%r8,`-$FRAME+5*$SIZE_T`(%sp)
102238384Sjkim	$PUSH	%r9,`-$FRAME+6*$SIZE_T`(%sp)
103238384Sjkim	$PUSH	%r10,`-$FRAME+7*$SIZE_T`(%sp)
104238384Sjkim	$PUSH	%r11,`-$FRAME+8*$SIZE_T`(%sp)
105238384Sjkim___
# Position-independent address of the L$rem_4bit table: blr %r0 is used to
# obtain a code address in $rem_4bit, andcm clears its two low bits
# (mask 3 in $rem), and ldo adds the assemble-time distance
# L$rem_4bit-L$pic_gmult. $mask0xf0 is loaded with 0xf0.
# NOTE(review): "addl $inp,$len,$len" looks like a carry-over from
# gcm_ghash_4bit; gmult exports only two arguments and never reads $len
# afterwards - confirm it is harmless dead code.
106238384Sjkim$code.=<<___;
107238384Sjkim	blr	%r0,$rem_4bit
108238384Sjkim	ldi	3,$rem
109238384SjkimL\$pic_gmult
110238384Sjkim	andcm	$rem_4bit,$rem,$rem_4bit
111238384Sjkim	addl	$inp,$len,$len
112238384Sjkim	ldo	L\$rem_4bit-L\$pic_gmult($rem_4bit),$rem_4bit
113238384Sjkim	ldi	0xf0,$mask0xf0
114238384Sjkim___
# 32-bit flavour only: run-time selection between the two code paths.
# extrd is one of the PA-RISC 2.0 instructions that the output loop
# hand-encodes as a .WORD. Per the inline comment the branch to
# L$parisc1_gmult is what executes on PA-RISC 1.0; presumably on 2.0 CPUs
# the "*=" condition nullifies that branch so execution falls through to
# the 2.0 code - confirm against the architecture manual.
115238384Sjkim$code.=<<___ if ($SIZE_T==4);
116238384Sjkim	ldi	31,$rem
117238384Sjkim	mtctl	$rem,%cr11
118238384Sjkim	extrd,u,*= $rem,%sar,1,$rem	; executes on PA-RISC 1.0
119238384Sjkim	b	L\$parisc1_gmult
120238384Sjkim	nop
121238384Sjkim___
122238384Sjkim
# PA-RISC 2.0 code path of gcm_gmult_4bit, using 64-bit registers.
# The accumulator Z lives in $Zhh:$Zll. Bytes of Xi are consumed from
# offset 15 downwards: bytes 15 and 14 are loaded ahead of the loop, the
# loop loads byte $cnt with $cnt starting at 13 and decremented by addib.
# Each byte yields two table offsets: $nhi (byte & 0xf0) and $nlo (low
# nibble deposited four bits up), both used to index Htable through $Hhh
# (offset 0) and $Hll (offset 8). Before each 4-bit right shift of Z the
# low nibble of $Zll is turned into an offset into L$rem_4bit, and the
# value loaded from there is xor-ed into $Zhh. The tail after the loop
# finishes the last nibbles and stores Z back to Xi.
123238384Sjkim$code.=<<___;
124238384Sjkim	ldb	15($Xi),$nlo
125238384Sjkim	ldo	8($Htbl),$Hll
126238384Sjkim
127238384Sjkim	and	$mask0xf0,$nlo,$nhi
128238384Sjkim	depd,z	$nlo,59,4,$nlo
129238384Sjkim
130238384Sjkim	ldd	$nlo($Hll),$Zll
131238384Sjkim	ldd	$nlo($Hhh),$Zhh
132238384Sjkim
133238384Sjkim	depd,z	$Zll,60,4,$rem
134238384Sjkim	shrpd	$Zhh,$Zll,4,$Zll
135238384Sjkim	extrd,u	$Zhh,59,60,$Zhh
136238384Sjkim	ldb	14($Xi),$nlo
137238384Sjkim
138238384Sjkim	ldd	$nhi($Hll),$Tll
139238384Sjkim	ldd	$nhi($Hhh),$Thh
140238384Sjkim	and	$mask0xf0,$nlo,$nhi
141238384Sjkim	depd,z	$nlo,59,4,$nlo
142238384Sjkim
143238384Sjkim	xor	$Tll,$Zll,$Zll
144238384Sjkim	xor	$Thh,$Zhh,$Zhh
145238384Sjkim	ldd	$rem($rem_4bit),$rem
146238384Sjkim	b	L\$oop_gmult_pa2
147238384Sjkim	ldi	13,$cnt
148238384Sjkim
149238384Sjkim	.ALIGN	8
150238384SjkimL\$oop_gmult_pa2
151238384Sjkim	xor	$rem,$Zhh,$Zhh		; moved here to work around gas bug
152238384Sjkim	depd,z	$Zll,60,4,$rem
153238384Sjkim
154238384Sjkim	shrpd	$Zhh,$Zll,4,$Zll
155238384Sjkim	extrd,u	$Zhh,59,60,$Zhh
156238384Sjkim	ldd	$nlo($Hll),$Tll
157238384Sjkim	ldd	$nlo($Hhh),$Thh
158238384Sjkim
159238384Sjkim	xor	$Tll,$Zll,$Zll
160238384Sjkim	xor	$Thh,$Zhh,$Zhh
161238384Sjkim	ldd	$rem($rem_4bit),$rem
162238384Sjkim
163238384Sjkim	xor	$rem,$Zhh,$Zhh
164238384Sjkim	depd,z	$Zll,60,4,$rem
165238384Sjkim	ldbx	$cnt($Xi),$nlo
166238384Sjkim
167238384Sjkim	shrpd	$Zhh,$Zll,4,$Zll
168238384Sjkim	extrd,u	$Zhh,59,60,$Zhh
169238384Sjkim	ldd	$nhi($Hll),$Tll
170238384Sjkim	ldd	$nhi($Hhh),$Thh
171238384Sjkim
172238384Sjkim	and	$mask0xf0,$nlo,$nhi
173238384Sjkim	depd,z	$nlo,59,4,$nlo
174238384Sjkim	ldd	$rem($rem_4bit),$rem
175238384Sjkim
176238384Sjkim	xor	$Tll,$Zll,$Zll
177238384Sjkim	addib,uv -1,$cnt,L\$oop_gmult_pa2
178238384Sjkim	xor	$Thh,$Zhh,$Zhh
179238384Sjkim
180238384Sjkim	xor	$rem,$Zhh,$Zhh
181238384Sjkim	depd,z	$Zll,60,4,$rem
182238384Sjkim
183238384Sjkim	shrpd	$Zhh,$Zll,4,$Zll
184238384Sjkim	extrd,u	$Zhh,59,60,$Zhh
185238384Sjkim	ldd	$nlo($Hll),$Tll
186238384Sjkim	ldd	$nlo($Hhh),$Thh
187238384Sjkim
188238384Sjkim	xor	$Tll,$Zll,$Zll
189238384Sjkim	xor	$Thh,$Zhh,$Zhh
190238384Sjkim	ldd	$rem($rem_4bit),$rem
191238384Sjkim
192238384Sjkim	xor	$rem,$Zhh,$Zhh
193238384Sjkim	depd,z	$Zll,60,4,$rem
194238384Sjkim
195238384Sjkim	shrpd	$Zhh,$Zll,4,$Zll
196238384Sjkim	extrd,u	$Zhh,59,60,$Zhh
197238384Sjkim	ldd	$nhi($Hll),$Tll
198238384Sjkim	ldd	$nhi($Hhh),$Thh
199238384Sjkim
200238384Sjkim	xor	$Tll,$Zll,$Zll
201238384Sjkim	xor	$Thh,$Zhh,$Zhh
202238384Sjkim	ldd	$rem($rem_4bit),$rem
203238384Sjkim
204238384Sjkim	xor	$rem,$Zhh,$Zhh
205238384Sjkim	std	$Zll,8($Xi)
206238384Sjkim	std	$Zhh,0($Xi)
207238384Sjkim___
208238384Sjkim
# PA-RISC 1.x code path of gcm_gmult_4bit, emitted only for the 32-bit
# flavour. The leading branch makes the 2.0 path that precedes it jump
# straight to L$done_gmult. The flow matches the 2.0 path (bytes 15 and
# 14 of Xi ahead of the loop, then $cnt counting down from 13), but every
# 128-bit quantity is split into four 32-bit words: Htable is addressed
# through $Hhh/$Hhl/$Hlh/$Hll (offsets 0/4/8/12), Z and T use the matching
# hh/hl/lh/ll registers, shifts are done with shrpw/extru and L$rem_4bit
# is read with word loads. The result is stored to Xi as four words.
209238384Sjkim$code.=<<___ if ($SIZE_T==4);
210238384Sjkim	b	L\$done_gmult
211238384Sjkim	nop
212238384Sjkim
213238384SjkimL\$parisc1_gmult
214238384Sjkim	ldb	15($Xi),$nlo
215238384Sjkim	ldo	12($Htbl),$Hll
216238384Sjkim	ldo	8($Htbl),$Hlh
217238384Sjkim	ldo	4($Htbl),$Hhl
218238384Sjkim
219238384Sjkim	and	$mask0xf0,$nlo,$nhi
220238384Sjkim	zdep	$nlo,27,4,$nlo
221238384Sjkim
222238384Sjkim	ldwx	$nlo($Hll),$Zll
223238384Sjkim	ldwx	$nlo($Hlh),$Zlh
224238384Sjkim	ldwx	$nlo($Hhl),$Zhl
225238384Sjkim	ldwx	$nlo($Hhh),$Zhh
226238384Sjkim	zdep	$Zll,28,4,$rem
227238384Sjkim	ldb	14($Xi),$nlo
228238384Sjkim	ldwx	$rem($rem_4bit),$rem
229238384Sjkim	shrpw	$Zlh,$Zll,4,$Zll
230238384Sjkim	ldwx	$nhi($Hll),$Tll
231238384Sjkim	shrpw	$Zhl,$Zlh,4,$Zlh
232238384Sjkim	ldwx	$nhi($Hlh),$Tlh
233238384Sjkim	shrpw	$Zhh,$Zhl,4,$Zhl
234238384Sjkim	ldwx	$nhi($Hhl),$Thl
235238384Sjkim	extru	$Zhh,27,28,$Zhh
236238384Sjkim	ldwx	$nhi($Hhh),$Thh
237238384Sjkim	xor	$rem,$Zhh,$Zhh
238238384Sjkim	and	$mask0xf0,$nlo,$nhi
239238384Sjkim	zdep	$nlo,27,4,$nlo
240238384Sjkim
241238384Sjkim	xor	$Tll,$Zll,$Zll
242238384Sjkim	ldwx	$nlo($Hll),$Tll
243238384Sjkim	xor	$Tlh,$Zlh,$Zlh
244238384Sjkim	ldwx	$nlo($Hlh),$Tlh
245238384Sjkim	xor	$Thl,$Zhl,$Zhl
246238384Sjkim	b	L\$oop_gmult_pa1
247238384Sjkim	ldi	13,$cnt
248238384Sjkim
249238384Sjkim	.ALIGN	8
250238384SjkimL\$oop_gmult_pa1
251238384Sjkim	zdep	$Zll,28,4,$rem
252238384Sjkim	ldwx	$nlo($Hhl),$Thl
253238384Sjkim	xor	$Thh,$Zhh,$Zhh
254238384Sjkim	ldwx	$rem($rem_4bit),$rem
255238384Sjkim	shrpw	$Zlh,$Zll,4,$Zll
256238384Sjkim	ldwx	$nlo($Hhh),$Thh
257238384Sjkim	shrpw	$Zhl,$Zlh,4,$Zlh
258238384Sjkim	ldbx	$cnt($Xi),$nlo
259238384Sjkim	xor	$Tll,$Zll,$Zll
260238384Sjkim	ldwx	$nhi($Hll),$Tll
261238384Sjkim	shrpw	$Zhh,$Zhl,4,$Zhl
262238384Sjkim	xor	$Tlh,$Zlh,$Zlh
263238384Sjkim	ldwx	$nhi($Hlh),$Tlh
264238384Sjkim	extru	$Zhh,27,28,$Zhh
265238384Sjkim	xor	$Thl,$Zhl,$Zhl
266238384Sjkim	ldwx	$nhi($Hhl),$Thl
267238384Sjkim	xor	$rem,$Zhh,$Zhh
268238384Sjkim	zdep	$Zll,28,4,$rem
269238384Sjkim	xor	$Thh,$Zhh,$Zhh
270238384Sjkim	ldwx	$nhi($Hhh),$Thh
271238384Sjkim	shrpw	$Zlh,$Zll,4,$Zll
272238384Sjkim	ldwx	$rem($rem_4bit),$rem
273238384Sjkim	shrpw	$Zhl,$Zlh,4,$Zlh
274238384Sjkim	shrpw	$Zhh,$Zhl,4,$Zhl
275238384Sjkim	and	$mask0xf0,$nlo,$nhi
276238384Sjkim	extru	$Zhh,27,28,$Zhh
277238384Sjkim	zdep	$nlo,27,4,$nlo
278238384Sjkim	xor	$Tll,$Zll,$Zll
279238384Sjkim	ldwx	$nlo($Hll),$Tll
280238384Sjkim	xor	$Tlh,$Zlh,$Zlh
281238384Sjkim	ldwx	$nlo($Hlh),$Tlh
282238384Sjkim	xor	$rem,$Zhh,$Zhh
283238384Sjkim	addib,uv -1,$cnt,L\$oop_gmult_pa1
284238384Sjkim	xor	$Thl,$Zhl,$Zhl
285238384Sjkim
286238384Sjkim	zdep	$Zll,28,4,$rem
287238384Sjkim	ldwx	$nlo($Hhl),$Thl
288238384Sjkim	xor	$Thh,$Zhh,$Zhh
289238384Sjkim	ldwx	$rem($rem_4bit),$rem
290238384Sjkim	shrpw	$Zlh,$Zll,4,$Zll
291238384Sjkim	ldwx	$nlo($Hhh),$Thh
292238384Sjkim	shrpw	$Zhl,$Zlh,4,$Zlh
293238384Sjkim	xor	$Tll,$Zll,$Zll
294238384Sjkim	ldwx	$nhi($Hll),$Tll
295238384Sjkim	shrpw	$Zhh,$Zhl,4,$Zhl
296238384Sjkim	xor	$Tlh,$Zlh,$Zlh
297238384Sjkim	ldwx	$nhi($Hlh),$Tlh
298238384Sjkim	extru	$Zhh,27,28,$Zhh
299238384Sjkim	xor	$rem,$Zhh,$Zhh
300238384Sjkim	xor	$Thl,$Zhl,$Zhl
301238384Sjkim	ldwx	$nhi($Hhl),$Thl
302238384Sjkim	xor	$Thh,$Zhh,$Zhh
303238384Sjkim	ldwx	$nhi($Hhh),$Thh
304238384Sjkim	zdep	$Zll,28,4,$rem
305238384Sjkim	ldwx	$rem($rem_4bit),$rem
306238384Sjkim	shrpw	$Zlh,$Zll,4,$Zll
307238384Sjkim	shrpw	$Zhl,$Zlh,4,$Zlh
308238384Sjkim	shrpw	$Zhh,$Zhl,4,$Zhl
309238384Sjkim	extru	$Zhh,27,28,$Zhh
310238384Sjkim	xor	$Tll,$Zll,$Zll
311238384Sjkim	xor	$Tlh,$Zlh,$Zlh
312238384Sjkim	xor	$rem,$Zhh,$Zhh
313238384Sjkim	stw	$Zll,12($Xi)
314238384Sjkim	xor	$Thl,$Zhl,$Zhl
315238384Sjkim	stw	$Zlh,8($Xi)
316238384Sjkim	xor	$Thh,$Zhh,$Zhh
317238384Sjkim	stw	$Zhl,4($Xi)
318238384Sjkim	stw	$Zhh,0($Xi)
319238384Sjkim___
# Common exit of both gcm_gmult_4bit paths: reload %r2 and %r4-%r6 from
# the slots written by the prologue.
320238384Sjkim$code.=<<___;
321238384SjkimL\$done_gmult
322238384Sjkim	$POP	`-$FRAME-$SAVED_RP`(%sp),%r2		; standard epilogue
323238384Sjkim	$POP	`-$FRAME+1*$SIZE_T`(%sp),%r4
324238384Sjkim	$POP	`-$FRAME+2*$SIZE_T`(%sp),%r5
325238384Sjkim	$POP	`-$FRAME+3*$SIZE_T`(%sp),%r6
326238384Sjkim___
# 32-bit flavour: also restore %r7-%r11.
327238384Sjkim$code.=<<___ if ($SIZE_T==4);
328238384Sjkim	$POP	`-$FRAME+4*$SIZE_T`(%sp),%r7
329238384Sjkim	$POP	`-$FRAME+5*$SIZE_T`(%sp),%r8
330238384Sjkim	$POP	`-$FRAME+6*$SIZE_T`(%sp),%r9
331238384Sjkim	$POP	`-$FRAME+7*$SIZE_T`(%sp),%r10
332238384Sjkim	$POP	`-$FRAME+8*$SIZE_T`(%sp),%r11
333238384Sjkim___
# Return through %r2 (the output loop rewrites "bv" to "bve" for the
# 64-bit flavour); the $POPMB that follows restores %r3 and moves %sp back
# by $FRAME. The same heredoc then opens gcm_ghash_4bit, exported with
# four general-register arguments ($Xi, $Htbl, $inp, $len), whose prologue
# mirrors the one of gcm_gmult_4bit.
# NOTE(review): .CALLINFO hard-codes ENTRY_GR=11 here, whereas
# gcm_gmult_4bit uses $NREGS (6 for the 64-bit flavour) - confirm which
# value is intended.
334238384Sjkim$code.=<<___;
335238384Sjkim	bv	(%r2)
336238384Sjkim	.EXIT
337238384Sjkim	$POPMB	-$FRAME(%sp),%r3
338238384Sjkim	.PROCEND
339238384Sjkim
340238384Sjkim	.EXPORT	gcm_ghash_4bit,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
341238384Sjkim	.ALIGN	64
342238384Sjkimgcm_ghash_4bit
343238384Sjkim	.PROC
344238384Sjkim	.CALLINFO	FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=11
345238384Sjkim	.ENTRY
346238384Sjkim	$PUSH	%r2,-$SAVED_RP(%sp)	; standard prologue
347238384Sjkim	$PUSHMA	%r3,$FRAME(%sp)
348238384Sjkim	$PUSH	%r4,`-$FRAME+1*$SIZE_T`(%sp)
349238384Sjkim	$PUSH	%r5,`-$FRAME+2*$SIZE_T`(%sp)
350238384Sjkim	$PUSH	%r6,`-$FRAME+3*$SIZE_T`(%sp)
351238384Sjkim___
# 32-bit flavour: also save %r7-%r11.
352238384Sjkim$code.=<<___ if ($SIZE_T==4);
353238384Sjkim	$PUSH	%r7,`-$FRAME+4*$SIZE_T`(%sp)
354238384Sjkim	$PUSH	%r8,`-$FRAME+5*$SIZE_T`(%sp)
355238384Sjkim	$PUSH	%r9,`-$FRAME+6*$SIZE_T`(%sp)
356238384Sjkim	$PUSH	%r10,`-$FRAME+7*$SIZE_T`(%sp)
357238384Sjkim	$PUSH	%r11,`-$FRAME+8*$SIZE_T`(%sp)
358238384Sjkim___
# Position-independent address of L$rem_4bit, computed as in
# gcm_gmult_4bit but relative to L$pic_ghash. Here "addl $inp,$len,$len"
# turns $len into the end-of-input pointer that the outer loops compare
# $inp against.
359238384Sjkim$code.=<<___;
360238384Sjkim	blr	%r0,$rem_4bit
361238384Sjkim	ldi	3,$rem
362238384SjkimL\$pic_ghash
363238384Sjkim	andcm	$rem_4bit,$rem,$rem_4bit
364238384Sjkim	addl	$inp,$len,$len
365238384Sjkim	ldo	L\$rem_4bit-L\$pic_ghash($rem_4bit),$rem_4bit
366238384Sjkim	ldi	0xf0,$mask0xf0
367238384Sjkim___
# 32-bit flavour only: run-time selection of the code path, same sequence
# as in gcm_gmult_4bit; per the inline comment the branch to
# L$parisc1_ghash is what executes on PA-RISC 1.0.
368238384Sjkim$code.=<<___ if ($SIZE_T==4);
369238384Sjkim	ldi	31,$rem
370238384Sjkim	mtctl	$rem,%cr11
371238384Sjkim	extrd,u,*= $rem,%sar,1,$rem	; executes on PA-RISC 1.0
372238384Sjkim	b	L\$parisc1_ghash
373238384Sjkim	nop
374238384Sjkim___
375238384Sjkim
# PA-RISC 2.0 code path of gcm_ghash_4bit. L$outer_ghash_pa2 runs once per
# 16 bytes of input: every byte of Xi is xor-ed with the input byte at the
# same offset before it is split into the $nlo/$nhi table offsets, then
# the nibble-by-nibble multiplication proceeds as in gcm_gmult_4bit. The
# inner loop alternates between $rem and $rem2 for the L$rem_4bit lookups.
# After each block Z is stored to Xi, $inp advances by 16 and the loop
# repeats until $inp equals the end pointer in $len. "copy $Zll,$nlo"
# after the compare-and-branch presumably hands the low word of the new
# Xi to the next iteration in place of reloading byte 15 - confirm.
376238384Sjkim$code.=<<___;
377238384Sjkim	ldb	15($Xi),$nlo
378238384Sjkim	ldo	8($Htbl),$Hll
379238384Sjkim
380238384SjkimL\$outer_ghash_pa2
381238384Sjkim	ldb	15($inp),$nhi
382238384Sjkim	xor	$nhi,$nlo,$nlo
383238384Sjkim	and	$mask0xf0,$nlo,$nhi
384238384Sjkim	depd,z	$nlo,59,4,$nlo
385238384Sjkim
386238384Sjkim	ldd	$nlo($Hll),$Zll
387238384Sjkim	ldd	$nlo($Hhh),$Zhh
388238384Sjkim
389238384Sjkim	depd,z	$Zll,60,4,$rem
390238384Sjkim	shrpd	$Zhh,$Zll,4,$Zll
391238384Sjkim	extrd,u	$Zhh,59,60,$Zhh
392238384Sjkim	ldb	14($Xi),$nlo
393238384Sjkim	ldb	14($inp),$byte
394238384Sjkim
395238384Sjkim	ldd	$nhi($Hll),$Tll
396238384Sjkim	ldd	$nhi($Hhh),$Thh
397238384Sjkim	xor	$byte,$nlo,$nlo
398238384Sjkim	and	$mask0xf0,$nlo,$nhi
399238384Sjkim	depd,z	$nlo,59,4,$nlo
400238384Sjkim
401238384Sjkim	xor	$Tll,$Zll,$Zll
402238384Sjkim	xor	$Thh,$Zhh,$Zhh
403238384Sjkim	ldd	$rem($rem_4bit),$rem
404238384Sjkim	b	L\$oop_ghash_pa2
405238384Sjkim	ldi	13,$cnt
406238384Sjkim
407238384Sjkim	.ALIGN	8
408238384SjkimL\$oop_ghash_pa2
409238384Sjkim	xor	$rem,$Zhh,$Zhh		; moved here to work around gas bug
410238384Sjkim	depd,z	$Zll,60,4,$rem2
411238384Sjkim
412238384Sjkim	shrpd	$Zhh,$Zll,4,$Zll
413238384Sjkim	extrd,u	$Zhh,59,60,$Zhh
414238384Sjkim	ldd	$nlo($Hll),$Tll
415238384Sjkim	ldd	$nlo($Hhh),$Thh
416238384Sjkim
417238384Sjkim	xor	$Tll,$Zll,$Zll
418238384Sjkim	xor	$Thh,$Zhh,$Zhh
419238384Sjkim	ldbx	$cnt($Xi),$nlo
420238384Sjkim	ldbx	$cnt($inp),$byte
421238384Sjkim
422238384Sjkim	depd,z	$Zll,60,4,$rem
423238384Sjkim	shrpd	$Zhh,$Zll,4,$Zll
424238384Sjkim	ldd	$rem2($rem_4bit),$rem2
425238384Sjkim
426238384Sjkim	xor	$rem2,$Zhh,$Zhh
427238384Sjkim	xor	$byte,$nlo,$nlo
428238384Sjkim	ldd	$nhi($Hll),$Tll
429238384Sjkim	ldd	$nhi($Hhh),$Thh
430238384Sjkim
431238384Sjkim	and	$mask0xf0,$nlo,$nhi
432238384Sjkim	depd,z	$nlo,59,4,$nlo
433238384Sjkim
434238384Sjkim	extrd,u	$Zhh,59,60,$Zhh
435238384Sjkim	xor	$Tll,$Zll,$Zll
436238384Sjkim
437238384Sjkim	ldd	$rem($rem_4bit),$rem
438238384Sjkim	addib,uv -1,$cnt,L\$oop_ghash_pa2
439238384Sjkim	xor	$Thh,$Zhh,$Zhh
440238384Sjkim
441238384Sjkim	xor	$rem,$Zhh,$Zhh
442238384Sjkim	depd,z	$Zll,60,4,$rem2
443238384Sjkim
444238384Sjkim	shrpd	$Zhh,$Zll,4,$Zll
445238384Sjkim	extrd,u	$Zhh,59,60,$Zhh
446238384Sjkim	ldd	$nlo($Hll),$Tll
447238384Sjkim	ldd	$nlo($Hhh),$Thh
448238384Sjkim
449238384Sjkim	xor	$Tll,$Zll,$Zll
450238384Sjkim	xor	$Thh,$Zhh,$Zhh
451238384Sjkim
452238384Sjkim	depd,z	$Zll,60,4,$rem
453238384Sjkim	shrpd	$Zhh,$Zll,4,$Zll
454238384Sjkim	ldd	$rem2($rem_4bit),$rem2
455238384Sjkim
456238384Sjkim	xor	$rem2,$Zhh,$Zhh
457238384Sjkim	ldd	$nhi($Hll),$Tll
458238384Sjkim	ldd	$nhi($Hhh),$Thh
459238384Sjkim
460238384Sjkim	extrd,u	$Zhh,59,60,$Zhh
461238384Sjkim	xor	$Tll,$Zll,$Zll
462238384Sjkim	xor	$Thh,$Zhh,$Zhh
463238384Sjkim	ldd	$rem($rem_4bit),$rem
464238384Sjkim
465238384Sjkim	xor	$rem,$Zhh,$Zhh
466238384Sjkim	std	$Zll,8($Xi)
467238384Sjkim	ldo	16($inp),$inp
468238384Sjkim	std	$Zhh,0($Xi)
469238384Sjkim	cmpb,*<> $inp,$len,L\$outer_ghash_pa2
470238384Sjkim	copy	$Zll,$nlo
471238384Sjkim___
472238384Sjkim
# PA-RISC 1.x code path of gcm_ghash_4bit, emitted only for the 32-bit
# flavour. The leading branch makes the 2.0 path that precedes it jump to
# L$done_ghash. L$outer_ghash_pa1 handles one 16-byte input block per
# pass: Xi bytes are xor-ed with the matching input bytes ($byte) before
# being split into $nlo/$nhi, and the multiplication itself follows the
# 32-bit gmult code (four words per 128-bit value, word loads from
# L$rem_4bit). After storing the four result words to Xi, $inp advances by
# 16 and the loop repeats until $inp equals the end pointer in $len.
473238384Sjkim$code.=<<___ if ($SIZE_T==4);
474238384Sjkim	b	L\$done_ghash
475238384Sjkim	nop
476238384Sjkim
477238384SjkimL\$parisc1_ghash
478238384Sjkim	ldb	15($Xi),$nlo
479238384Sjkim	ldo	12($Htbl),$Hll
480238384Sjkim	ldo	8($Htbl),$Hlh
481238384Sjkim	ldo	4($Htbl),$Hhl
482238384Sjkim
483238384SjkimL\$outer_ghash_pa1
484238384Sjkim	ldb	15($inp),$byte
485238384Sjkim	xor	$byte,$nlo,$nlo
486238384Sjkim	and	$mask0xf0,$nlo,$nhi
487238384Sjkim	zdep	$nlo,27,4,$nlo
488238384Sjkim
489238384Sjkim	ldwx	$nlo($Hll),$Zll
490238384Sjkim	ldwx	$nlo($Hlh),$Zlh
491238384Sjkim	ldwx	$nlo($Hhl),$Zhl
492238384Sjkim	ldwx	$nlo($Hhh),$Zhh
493238384Sjkim	zdep	$Zll,28,4,$rem
494238384Sjkim	ldb	14($Xi),$nlo
495238384Sjkim	ldb	14($inp),$byte
496238384Sjkim	ldwx	$rem($rem_4bit),$rem
497238384Sjkim	shrpw	$Zlh,$Zll,4,$Zll
498238384Sjkim	ldwx	$nhi($Hll),$Tll
499238384Sjkim	shrpw	$Zhl,$Zlh,4,$Zlh
500238384Sjkim	ldwx	$nhi($Hlh),$Tlh
501238384Sjkim	shrpw	$Zhh,$Zhl,4,$Zhl
502238384Sjkim	ldwx	$nhi($Hhl),$Thl
503238384Sjkim	extru	$Zhh,27,28,$Zhh
504238384Sjkim	ldwx	$nhi($Hhh),$Thh
505238384Sjkim	xor	$byte,$nlo,$nlo
506238384Sjkim	xor	$rem,$Zhh,$Zhh
507238384Sjkim	and	$mask0xf0,$nlo,$nhi
508238384Sjkim	zdep	$nlo,27,4,$nlo
509238384Sjkim
510238384Sjkim	xor	$Tll,$Zll,$Zll
511238384Sjkim	ldwx	$nlo($Hll),$Tll
512238384Sjkim	xor	$Tlh,$Zlh,$Zlh
513238384Sjkim	ldwx	$nlo($Hlh),$Tlh
514238384Sjkim	xor	$Thl,$Zhl,$Zhl
515238384Sjkim	b	L\$oop_ghash_pa1
516238384Sjkim	ldi	13,$cnt
517238384Sjkim
518238384Sjkim	.ALIGN	8
519238384SjkimL\$oop_ghash_pa1
520238384Sjkim	zdep	$Zll,28,4,$rem
521238384Sjkim	ldwx	$nlo($Hhl),$Thl
522238384Sjkim	xor	$Thh,$Zhh,$Zhh
523238384Sjkim	ldwx	$rem($rem_4bit),$rem
524238384Sjkim	shrpw	$Zlh,$Zll,4,$Zll
525238384Sjkim	ldwx	$nlo($Hhh),$Thh
526238384Sjkim	shrpw	$Zhl,$Zlh,4,$Zlh
527238384Sjkim	ldbx	$cnt($Xi),$nlo
528238384Sjkim	xor	$Tll,$Zll,$Zll
529238384Sjkim	ldwx	$nhi($Hll),$Tll
530238384Sjkim	shrpw	$Zhh,$Zhl,4,$Zhl
531238384Sjkim	ldbx	$cnt($inp),$byte
532238384Sjkim	xor	$Tlh,$Zlh,$Zlh
533238384Sjkim	ldwx	$nhi($Hlh),$Tlh
534238384Sjkim	extru	$Zhh,27,28,$Zhh
535238384Sjkim	xor	$Thl,$Zhl,$Zhl
536238384Sjkim	ldwx	$nhi($Hhl),$Thl
537238384Sjkim	xor	$rem,$Zhh,$Zhh
538238384Sjkim	zdep	$Zll,28,4,$rem
539238384Sjkim	xor	$Thh,$Zhh,$Zhh
540238384Sjkim	ldwx	$nhi($Hhh),$Thh
541238384Sjkim	shrpw	$Zlh,$Zll,4,$Zll
542238384Sjkim	ldwx	$rem($rem_4bit),$rem
543238384Sjkim	shrpw	$Zhl,$Zlh,4,$Zlh
544238384Sjkim	xor	$byte,$nlo,$nlo
545238384Sjkim	shrpw	$Zhh,$Zhl,4,$Zhl
546238384Sjkim	and	$mask0xf0,$nlo,$nhi
547238384Sjkim	extru	$Zhh,27,28,$Zhh
548238384Sjkim	zdep	$nlo,27,4,$nlo
549238384Sjkim	xor	$Tll,$Zll,$Zll
550238384Sjkim	ldwx	$nlo($Hll),$Tll
551238384Sjkim	xor	$Tlh,$Zlh,$Zlh
552238384Sjkim	ldwx	$nlo($Hlh),$Tlh
553238384Sjkim	xor	$rem,$Zhh,$Zhh
554238384Sjkim	addib,uv -1,$cnt,L\$oop_ghash_pa1
555238384Sjkim	xor	$Thl,$Zhl,$Zhl
556238384Sjkim
557238384Sjkim	zdep	$Zll,28,4,$rem
558238384Sjkim	ldwx	$nlo($Hhl),$Thl
559238384Sjkim	xor	$Thh,$Zhh,$Zhh
560238384Sjkim	ldwx	$rem($rem_4bit),$rem
561238384Sjkim	shrpw	$Zlh,$Zll,4,$Zll
562238384Sjkim	ldwx	$nlo($Hhh),$Thh
563238384Sjkim	shrpw	$Zhl,$Zlh,4,$Zlh
564238384Sjkim	xor	$Tll,$Zll,$Zll
565238384Sjkim	ldwx	$nhi($Hll),$Tll
566238384Sjkim	shrpw	$Zhh,$Zhl,4,$Zhl
567238384Sjkim	xor	$Tlh,$Zlh,$Zlh
568238384Sjkim	ldwx	$nhi($Hlh),$Tlh
569238384Sjkim	extru	$Zhh,27,28,$Zhh
570238384Sjkim	xor	$rem,$Zhh,$Zhh
571238384Sjkim	xor	$Thl,$Zhl,$Zhl
572238384Sjkim	ldwx	$nhi($Hhl),$Thl
573238384Sjkim	xor	$Thh,$Zhh,$Zhh
574238384Sjkim	ldwx	$nhi($Hhh),$Thh
575238384Sjkim	zdep	$Zll,28,4,$rem
576238384Sjkim	ldwx	$rem($rem_4bit),$rem
577238384Sjkim	shrpw	$Zlh,$Zll,4,$Zll
578238384Sjkim	shrpw	$Zhl,$Zlh,4,$Zlh
579238384Sjkim	shrpw	$Zhh,$Zhl,4,$Zhl
580238384Sjkim	extru	$Zhh,27,28,$Zhh
581238384Sjkim	xor	$Tll,$Zll,$Zll
582238384Sjkim	xor	$Tlh,$Zlh,$Zlh
583238384Sjkim	xor	$rem,$Zhh,$Zhh
584238384Sjkim	stw	$Zll,12($Xi)
585238384Sjkim	xor	$Thl,$Zhl,$Zhl
586238384Sjkim	stw	$Zlh,8($Xi)
587238384Sjkim	xor	$Thh,$Zhh,$Zhh
588238384Sjkim	stw	$Zhl,4($Xi)
589238384Sjkim	ldo	16($inp),$inp
590238384Sjkim	stw	$Zhh,0($Xi)
591238384Sjkim	comb,<>	$inp,$len,L\$outer_ghash_pa1
592238384Sjkim	copy	$Zll,$nlo
593238384Sjkim___
# Common exit of both gcm_ghash_4bit paths: reload %r2 and %r4-%r6 from
# the slots written by the prologue.
594238384Sjkim$code.=<<___;
595238384SjkimL\$done_ghash
596238384Sjkim	$POP	`-$FRAME-$SAVED_RP`(%sp),%r2		; standard epilogue
597238384Sjkim	$POP	`-$FRAME+1*$SIZE_T`(%sp),%r4
598238384Sjkim	$POP	`-$FRAME+2*$SIZE_T`(%sp),%r5
599238384Sjkim	$POP	`-$FRAME+3*$SIZE_T`(%sp),%r6
600238384Sjkim___
# 32-bit flavour: also restore %r7-%r11.
601238384Sjkim$code.=<<___ if ($SIZE_T==4);
602238384Sjkim	$POP	`-$FRAME+4*$SIZE_T`(%sp),%r7
603238384Sjkim	$POP	`-$FRAME+5*$SIZE_T`(%sp),%r8
604238384Sjkim	$POP	`-$FRAME+6*$SIZE_T`(%sp),%r9
605238384Sjkim	$POP	`-$FRAME+7*$SIZE_T`(%sp),%r10
606238384Sjkim	$POP	`-$FRAME+8*$SIZE_T`(%sp),%r11
607238384Sjkim___
# Return sequence of gcm_ghash_4bit, followed by the shared L$rem_4bit
# table: 16 entries of two .WORDs each (8 bytes per entry, 128 bytes in
# total), the first word carrying a 16-bit constant shifted left by 16 and
# the second being zero. The backquoted expressions are evaluated by the
# output loop at the end of this script.
# NOTE(review): "GRYPTOGAMS" in the identification string looks like a
# typo for CRYPTOGAMS (cf. the header comment); it is left as is because
# the string is emitted into the generated object.
608238384Sjkim$code.=<<___;
609238384Sjkim	bv	(%r2)
610238384Sjkim	.EXIT
611238384Sjkim	$POPMB	-$FRAME(%sp),%r3
612238384Sjkim	.PROCEND
613238384Sjkim
614238384Sjkim	.ALIGN	64
615238384SjkimL\$rem_4bit
616238384Sjkim	.WORD	`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
617238384Sjkim	.WORD	`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
618238384Sjkim	.WORD	`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
619238384Sjkim	.WORD	`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
620238384Sjkim	.STRINGZ "GHASH for PA-RISC, GRYPTOGAMS by <appro\@openssl.org>"
621238384Sjkim	.ALIGN	64
622238384Sjkim___
623238384Sjkim
624238384Sjkim# Explicitly encode PA-RISC 2.0 instructions used in this module, so
625238384Sjkim# that it can be compiled with .LEVEL 1.0. It should be noted that I
626238384Sjkim# wouldn't have to do this, if GNU assembler understood .ALLOW 2.0
627238384Sjkim# directive...
628238384Sjkim
# Hand-assemble "ldd" so that the generated file still builds at
# .LEVEL 1.0.
#   $mod  - completer text that followed the mnemonic ("" or e.g. ",mb")
#   $args - operand text, "%rX(%rB),%rT" or "disp(%rB),%rT"
# Returns the line to emit: a .WORD holding the encoded instruction with
# the original text kept as a trailing comment, or the instruction as
# written when the operands match neither supported form.
my $ldd = sub {
  my ($mod,$args) = @_;
  my $orig = "ldd$mod\t$args";
  my $opcode;

    if (my ($idx,$base,$dst) =
	    ($args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/))	# format 4
    {	$opcode  = (0x03<<26)|($base<<21)|($idx<<16)|(3<<6)|$dst;
    }
    elsif (my ($disp,$breg,$treg) =
	    ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/))	# format 5
    {	$opcode  = (0x03<<26)|($breg<<21)|(1<<12)|(3<<6)|$treg;
	$opcode |= (($disp&0xF)<<17)|(($disp&0x10)<<12);	# encode offset
	$opcode |= (1<<5)  if ($mod =~ /^,m/);
	$opcode |= (1<<13) if ($mod =~ /^,mb/);
    }
    else
    {	return "\t".$orig;		# leave it to the assembler
    }

    return sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
};
646238384Sjkim
# Hand-assemble "std" so that the generated file still builds at
# .LEVEL 1.0.
#   $mod  - completer text that followed the mnemonic (kept in the
#           comment only, it does not influence the encoding)
#   $args - operand text, "%rS,disp(%rB)"
# Returns a .WORD line with the original text as a trailing comment, or
# the instruction as written when the operands have another shape.
my $std = sub {
  my ($mod,$args) = @_;
  my $orig = "std$mod\t$args";

    # format 3 suffices
    my ($src,$disp,$base) =
	($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/)
	    or return "\t".$orig;

    my $opcode = (0x1c<<26)|($base<<21)|($src<<16);
    $opcode |= (($disp&0x1FF8)<<1)|(($disp>>13)&1);	# encode offset
    return sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
};
657238384Sjkim
# Hand-assemble "extrd" so that the generated file still builds at
# .LEVEL 1.0.
#   $mod  - completer text that followed the mnemonic; only a "=" (or
#           "*=") condition changes the encoding, and only in the %sar form
#   $args - operand text, "%rS,pos,len,%rT" or "%rS,%sar,len,%rT"
# Returns a .WORD line with the original text as a trailing comment, or
# the instruction as written when the operands match neither form.
my $extrd = sub {
  my ($mod,$args) = @_;
  my $orig = "extrd$mod\t$args";
  my $opcode;

    # only the ",u" completer is ever used; the encodings below imply it
    if (my ($src,$pos,$width,$dst) =
	    ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/))	# format 15
    {	my $clen = 32-$width;
	$opcode  = (0x36<<26)|($src<<21)|($dst<<16);
	$opcode |= (($pos&0x20)<<6)|(($pos&0x1f)<<5);		# encode pos
	$opcode |= (($clen&0x20)<<7)|($clen&0x1f);		# encode len
    }
    elsif (my ($sreg,$swidth,$dreg) =
	    ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/))	# format 12
    {	my $clen = 32-$swidth;
	$opcode  = (0x34<<26)|($sreg<<21)|($dreg<<16)|(2<<11)|(1<<9);
	$opcode |= (($clen&0x20)<<3)|($clen&0x1f);		# encode len
	$opcode |= (1<<13) if ($mod =~ /,\**=/);
    }
    else
    {	return "\t".$orig;		# leave it to the assembler
    }

    return sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
};
679238384Sjkim
# Hand-assemble "shrpd" so that the generated file still builds at
# .LEVEL 1.0.
#   $mod  - completer text that followed the mnemonic (kept in the
#           comment only)
#   $args - operand text, "%rA,%rB,sa,%rT" or "%rA,%rB,%sar,%rT"
# Returns a .WORD line with the original text as a trailing comment, or
# the instruction as written when the operands match neither form.
my $shrpd = sub {
  my ($mod,$args) = @_;
  my $orig = "shrpd$mod\t$args";
  my $opcode;

    if (my ($r1,$r2,$sa,$dst) =
	    ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/))	# format 14
    {	my $cpos = 63-$sa;
	$opcode  = (0x34<<26)|($r2<<21)|($r1<<16)|(1<<10)|$dst;
	$opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5);		# encode sa
    }
    elsif (my ($ra,$rb,$rt) =
	    ($args =~ /%r([0-9]+),%r([0-9]+),%sar,%r([0-9]+)/))	# format 11
    {	$opcode  = (0x34<<26)|($rb<<21)|($ra<<16)|(1<<9)|$rt;
    }
    else
    {	return "\t".$orig;		# leave it to the assembler
    }

    return sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
};
696238384Sjkim
# Hand-assemble "depd" so that the generated file still builds at
# .LEVEL 1.0.
#   $mod  - completer text that followed the mnemonic (kept in the
#           comment only)
#   $args - operand text, "%rS,pos,len,%rT"
# Returns a .WORD line with the original text as a trailing comment, or
# the instruction as written when the operands have another shape.
my $depd = sub {
  my ($mod,$args) = @_;
  my $orig = "depd$mod\t$args";

    # only the ",z" completer is ever used; the encoding below implies it
    my ($src,$pos,$width,$dst) =
	($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/)	# format 16
	    or return "\t".$orig;

    my $cpos = 63-$pos;
    my $clen = 32-$width;
    my $opcode = (0x3c<<26)|($dst<<21)|($src<<16);
    $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5);		# encode pos
    $opcode |= (($clen&0x20)<<7)|($clen&0x1f);			# encode len
    return sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
};
712238384Sjkim
# Turn one parsed instruction into output text.
#   $mnemonic - instruction name, e.g. "ldd" or "xor"
#   $mod      - completer text glued to the mnemonic (may be empty)
#   $args     - operand text
# If a scalar named after the mnemonic holds a code reference (the
# hand-written encoders such as $ldd), that encoder produces the line;
# otherwise the instruction is re-emitted as "\tmnemonic+mod\targs".
sub assemble {
  my ($mnemonic,$mod,$args)=@_;
  # string eval looks up the variable whose name equals the mnemonic
  my $encoder = eval("\$$mnemonic");

    return &$encoder($mod,$args) if (ref($encoder) eq 'CODE');
    return "\t$mnemonic$mod\t$args";
}
719238384Sjkim
# Post-process the accumulated assembly text line by line and print it.
foreach (split("\n",$code)) {
	# replace every `expr` with the value of the Perl expression
	s/\`([^\`]*)\`/eval $1/ge;
	if ($SIZE_T==4) {
		# 32-bit flavour: pass each instruction through assemble()
		# so that PA-RISC 2.0 instructions become .WORDs ...
		s/^\s+([a-z]+)([\S]*)\s+([\S]*)/assemble($1,$2,$3)/e;
		# ... and fall back to the 32-bit spelling of conditions:
		# "cmpb,*cond" becomes "comb,cond", a remaining ",*" loses
		# its "*"
		s/cmpb,\*/comb,/;
		s/,\*/,/;
	}
	# the 64-bit flavour returns with bve instead of bv
	s/\bbv\b/bve/	if ($SIZE_T==8);
	print $_,"\n";
}

# Buffered write errors (disk full, closed pipe, ...) only surface when
# the handle is closed, so a failed close must not pass silently.
close STDOUT or die "error closing STDOUT: $!";
732