ghash-sparcv9.pl revision 306195
#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# March 2010
#
# The module implements the "4-bit" GCM GHASH function and the
# underlying single multiplication operation in GF(2^128). "4-bit"
# means that it uses a 256-byte per-key table [+128-byte shared
# table]. Performance results are for the streamed GHASH subroutine
# on an UltraSPARC pre-Tx CPU and are expressed in cycles per
# processed byte, less is better:
#
#		gcc 3.3.x	cc 5.2		this assembler
#
# 32-bit build	81.4		43.3		12.6	(+546%/+244%)
# 64-bit build	20.2		21.2		12.6	(+60%/+68%)
#
# Here is data collected on an UltraSPARC T1 system running Linux:
#
#		gcc 4.4.1			this assembler
#
# 32-bit build	566				50	(+1000%)
# 64-bit build	56				50	(+12%)
#
# I don't quite understand why the difference between 32-bit and
# 64-bit compiler-generated code is so big. The compilers *were*
# instructed to generate code for UltraSPARC and should have used
# 64-bit registers for the Z vector (see C code) even in the 32-bit
# build... Oh well, it only means more impressive improvement
# coefficients for this assembler module;-) Loops are aggressively
# modulo-scheduled with respect to references to input data and Z.hi
# updates to achieve the 12-cycle timing. For comparison,
# sha1-sparcv9.pl spends 11.6 cycles to process one byte on an
# UltraSPARC pre-Tx CPU and ~24 on T1.
#
# October 2012
#
# Added a VIS3 lookup-table-free implementation using the polynomial
# multiplication xmulx[hi] and extended addition addxc[cc]
# instructions. 4.52/7.63x improvement on T3/T4, or in absolute terms
# 7.90/2.14 cycles per byte. On T4 the multi-process benchmark
# saturates at ~15.5x the single-process result on an 8-core
# processor, or ~20.5GBps per 2.85GHz socket.

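# For orientation, a minimal C sketch of the "4-bit" algorithm that
# gcm_ghash_4bit/gcm_gmult_4bit below implement, modelled on the
# generic code in OpenSSL's gcm128.c; names are illustrative, Htable
# holds the 16 nibble multiples of H (built by gcm_init_4bit, not
# shown), and rem_4bit holds the same constants as the table emitted
# below, positioned in bits 48-63 of each 64-bit entry:
#
#	typedef struct { uint64_t hi, lo; } u128;	/* <stdint.h> */
#
#	static void gmult_4bit(uint64_t Xi[2], const u128 Htable[16],
#			       const uint64_t rem_4bit[16])
#	{
#	    const uint8_t *xi = (const uint8_t *)Xi;
#	    size_t j, nib = xi[15] & 0xf;
#	    u128 Z = Htable[nib];
#
#	    for (j = 1; j < 32; j++) {	/* remaining 31 nibbles,    */
#					/* low-to-high within bytes */
#	        size_t rem = (size_t)Z.lo & 0xf;
#	        Z.lo = (Z.hi << 60) | (Z.lo >> 4);	/* Z >>= 4 */
#	        Z.hi = (Z.hi >> 4) ^ rem_4bit[rem];	/* fold 4 bits
#							   back in  */
#	        nib = (xi[15 - (j >> 1)] >> ((j & 1) << 2)) & 0xf;
#	        Z.hi ^= Htable[nib].hi;
#	        Z.lo ^= Htable[nib].lo;
#	    }
#	    Xi[0] = Z.hi;	/* big-endian store in the real code */
#	    Xi[1] = Z.lo;
#	}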
$bits=32;
for (@ARGV)     { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
if ($bits==64)  { $bias=2047; $frame=192; }
else            { $bias=0;    $frame=112; }
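# 2047 is the SPARC V9 ABI stack bias; $bias is set here for
# consistency with the other SPARCv9 perlasm modules, though only
# $frame appears to be referenced below.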

$output=shift;
open STDOUT,">$output";

$Zhi="%o0";	# 64-bit values
$Zlo="%o1";
$Thi="%o2";
$Tlo="%o3";
$rem="%o4";
$tmp="%o5";

$nhi="%l0";	# small values and pointers
$nlo="%l1";
$xi0="%l2";
$xi1="%l3";
$rem_4bit="%l4";
$remi="%l5";
$Htblo="%l6";
$cnt="%l7";

$Xi="%i0";	# input argument block
$Htbl="%i1";
$inp="%i2";
$len="%i3";

$code.=<<___ if ($bits==64);
.register	%g2,#scratch
.register	%g3,#scratch
___
$code.=<<___;
.section	".text",#alloc,#execinstr

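! rem_4bit[i] is the carry-less product i*(0xE1<<5): each entry is the
! XOR of 0x1C20<<j over the set bits j of i, and 0x1C20==0xE1<<5. The
! <<16 below places the constant in the upper half of the first
! (big-endian) .long, i.e. in bits 48-63 of the 64-bit entry, where it
! folds the four bits shifted out of Z back in according to the GCM
! polynomial.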
.align	64
rem_4bit:
	.long	`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
	.long	`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
	.long	`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
	.long	`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
.type	rem_4bit,#object
.size	rem_4bit,(.-rem_4bit)

.globl	gcm_ghash_4bit
.align	32
gcm_ghash_4bit:
	save	%sp,-$frame,%sp
	ldub	[$inp+15],$nlo
	ldub	[$Xi+15],$xi0
	ldub	[$Xi+14],$xi1
	add	$len,$inp,$len
	add	$Htbl,8,$Htblo

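	! PC-relative address of rem_4bit: "call .+8" deposits the
	! address of the call instruction itself in %o7, and the add in
	! its delay slot turns that into the address of rem_4bit
	! without touching the stack or needing a GOT.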
1:	call	.+8
	add	%o7,rem_4bit-1b,$rem_4bit

.Louter:
	xor	$xi0,$nlo,$nlo
	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	sll	$nlo,4,$nlo
	ldx	[$Htblo+$nlo],$Zlo
	ldx	[$Htbl+$nlo],$Zhi

	ldub	[$inp+14],$nlo

	ldx	[$Htblo+$nhi],$Tlo
	and	$Zlo,0xf,$remi
	ldx	[$Htbl+$nhi],$Thi
	sll	$remi,3,$remi
	ldx	[$rem_4bit+$remi],$rem
	srlx	$Zlo,4,$Zlo
	mov	13,$cnt
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo

	xor	$xi1,$nlo,$nlo
	and	$Zlo,0xf,$remi
	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	ba	.Lghash_inner
	sll	$nlo,4,$nlo
.align	32
.Lghash_inner:
	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	ldub	[$inp+$cnt],$nlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	ldub	[$Xi+$cnt],$xi1
	xor	$Thi,$Zhi,$Zhi
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$xi1,$nlo,$nlo
	srlx	$Zhi,4,$Zhi
	and	$nlo,0xf0,$nhi
	addcc	$cnt,-1,$cnt
	xor	$Zlo,$tmp,$Zlo
	and	$nlo,0x0f,$nlo
	xor	$Tlo,$Zlo,$Zlo
	sll	$nlo,4,$nlo
	blu	.Lghash_inner
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi

	add	$inp,16,$inp
	cmp	$inp,$len
	be,pn	`$bits==64?"%xcc":"%icc"`,.Ldone
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	ldub	[$inp+15],$nlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	stx	$Zlo,[$Xi+8]
	xor	$rem,$Zhi,$Zhi
	stx	$Zhi,[$Xi]
	srl	$Zlo,8,$xi1
	and	$Zlo,0xff,$xi0
	ba	.Louter
	and	$xi1,0xff,$xi1
.align	32
.Ldone:
	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	stx	$Zlo,[$Xi+8]
	xor	$rem,$Zhi,$Zhi
	stx	$Zhi,[$Xi]

	ret
	restore
.type	gcm_ghash_4bit,#function
.size	gcm_ghash_4bit,(.-gcm_ghash_4bit)
___

undef $inp;
undef $len;

$code.=<<___;
.globl	gcm_gmult_4bit
.align	32
gcm_gmult_4bit:
	save	%sp,-$frame,%sp
	ldub	[$Xi+15],$nlo
	add	$Htbl,8,$Htblo

1:	call	.+8
	add	%o7,rem_4bit-1b,$rem_4bit

	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	sll	$nlo,4,$nlo
	ldx	[$Htblo+$nlo],$Zlo
	ldx	[$Htbl+$nlo],$Zhi

	ldub	[$Xi+14],$nlo

	ldx	[$Htblo+$nhi],$Tlo
	and	$Zlo,0xf,$remi
	ldx	[$Htbl+$nhi],$Thi
	sll	$remi,3,$remi
	ldx	[$rem_4bit+$remi],$rem
	srlx	$Zlo,4,$Zlo
	mov	13,$cnt
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo

	and	$Zlo,0xf,$remi
	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	ba	.Lgmult_inner
	sll	$nlo,4,$nlo
.align	32
.Lgmult_inner:
	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	ldub	[$Xi+$cnt],$nlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	srlx	$Zhi,4,$Zhi
	and	$nlo,0xf0,$nhi
	addcc	$cnt,-1,$cnt
	xor	$Zlo,$tmp,$Zlo
	and	$nlo,0x0f,$nlo
	xor	$Tlo,$Zlo,$Zlo
	sll	$nlo,4,$nlo
	blu	.Lgmult_inner
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	stx	$Zlo,[$Xi+8]
	xor	$rem,$Zhi,$Zhi
	stx	$Zhi,[$Xi]

	ret
	restore
.type	gcm_gmult_4bit,#function
.size	gcm_gmult_4bit,(.-gcm_gmult_4bit)
___

{{{
# Straightforward 128x128-bit multiplication using the Karatsuba
# algorithm followed by a pair of 64-bit reductions [with a shortcut
# in the first one, which allows breaking the dependency between the
# reductions and removing one multiplication from the critical path].
# While it might be suboptimal with regard to the sheer number of
# multiplications, other methods [such as aggregate reduction] would
# require more 64-bit registers, which we don't have in a 32-bit
# application context.

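# A minimal C sketch of the multiplication step, where the
# hypothetical clmul_lo()/clmul_hi() helpers stand for xmulx/xmulxhi
# (low/high 64 bits of a 64x64 carry-less product); the reduction is
# omitted:
#
#	void clmul128(uint64_t r[4], uint64_t xhi, uint64_t xlo,
#		      uint64_t hhi, uint64_t hlo)
#	{
#	    uint64_t lo_l = clmul_lo(xlo, hlo), lo_h = clmul_hi(xlo, hlo);
#	    uint64_t hi_l = clmul_lo(xhi, hhi), hi_h = clmul_hi(xhi, hhi);
#	    uint64_t m_l = clmul_lo(xlo ^ xhi, hlo ^ hhi);  /* 3rd mult */
#	    uint64_t m_h = clmul_hi(xlo ^ xhi, hlo ^ hhi);
#
#	    m_l ^= lo_l ^ hi_l;		/* Karatsuba post-processing */
#	    m_h ^= lo_h ^ hi_h;
#
#	    r[0] = lo_l;		/* bits   0.. 63 */
#	    r[1] = lo_h ^ m_l;		/* bits  64..127 */
#	    r[2] = hi_l ^ m_h;		/* bits 128..191 */
#	    r[3] = hi_h;		/* bits 192..255 */
#	}
#
# Three carry-less multiplications thus cost six xmulx[hi]
# instructions, versus eight for the schoolbook method.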
($Xip,$Htable,$inp,$len)=map("%i$_",(0..3));

($Hhl,$Hlo,$Hhi,$Xlo,$Xhi,$xE1,$sqr, $C0,$C1,$C2,$C3,$V)=
	(map("%o$_",(0..5,7)),map("%g$_",(1..5)));

($shl,$shr)=map("%l$_",(0..7));

# For details regarding "twisted H" see ghash-x86.pl.
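# In short, gcm_init_vis3 stores H<<1 reduced modulo the GCM
# polynomial, which lets the compact constant 0xE1<<57 drive the
# reductions. A C model of what the routine below computes:
#
#	void init_twisted_h(uint64_t Htbl[2], const uint64_t H[2])
#	{
#	    uint64_t hi = H[0], lo = H[1];
#	    uint64_t carry = (uint64_t)((int64_t)hi >> 63); /* MSB mask */
#
#	    hi = (hi << 1) | (lo >> 63);	/* H <<= 1 */
#	    lo <<= 1;
#	    hi ^= carry & (0xE1ULL << 57);	/* conditional reduction */
#	    lo ^= carry & 1;
#	    Htbl[0] = hi; Htbl[1] = lo;		/* twisted H */
#	}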
$code.=<<___;
.globl	gcm_init_vis3
.align	32
gcm_init_vis3:
	save	%sp,-$frame,%sp

	ldx	[%i1+0],$Hhi
	ldx	[%i1+8],$Hlo
	mov	0xE1,$Xhi
	mov	1,$Xlo
	sllx	$Xhi,57,$Xhi
	srax	$Hhi,63,$C0		! broadcast carry
	addcc	$Hlo,$Hlo,$Hlo		! H<<=1
	addxc	$Hhi,$Hhi,$Hhi
	and	$C0,$Xlo,$Xlo
	and	$C0,$Xhi,$Xhi
	xor	$Xlo,$Hlo,$Hlo
	xor	$Xhi,$Hhi,$Hhi
	stx	$Hlo,[%i0+8]		! save twisted H
	stx	$Hhi,[%i0+0]

	sethi	%hi(0xA0406080),$V
	sethi	%hi(0x20C0E000),%l0
	or	$V,%lo(0xA0406080),$V
	or	%l0,%lo(0x20C0E000),%l0
	sllx	$V,32,$V
	or	%l0,$V,$V		! (0xE0·i)&0xff=0xA040608020C0E000
	stx	$V,[%i0+16]

	ret
	restore
.type	gcm_init_vis3,#function
.size	gcm_init_vis3,.-gcm_init_vis3

.globl	gcm_gmult_vis3
.align	32
gcm_gmult_vis3:
	save	%sp,-$frame,%sp

	ldx	[$Xip+8],$Xlo		! load Xi
	ldx	[$Xip+0],$Xhi
	ldx	[$Htable+8],$Hlo	! load twisted H
	ldx	[$Htable+0],$Hhi

	mov	0xE1,%l7
	sllx	%l7,57,$xE1		! 57 is not a typo
	ldx	[$Htable+16],$V		! (0xE0·i)&0xff=0xA040608020C0E000

	xor	$Hhi,$Hlo,$Hhl		! Karatsuba pre-processing
	xmulx	$Xlo,$Hlo,$C0
	xor	$Xlo,$Xhi,$C2		! Karatsuba pre-processing
	xmulx	$C2,$Hhl,$C1
	xmulxhi	$Xlo,$Hlo,$Xlo
	xmulxhi	$C2,$Hhl,$C2
	xmulxhi	$Xhi,$Hhi,$C3
	xmulx	$Xhi,$Hhi,$Xhi

	sll	$C0,3,$sqr
	srlx	$V,$sqr,$sqr		! ·0xE0 [implicit &(7<<3)]
	xor	$C0,$sqr,$sqr
	sllx	$sqr,57,$sqr		! ($C0·0xE1)<<1<<56 [implicit &0x7f]
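	! The table lookup above stands in for "xmulx $C0,$xE1": the
	! low 64 bits of C0*(0xE1<<57) are just ((C0*0xE1)&0x7f)<<57,
	! so they can be had without occupying the multiplier. This is
	! the shortcut that breaks the dependency between the two
	! reductions.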

	xor	$C0,$C1,$C1		! Karatsuba post-processing
	xor	$Xlo,$C2,$C2
	 xor	$sqr,$Xlo,$Xlo		! real destination is $C1
	xor	$C3,$C2,$C2
	xor	$Xlo,$C1,$C1
	xor	$Xhi,$C2,$C2
	xor	$Xhi,$C1,$C1

	xmulxhi	$C0,$xE1,$Xlo		! ·0xE1<<1<<56
	 xor	$C0,$C2,$C2
	xmulx	$C1,$xE1,$C0
	 xor	$C1,$C3,$C3
	xmulxhi	$C1,$xE1,$C1

	xor	$Xlo,$C2,$C2
	xor	$C0,$C2,$C2
	xor	$C1,$C3,$C3

	stx	$C2,[$Xip+8]		! save Xi
	stx	$C3,[$Xip+0]

	ret
	restore
.type	gcm_gmult_vis3,#function
.size	gcm_gmult_vis3,.-gcm_gmult_vis3

.globl	gcm_ghash_vis3
.align	32
gcm_ghash_vis3:
	save	%sp,-$frame,%sp
	nop
	srln	$len,0,$len		! needed on v8+, "nop" on v9

	ldx	[$Xip+8],$C2		! load Xi
	ldx	[$Xip+0],$C3
	ldx	[$Htable+8],$Hlo	! load twisted H
	ldx	[$Htable+0],$Hhi

	mov	0xE1,%l7
	sllx	%l7,57,$xE1		! 57 is not a typo
	ldx	[$Htable+16],$V		! (0xE0·i)&0xff=0xA040608020C0E000

	and	$inp,7,$shl
	andn	$inp,7,$inp
	sll	$shl,3,$shl
	prefetch [$inp+63], 20
	sub	%g0,$shl,$shr
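	! $inp is aligned down to an 8-byte boundary; $shl/$shr hold
	! the complementary bit shifts used in .Loop to splice each
	! pair of aligned ldx words back into the original unaligned
	! 128-bit block.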

	xor	$Hhi,$Hlo,$Hhl		! Karatsuba pre-processing
.Loop:
	ldx	[$inp+8],$Xlo
	brz,pt	$shl,1f
	ldx	[$inp+0],$Xhi

	ldx	[$inp+16],$C1		! align data
	srlx	$Xlo,$shr,$C0
	sllx	$Xlo,$shl,$Xlo
	sllx	$Xhi,$shl,$Xhi
	srlx	$C1,$shr,$C1
	or	$C0,$Xhi,$Xhi
	or	$C1,$Xlo,$Xlo
1:
	add	$inp,16,$inp
	sub	$len,16,$len
	xor	$C2,$Xlo,$Xlo
	xor	$C3,$Xhi,$Xhi
	prefetch [$inp+63], 20

	xmulx	$Xlo,$Hlo,$C0
	xor	$Xlo,$Xhi,$C2		! Karatsuba pre-processing
	xmulx	$C2,$Hhl,$C1
	xmulxhi	$Xlo,$Hlo,$Xlo
	xmulxhi	$C2,$Hhl,$C2
	xmulxhi	$Xhi,$Hhi,$C3
	xmulx	$Xhi,$Hhi,$Xhi

	sll	$C0,3,$sqr
	srlx	$V,$sqr,$sqr		! ·0xE0 [implicit &(7<<3)]
	xor	$C0,$sqr,$sqr
	sllx	$sqr,57,$sqr		! ($C0·0xE1)<<1<<56 [implicit &0x7f]

	xor	$C0,$C1,$C1		! Karatsuba post-processing
	xor	$Xlo,$C2,$C2
	 xor	$sqr,$Xlo,$Xlo		! real destination is $C1
	xor	$C3,$C2,$C2
	xor	$Xlo,$C1,$C1
	xor	$Xhi,$C2,$C2
	xor	$Xhi,$C1,$C1

	xmulxhi	$C0,$xE1,$Xlo		! ·0xE1<<1<<56
	 xor	$C0,$C2,$C2
	xmulx	$C1,$xE1,$C0
	 xor	$C1,$C3,$C3
	xmulxhi	$C1,$xE1,$C1

	xor	$Xlo,$C2,$C2
	xor	$C0,$C2,$C2
	brnz,pt	$len,.Loop
	xor	$C1,$C3,$C3

	stx	$C2,[$Xip+8]		! save Xi
	stx	$C3,[$Xip+0]

	ret
	restore
.type	gcm_ghash_vis3,#function
.size	gcm_ghash_vis3,.-gcm_ghash_vis3
___
}}}
$code.=<<___;
.asciz	"GHASH for SPARCv9/VIS3, CRYPTOGAMS by <appro\@openssl.org>"
.align	4
___


# The purpose of this subroutine is to encode VIS instructions
# explicitly, so that the module can be compiled without specifying
# VIS extensions on the compiler command line, e.g. -xarch=v9 vs.
# -xarch=v9a. The idea is to preserve the option of producing a
# "universal" binary, letting the programmer detect at run-time
# whether the current CPU is VIS-capable.
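# For example, "xmulx %o1,%o2,%o0" (rs1=9, rs2=10, rd=8, opf=0x115) is
# emitted as
#
#	.word	0x91b262aa	! 0x81b00000|8<<25|9<<14|0x115<<5|10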
sub unvis3 {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
my ($ref,$opf);
my %visopf = (	"addxc"		=> 0x011,
		"addxccc"	=> 0x013,
		"xmulx"		=> 0x115,
		"xmulxhi"	=> 0x116	);

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if ($opf=$visopf{$mnemonic}) {
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%([goli])([0-9])/);
	    $_=$bias{$1}+$2;
	}

	return	sprintf ".word\t0x%08x !%s",
			0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	s/\b(xmulx[hi]*|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
		&unvis3($1,$2,$3,$4)
	 /ge;

	print $_,"\n";
}

close STDOUT;