#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# March 2010
#
# The module implements the "4-bit" GCM GHASH function and the
# underlying single multiplication operation in GF(2^128). "4-bit"
# means that it uses a 256-byte per-key table [plus a 128-byte shared
# table]. Even though the loops are aggressively modulo-scheduled with
# respect to references to Htbl and Z.hi updates, for 8 cycles per
# byte, measured performance is ~12 cycles per processed byte on a
# 21264 CPU. It seems to be a dynamic scheduling "glitch," because
# uprofile(1) indicates a uniform sample distribution, as if all
# instruction bundles executed in 1.5 cycles. In other words, it could
# have been even faster, yet 12 cycles is ~60% better than
# gcc-generated code and ~80% better than code generated by the
# vendor compiler.

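# Below, for documentation only, is a minimal Perl model of the 4-bit
# algorithm the assembly implements. It is never called and emits no
# code. It assumes a 64-bit perl, and it assumes $htbl is a reference
# to 16 [hi,lo] pairs holding the i*H products for every 4-bit i, in
# the same layout the assembly indexes (16 bytes per entry, high
# quadword first); building that table is out of scope here.

sub gf_gmult_4bit_model {
	my ($xi,$htbl) = @_;		# $xi: 16 bytes, most significant first
	my @rem4 = map { $_<<48 }	# same constants as the rem_4bit table
	    (0x0000,0x1C20,0x3840,0x2460,0x7080,0x6CA0,0x48C0,0x54E0,
	     0xE100,0xFD20,0xD940,0xC560,0x9180,0x8DA0,0xA9C0,0xB5E0);
	my $mask = 0xFFFFFFFFFFFFFFFF;
	my @x = unpack("C16",$xi);
	my ($Zhi,$Zlo) = @{$htbl->[$x[15]&0xf]};	# low nibble of last byte
	my $nhi = $x[15]>>4;
	for (my $cnt=14;;$cnt--) {
		# shift Z right by 4, fold the dropped nibble back in via
		# rem4, then xor in the entry for the pending high nibble
		my $rem = $Zlo&0xf;
		$Zlo = (($Zhi<<60)&$mask)|($Zlo>>4);
		$Zhi = ($Zhi>>4)^$rem4[$rem];
		$Zhi ^= $htbl->[$nhi][0];
		$Zlo ^= $htbl->[$nhi][1];
		last if ($cnt<0);
		my $nlo = $x[$cnt];
		$nhi = $nlo>>4;
		# same shift-and-fold for the low nibble of the next byte
		$rem = $Zlo&0xf;
		$Zlo = (($Zhi<<60)&$mask)|($Zlo>>4);
		$Zhi = ($Zhi>>4)^$rem4[$rem];
		$Zhi ^= $htbl->[$nlo&0xf][0];
		$Zlo ^= $htbl->[$nlo&0xf][1];
	}
	return ($Zhi,$Zlo);	# the assembly additionally byte-swaps Z
				# before storing it back to Xi
}
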
$cnt="v0";	# $0
$t0="t0";
$t1="t1";
$t2="t2";
$Thi0="t3";	# $4
$Tlo0="t4";
$Thi1="t5";
$Tlo1="t6";
$rem="t7";	# $8
#################
$Xi="a0";	# $16, input argument block
$Htbl="a1";
$inp="a2";
$len="a3";
$nlo="a4";	# $20
$nhi="a5";
$Zhi="t8";
$Zlo="t9";
$Xhi="t10";	# $24
$Xlo="t11";
$remp="t12";
$rem_4bit="AT";	# $28
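# AT ($28) is the assembler temporary, hence ".set noat" below;
# picmeup points it at the rem_4bit table.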

{ my $N;
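  # loop() emits one complete, fully unrolled multiplication pass: a
  # peeled first iteration, a counted loop over the rest of the low
  # half of Xi, a counted loop over the high half, and a tail with the
  # final shift-and-reduce steps. $N keeps the labels unique across
  # the two instantiations below.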
  sub loop() {

	$N++;
$code.=<<___;
.align	4
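	# Peeled first iteration: look up the Htbl entries for both
	# nibbles of the last byte of Xi, then pre-load the entries for
	# the next byte so the loop below runs software-pipelined.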
	extbl	$Xlo,7,$nlo
	and	$nlo,0xf0,$nhi
	sll	$nlo,4,$nlo
	and	$nlo,0xf0,$nlo

	addq	$nlo,$Htbl,$nlo
	ldq	$Zlo,8($nlo)
	addq	$nhi,$Htbl,$nhi
	ldq	$Zhi,0($nlo)

	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	lda	$cnt,6(zero)
	extbl	$Xlo,6,$nlo

	ldq	$Tlo1,8($nhi)
	s8addq	$remp,$rem_4bit,$remp
	ldq	$Thi1,0($nhi)
	srl	$Zlo,4,$Zlo

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	and	$nlo,0xf0,$nhi

	xor	$Tlo1,$Zlo,$Zlo
	sll	$nlo,4,$nlo
	xor	$Thi1,$Zhi,$Zhi
	and	$nlo,0xf0,$nlo

	addq	$nlo,$Htbl,$nlo
	ldq	$Tlo0,8($nlo)
	addq	$nhi,$Htbl,$nhi
	ldq	$Thi0,0($nlo)

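	# Low half of Xi, one byte per iteration: two shift-by-4/reduce
	# steps fold in the table entries fetched previously, while the
	# entries for the next byte's nibbles are loaded.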
.Looplo$N:
	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	subq	$cnt,1,$cnt
	srl	$Zlo,4,$Zlo

	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	extbl	$Xlo,$cnt,$nlo

	and	$nlo,0xf0,$nhi
	xor	$Thi0,$Zhi,$Zhi
	xor	$Tlo0,$Zlo,$Zlo
	sll	$nlo,4,$nlo


	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	and	$nlo,0xf0,$nlo
	srl	$Zlo,4,$Zlo

	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi
	addq	$nlo,$Htbl,$nlo
	addq	$nhi,$Htbl,$nhi

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	ldq	$Tlo0,8($nlo)
	xor	$t0,$Zlo,$Zlo

	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	ldq	$Thi0,0($nlo)
	bne	$cnt,.Looplo$N


	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	lda	$cnt,7(zero)
	srl	$Zlo,4,$Zlo

	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	extbl	$Xhi,$cnt,$nlo

	and	$nlo,0xf0,$nhi
	xor	$Thi0,$Zhi,$Zhi
	xor	$Tlo0,$Zlo,$Zlo
	sll	$nlo,4,$nlo

	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	and	$nlo,0xf0,$nlo
	srl	$Zlo,4,$Zlo

	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi
	addq	$nlo,$Htbl,$nlo
	addq	$nhi,$Htbl,$nhi

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	ldq	$Tlo0,8($nlo)
	xor	$t0,$Zlo,$Zlo

	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	ldq	$Thi0,0($nlo)
	unop


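	# High half of Xi: same schedule as the low-half loop above.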
.Loophi$N:
	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	subq	$cnt,1,$cnt
	srl	$Zlo,4,$Zlo

	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	extbl	$Xhi,$cnt,$nlo

	and	$nlo,0xf0,$nhi
	xor	$Thi0,$Zhi,$Zhi
	xor	$Tlo0,$Zlo,$Zlo
	sll	$nlo,4,$nlo


	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	and	$nlo,0xf0,$nlo
	srl	$Zlo,4,$Zlo

	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi
	addq	$nlo,$Htbl,$nlo
	addq	$nhi,$Htbl,$nhi

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	ldq	$Tlo0,8($nlo)
	xor	$t0,$Zlo,$Zlo

	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	ldq	$Thi0,0($nlo)
	bne	$cnt,.Loophi$N


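	# Tail: two final shift-by-4/reduce steps fold in the last pair
	# of table entries, leaving the 128-bit product in Z.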
	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	srl	$Zlo,4,$Zlo

	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo

	xor	$Tlo0,$Zlo,$Zlo
	xor	$Thi0,$Zhi,$Zhi

	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	srl	$Zlo,4,$Zlo

	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	xor	$t0,$Zlo,$Zlo
	xor	$rem,$Zhi,$Zhi
___
}}

$code=<<___;
#include <machine/asm.h>

.text

.set	noat
.set	noreorder
.globl	gcm_gmult_4bit
.align	4
.ent	gcm_gmult_4bit
gcm_gmult_4bit:
	.frame	sp,0,ra
	.prologue 0

	ldq	$Xlo,8($Xi)
	ldq	$Xhi,0($Xi)

	bsr	$t0,picmeup
	nop
___

	&loop();

$code.=<<___;
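	# Convert Z to GHASH byte order: reverse the bytes within each
	# 32-bit half with zapnot masks, then swap the two halves.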
	srl	$Zlo,24,$t0	# byte swap
	srl	$Zlo,8,$t1

	sll	$Zlo,8,$t2
	sll	$Zlo,24,$Zlo
	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1

	zapnot	$Zlo,0x88,$Zlo
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2

	or	$Zlo,$t0,$Zlo
	srl	$Zhi,24,$t0
	srl	$Zhi,8,$t1

	or	$Zlo,$t2,$Zlo
	sll	$Zhi,8,$t2
	sll	$Zhi,24,$Zhi

	srl	$Zlo,32,$Xlo
	sll	$Zlo,32,$Zlo

	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1
	or	$Zlo,$Xlo,$Xlo

	zapnot	$Zhi,0x88,$Zhi
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2

	or	$Zhi,$t0,$Zhi
	or	$Zhi,$t2,$Zhi

	srl	$Zhi,32,$Xhi
	sll	$Zhi,32,$Zhi

	or	$Zhi,$Xhi,$Xhi
	stq	$Xlo,8($Xi)
	stq	$Xhi,0($Xi)

	ret	(ra)
.end	gcm_gmult_4bit
___

$inhi="s0";
$inlo="s1";

$code.=<<___;
.globl	gcm_ghash_4bit
.align	4
.ent	gcm_ghash_4bit
gcm_ghash_4bit:
	lda	sp,-32(sp)
	stq	ra,0(sp)
	stq	s0,8(sp)
	stq	s1,16(sp)
	.mask	0x04000600,-32
	.frame	sp,32,ra
	.prologue 0

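	# The input may be unaligned, so each 16-byte block is fetched
	# as ldq_u pairs that .Louter merges with extql/extqh.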
	ldq_u	$inhi,0($inp)
	ldq_u	$Thi0,7($inp)
	ldq_u	$inlo,8($inp)
	ldq_u	$Tlo0,15($inp)
	ldq	$Xhi,0($Xi)
	ldq	$Xlo,8($Xi)

	bsr	$t0,picmeup
	nop

.Louter:
	extql	$inhi,$inp,$inhi
	extqh	$Thi0,$inp,$Thi0
	or	$inhi,$Thi0,$inhi
	lda	$inp,16($inp)

	extql	$inlo,$inp,$inlo
	extqh	$Tlo0,$inp,$Tlo0
	or	$inlo,$Tlo0,$inlo
	subq	$len,16,$len

	xor	$Xlo,$inlo,$Xlo
	xor	$Xhi,$inhi,$Xhi
___

	&loop();

$code.=<<___;
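	# Byte-swap Z as in gcm_gmult_4bit, with the loads of the next
	# input block interleaved to hide their latency.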
	srl	$Zlo,24,$t0	# byte swap
	srl	$Zlo,8,$t1

	sll	$Zlo,8,$t2
	sll	$Zlo,24,$Zlo
	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1

	zapnot	$Zlo,0x88,$Zlo
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2

	or	$Zlo,$t0,$Zlo
	srl	$Zhi,24,$t0
	srl	$Zhi,8,$t1

	or	$Zlo,$t2,$Zlo
	sll	$Zhi,8,$t2
	sll	$Zhi,24,$Zhi

	srl	$Zlo,32,$Xlo
	sll	$Zlo,32,$Zlo
	beq	$len,.Ldone

	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1
	or	$Zlo,$Xlo,$Xlo
	ldq_u	$inhi,0($inp)

	zapnot	$Zhi,0x88,$Zhi
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2
	ldq_u	$Thi0,7($inp)

	or	$Zhi,$t0,$Zhi
	or	$Zhi,$t2,$Zhi
	ldq_u	$inlo,8($inp)
	ldq_u	$Tlo0,15($inp)

	srl	$Zhi,32,$Xhi
	sll	$Zhi,32,$Zhi

	or	$Zhi,$Xhi,$Xhi
	br	zero,.Louter

.Ldone:
	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1
	or	$Zlo,$Xlo,$Xlo

	zapnot	$Zhi,0x88,$Zhi
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2

	or	$Zhi,$t0,$Zhi
	or	$Zhi,$t2,$Zhi

	srl	$Zhi,32,$Xhi
	sll	$Zhi,32,$Zhi

	or	$Zhi,$Xhi,$Xhi

	stq	$Xlo,8($Xi)
	stq	$Xhi,0($Xi)

	.set	noreorder
	/*ldq	ra,0(sp)*/
	ldq	s0,8(sp)
	ldq	s1,16(sp)
	lda	sp,32(sp)
	ret	(ra)
.end	gcm_ghash_4bit

.align	4
.ent	picmeup
picmeup:
	.frame	sp,0,$t0
	.prologue 0
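	# PIC trick: br deposits the address of .Lpic into AT, and the
	# lda then steps over the 12 bytes of lda/ret/nop to reach the
	# rem_4bit table that follows.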
	br	$rem_4bit,.Lpic
.Lpic:	lda	$rem_4bit,12($rem_4bit)
	ret	($t0)
.end	picmeup
	nop
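	# Shared 128-byte reduction table: entry i compensates for the
	# nibble shifted out at the low end of Z. Each entry is a 64-bit
	# value with a 16-bit constant in its top bits, hence the
	# "0,CONST<<16" .long pairs on little-endian Alpha.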
rem_4bit:
	.long	0,0x0000<<16, 0,0x1C20<<16, 0,0x3840<<16, 0,0x2460<<16
	.long	0,0x7080<<16, 0,0x6CA0<<16, 0,0x48C0<<16, 0,0x54E0<<16
	.long	0,0xE100<<16, 0,0xFD20<<16, 0,0xD940<<16, 0,0xC560<<16
	.long	0,0x9180<<16, 0,0x8DA0<<16, 0,0xA9C0<<16, 0,0xB5E0<<16
.ascii	"GHASH for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
.align	4

___
$output=shift and open STDOUT,">$output";
print $code;
close STDOUT;