# ghash-alpha.pl revision 1.3
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# March 2010
11#
12# The module implements "4-bit" GCM GHASH function and underlying
13# single multiplication operation in GF(2^128). "4-bit" means that it
14# uses 256 bytes per-key table [+128 bytes shared table]. Even though
15# loops are aggressively modulo-scheduled in respect to references to
16# Htbl and Z.hi updates for 8 cycles per byte, measured performance is
17# ~12 cycles per processed byte on 21264 CPU. It seems to be a dynamic
18# scheduling "glitch," because uprofile(1) indicates uniform sample
19# distribution, as if all instruction bundles execute in 1.5 cycles.
20# Meaning that it could have been even faster, yet 12 cycles is ~60%
# better than gcc-generated code and ~80% better than code generated by
# the vendor compiler.
23
24$cnt="v0";	# $0
25$t0="t0";
26$t1="t1";
27$t2="t2";
28$Thi0="t3";	# $4
29$Tlo0="t4";
30$Thi1="t5";
31$Tlo1="t6";
32$rem="t7";	# $8
33#################
34$Xi="a0";	# $16, input argument block
35$Htbl="a1";
36$inp="a2";
37$len="a3";
38$nlo="a4";	# $20
39$nhi="a5";
40$Zhi="t8";
41$Zlo="t9";
42$Xhi="t10";	# $24
43$Xlo="t11";
44$remp="t12";
45$rem_4bit="AT";	# $28
46
47{ my $N;
48  sub loop() {
49
50	$N++;
51$code.=<<___;
52.align	4
53	extbl	$Xlo,7,$nlo
54	and	$nlo,0xf0,$nhi
55	sll	$nlo,4,$nlo
56	and	$nlo,0xf0,$nlo
57
58	addq	$nlo,$Htbl,$nlo
59	ldq	$Zlo,8($nlo)
60	addq	$nhi,$Htbl,$nhi
61	ldq	$Zhi,0($nlo)
62
63	and	$Zlo,0x0f,$remp
64	sll	$Zhi,60,$t0
65	lda	$cnt,6(zero)
66	extbl	$Xlo,6,$nlo
67
68	ldq	$Tlo1,8($nhi)
69	s8addq	$remp,$rem_4bit,$remp
70	ldq	$Thi1,0($nhi)
71	srl	$Zlo,4,$Zlo
72
73	ldq	$rem,0($remp)
74	srl	$Zhi,4,$Zhi
75	xor	$t0,$Zlo,$Zlo
76	and	$nlo,0xf0,$nhi
77
78	xor	$Tlo1,$Zlo,$Zlo
79	sll	$nlo,4,$nlo
80	xor	$Thi1,$Zhi,$Zhi
81	and	$nlo,0xf0,$nlo
82
83	addq	$nlo,$Htbl,$nlo
84	ldq	$Tlo0,8($nlo)
85	addq	$nhi,$Htbl,$nhi
86	ldq	$Thi0,0($nlo)
87
88.Looplo$N:
89	and	$Zlo,0x0f,$remp
90	sll	$Zhi,60,$t0
91	subq	$cnt,1,$cnt
92	srl	$Zlo,4,$Zlo
93
94	ldq	$Tlo1,8($nhi)
95	xor	$rem,$Zhi,$Zhi
96	ldq	$Thi1,0($nhi)
97	s8addq	$remp,$rem_4bit,$remp
98
99	ldq	$rem,0($remp)
100	srl	$Zhi,4,$Zhi
101	xor	$t0,$Zlo,$Zlo
102	extbl	$Xlo,$cnt,$nlo
103
104	and	$nlo,0xf0,$nhi
105	xor	$Thi0,$Zhi,$Zhi
106	xor	$Tlo0,$Zlo,$Zlo
107	sll	$nlo,4,$nlo
108
109
110	and	$Zlo,0x0f,$remp
111	sll	$Zhi,60,$t0
112	and	$nlo,0xf0,$nlo
113	srl	$Zlo,4,$Zlo
114
115	s8addq	$remp,$rem_4bit,$remp
116	xor	$rem,$Zhi,$Zhi
117	addq	$nlo,$Htbl,$nlo
118	addq	$nhi,$Htbl,$nhi
119
120	ldq	$rem,0($remp)
121	srl	$Zhi,4,$Zhi
122	ldq	$Tlo0,8($nlo)
123	xor	$t0,$Zlo,$Zlo
124
125	xor	$Tlo1,$Zlo,$Zlo
126	xor	$Thi1,$Zhi,$Zhi
127	ldq	$Thi0,0($nlo)
128	bne	$cnt,.Looplo$N
129
130
131	and	$Zlo,0x0f,$remp
132	sll	$Zhi,60,$t0
133	lda	$cnt,7(zero)
134	srl	$Zlo,4,$Zlo
135
136	ldq	$Tlo1,8($nhi)
137	xor	$rem,$Zhi,$Zhi
138	ldq	$Thi1,0($nhi)
139	s8addq	$remp,$rem_4bit,$remp
140
141	ldq	$rem,0($remp)
142	srl	$Zhi,4,$Zhi
143	xor	$t0,$Zlo,$Zlo
144	extbl	$Xhi,$cnt,$nlo
145
146	and	$nlo,0xf0,$nhi
147	xor	$Thi0,$Zhi,$Zhi
148	xor	$Tlo0,$Zlo,$Zlo
149	sll	$nlo,4,$nlo
150
151	and	$Zlo,0x0f,$remp
152	sll	$Zhi,60,$t0
153	and	$nlo,0xf0,$nlo
154	srl	$Zlo,4,$Zlo
155
156	s8addq	$remp,$rem_4bit,$remp
157	xor	$rem,$Zhi,$Zhi
158	addq	$nlo,$Htbl,$nlo
159	addq	$nhi,$Htbl,$nhi
160
161	ldq	$rem,0($remp)
162	srl	$Zhi,4,$Zhi
163	ldq	$Tlo0,8($nlo)
164	xor	$t0,$Zlo,$Zlo
165
166	xor	$Tlo1,$Zlo,$Zlo
167	xor	$Thi1,$Zhi,$Zhi
168	ldq	$Thi0,0($nlo)
169	unop
170
171
172.Loophi$N:
173	and	$Zlo,0x0f,$remp
174	sll	$Zhi,60,$t0
175	subq	$cnt,1,$cnt
176	srl	$Zlo,4,$Zlo
177
178	ldq	$Tlo1,8($nhi)
179	xor	$rem,$Zhi,$Zhi
180	ldq	$Thi1,0($nhi)
181	s8addq	$remp,$rem_4bit,$remp
182
183	ldq	$rem,0($remp)
184	srl	$Zhi,4,$Zhi
185	xor	$t0,$Zlo,$Zlo
186	extbl	$Xhi,$cnt,$nlo
187
188	and	$nlo,0xf0,$nhi
189	xor	$Thi0,$Zhi,$Zhi
190	xor	$Tlo0,$Zlo,$Zlo
191	sll	$nlo,4,$nlo
192
193
194	and	$Zlo,0x0f,$remp
195	sll	$Zhi,60,$t0
196	and	$nlo,0xf0,$nlo
197	srl	$Zlo,4,$Zlo
198
199	s8addq	$remp,$rem_4bit,$remp
200	xor	$rem,$Zhi,$Zhi
201	addq	$nlo,$Htbl,$nlo
202	addq	$nhi,$Htbl,$nhi
203
204	ldq	$rem,0($remp)
205	srl	$Zhi,4,$Zhi
206	ldq	$Tlo0,8($nlo)
207	xor	$t0,$Zlo,$Zlo
208
209	xor	$Tlo1,$Zlo,$Zlo
210	xor	$Thi1,$Zhi,$Zhi
211	ldq	$Thi0,0($nlo)
212	bne	$cnt,.Loophi$N
213
214
215	and	$Zlo,0x0f,$remp
216	sll	$Zhi,60,$t0
217	srl	$Zlo,4,$Zlo
218
219	ldq	$Tlo1,8($nhi)
220	xor	$rem,$Zhi,$Zhi
221	ldq	$Thi1,0($nhi)
222	s8addq	$remp,$rem_4bit,$remp
223
224	ldq	$rem,0($remp)
225	srl	$Zhi,4,$Zhi
226	xor	$t0,$Zlo,$Zlo
227
228	xor	$Tlo0,$Zlo,$Zlo
229	xor	$Thi0,$Zhi,$Zhi
230
231	and	$Zlo,0x0f,$remp
232	sll	$Zhi,60,$t0
233	srl	$Zlo,4,$Zlo
234
235	s8addq	$remp,$rem_4bit,$remp
236	xor	$rem,$Zhi,$Zhi
237
238	ldq	$rem,0($remp)
239	srl	$Zhi,4,$Zhi
240	xor	$Tlo1,$Zlo,$Zlo
241	xor	$Thi1,$Zhi,$Zhi
242	xor	$t0,$Zlo,$Zlo
243	xor	$rem,$Zhi,$Zhi
244___
245}}
246
247$code=<<___;
248#include <machine/asm.h>
249
250.text
251
252.set	noat
253.set	noreorder
254.globl	gcm_gmult_4bit
255.align	4
256.ent	gcm_gmult_4bit
257gcm_gmult_4bit:
258	.frame	sp,0,ra
259	.prologue 0
260
261	ldq	$Xlo,8($Xi)
262	ldq	$Xhi,0($Xi)
263
264	lda	$rem_4bit,rem_4bit
265___
266
267	&loop();
268
269$code.=<<___;
270	srl	$Zlo,24,$t0	# byte swap
271	srl	$Zlo,8,$t1
272
273	sll	$Zlo,8,$t2
274	sll	$Zlo,24,$Zlo
275	zapnot	$t0,0x11,$t0
276	zapnot	$t1,0x22,$t1
277
278	zapnot	$Zlo,0x88,$Zlo
279	or	$t0,$t1,$t0
280	zapnot	$t2,0x44,$t2
281
282	or	$Zlo,$t0,$Zlo
283	srl	$Zhi,24,$t0
284	srl	$Zhi,8,$t1
285
286	or	$Zlo,$t2,$Zlo
287	sll	$Zhi,8,$t2
288	sll	$Zhi,24,$Zhi
289
290	srl	$Zlo,32,$Xlo
291	sll	$Zlo,32,$Zlo
292
293	zapnot	$t0,0x11,$t0
294	zapnot	$t1,0x22,$t1
295	or	$Zlo,$Xlo,$Xlo
296
297	zapnot	$Zhi,0x88,$Zhi
298	or	$t0,$t1,$t0
299	zapnot	$t2,0x44,$t2
300
301	or	$Zhi,$t0,$Zhi
302	or	$Zhi,$t2,$Zhi
303
304	srl	$Zhi,32,$Xhi
305	sll	$Zhi,32,$Zhi
306
307	or	$Zhi,$Xhi,$Xhi
308	stq	$Xlo,8($Xi)
309	stq	$Xhi,0($Xi)
310
311	ret	(ra)
312.end	gcm_gmult_4bit
313___
314
315$inhi="s0";
316$inlo="s1";
317
318$code.=<<___;
319.globl	gcm_ghash_4bit
320.align	4
321.ent	gcm_ghash_4bit
322gcm_ghash_4bit:
323	lda	sp,-32(sp)
324	stq	ra,0(sp)
325	stq	s0,8(sp)
326	stq	s1,16(sp)
327	.mask	0x04000600,-32
328	.frame	sp,32,ra
329	.prologue 0
330
331	ldq_u	$inhi,0($inp)
332	ldq_u	$Thi0,7($inp)
333	ldq_u	$inlo,8($inp)
334	ldq_u	$Tlo0,15($inp)
335	ldq	$Xhi,0($Xi)
336	ldq	$Xlo,8($Xi)
337
338	lda	$rem_4bit,rem_4bit
339
340.Louter:
341	extql	$inhi,$inp,$inhi
342	extqh	$Thi0,$inp,$Thi0
343	or	$inhi,$Thi0,$inhi
344	lda	$inp,16($inp)
345
346	extql	$inlo,$inp,$inlo
347	extqh	$Tlo0,$inp,$Tlo0
348	or	$inlo,$Tlo0,$inlo
349	subq	$len,16,$len
350
351	xor	$Xlo,$inlo,$Xlo
352	xor	$Xhi,$inhi,$Xhi
353___
354
355	&loop();
356
357$code.=<<___;
358	srl	$Zlo,24,$t0	# byte swap
359	srl	$Zlo,8,$t1
360
361	sll	$Zlo,8,$t2
362	sll	$Zlo,24,$Zlo
363	zapnot	$t0,0x11,$t0
364	zapnot	$t1,0x22,$t1
365
366	zapnot	$Zlo,0x88,$Zlo
367	or	$t0,$t1,$t0
368	zapnot	$t2,0x44,$t2
369
370	or	$Zlo,$t0,$Zlo
371	srl	$Zhi,24,$t0
372	srl	$Zhi,8,$t1
373
374	or	$Zlo,$t2,$Zlo
375	sll	$Zhi,8,$t2
376	sll	$Zhi,24,$Zhi
377
378	srl	$Zlo,32,$Xlo
379	sll	$Zlo,32,$Zlo
380	beq	$len,.Ldone
381
382	zapnot	$t0,0x11,$t0
383	zapnot	$t1,0x22,$t1
384	or	$Zlo,$Xlo,$Xlo
385	ldq_u	$inhi,0($inp)
386
387	zapnot	$Zhi,0x88,$Zhi
388	or	$t0,$t1,$t0
389	zapnot	$t2,0x44,$t2
390	ldq_u	$Thi0,7($inp)
391
392	or	$Zhi,$t0,$Zhi
393	or	$Zhi,$t2,$Zhi
394	ldq_u	$inlo,8($inp)
395	ldq_u	$Tlo0,15($inp)
396
397	srl	$Zhi,32,$Xhi
398	sll	$Zhi,32,$Zhi
399
400	or	$Zhi,$Xhi,$Xhi
401	br	zero,.Louter
402
403.Ldone:
404	zapnot	$t0,0x11,$t0
405	zapnot	$t1,0x22,$t1
406	or	$Zlo,$Xlo,$Xlo
407
408	zapnot	$Zhi,0x88,$Zhi
409	or	$t0,$t1,$t0
410	zapnot	$t2,0x44,$t2
411
412	or	$Zhi,$t0,$Zhi
413	or	$Zhi,$t2,$Zhi
414
415	srl	$Zhi,32,$Xhi
416	sll	$Zhi,32,$Zhi
417
418	or	$Zhi,$Xhi,$Xhi
419
420	stq	$Xlo,8($Xi)
421	stq	$Xhi,0($Xi)
422
423	.set	noreorder
424	/*ldq	ra,0(sp)*/
425	ldq	s0,8(sp)
426	ldq	s1,16(sp)
427	lda	sp,32(sp)
428	ret	(ra)
429.end	gcm_ghash_4bit
430
431	.section .rodata
432	.align	4
433rem_4bit:
434	.long	0,0x0000<<16, 0,0x1C20<<16, 0,0x3840<<16, 0,0x2460<<16
435	.long	0,0x7080<<16, 0,0x6CA0<<16, 0,0x48C0<<16, 0,0x54E0<<16
436	.long	0,0xE100<<16, 0,0xFD20<<16, 0,0xD940<<16, 0,0xC560<<16
437	.long	0,0x9180<<16, 0,0x8DA0<<16, 0,0xA9C0<<16, 0,0xB5E0<<16
438	.previous
439
440___
441$output=shift and open STDOUT,">$output";
442print $code;
443close STDOUT;
444
445