1#!/usr/bin/env perl
2
3# ====================================================================
4# Copyright (c) 2008 Andy Polyakov <appro@openssl.org>
5#
6# This module may be used under the terms of either the GNU General
7# Public License version 2 or later, the GNU Lesser General Public
8# License version 2.1 or later, the Mozilla Public License version
9# 1.1 or the BSD License. The exact terms of either license are
10# distributed along with this module. For further details see
11# http://www.openssl.org/~appro/camellia/.
12# ====================================================================
13
14# Performance in cycles per processed byte (less is better) in
15# 'openssl speed ...' benchmark:
16#
17#			AMD64	Core2	EM64T
18# -evp camellia-128-ecb	16.7	21.0	22.7
19# + over gcc 3.4.6	+25%	+5%	0%
20#
21# camellia-128-cbc	15.7	20.4	21.1
22#
23# 128-bit key setup	128	216	205	cycles/key
24# + over gcc 3.4.6	+54%	+39%	+15%
25#
26# Numbers in "+" rows represent performance improvement over compiler
27# generated code. Key setup timings are impressive on AMD and Core2
# thanks to 64-bit operations being covertly deployed. Improvement on
# EM64T, a pre-Core2 Intel x86_64 CPU, is not as impressive, because it
# apparently emulates some 64-bit operations in [32-bit] microcode.
31
# Command line: [flavour] [output].  A single argument containing a dot
# is taken to be the output file name and the flavour is left undefined.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 targets use a different first-argument register ($arg0d) and get
# the SEH handlers/tables that are appended near the end of this script.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the perlasm translator: first next to this script, then in
# ../../perlasm relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Everything printed to STDOUT from here on goes through the translator.
# The translator path is quoted so that a directory containing spaces
# survives the shell, and failing to start the pipe is fatal rather than
# silently producing no output.
open OUT,"| \"$^X\" \"$xlate\" $flavour $output"
	or die "can't call $xlate: $!";
*STDOUT=*OUT;
45
# Map a legacy register name (%eax/%rax, %ebx/%rbx, %ecx/%rcx, %edx/%rdx)
# to the name of its bits 8-15 byte register (%ah, %bh, %ch, %dh).
# Strings that do not match are returned unchanged.
# No prototype: the sub takes one argument, and the old "()" prototype
# only worked because every caller used the &hi(...) form.
sub hi { my $r=shift; $r =~ s/%[er]([a-d])x/%${1}h/;    $r; }
# Map a 32/64-bit register name to the name of its lowest byte register:
#   %eax/%rax .. %edx/%rdx -> %al .. %dl
#   %esi/%rsi, %edi/%rdi   -> %sil, %dil
#   %rN or %rNd            -> %rNb
# Strings that match none of the patterns are returned unchanged.
# No prototype: the sub takes one argument, and the old "()" prototype
# only worked because every caller used the &lo(...) form.
sub lo { my $r=shift; $r =~ s/%[er]([a-d])x/%${1}l/;
                      $r =~ s/%[er]([sd]i)/%${1}l/;
                      $r =~ s/%(r[0-9]+)[d]?/%${1}b/;   $r; }
50
# Scratch registers used by the round function.
($t0,$t1,$t2,$t3)=("%eax","%ebx","%ecx","%edx");
# Cipher state: four 32-bit words.
@S=qw(%r8d %r9d %r10d %r11d);
# Table index registers.
($i0,$i1)=("%esi","%edi");
$Tbl="%rbp";	# size optimization
# Long-lived pointers.
($inp,$out,$key,$keyend)=("%r12","%r13","%r14","%r15");
# 32-bit name of the first integer argument register for the target ABI.
$arg0d="%edi";
$arg0d="%ecx" if ($win64);

# const unsigned int Camellia_SBOX[4][256];
# Well, sort of... Camellia_SBOX[0][] is interleaved with [1][],
# and [2][] - with [3][]. This is done to minimize code size.
# The values below are byte offsets from the start of the table; entries
# are then addressed with an index scaled by 8.
($SBOX1_1110,$SBOX4_4404)=(0,4);	# Camellia_SBOX[0], Camellia_SBOX[1]
($SBOX2_0222,$SBOX3_3033)=(2048,2052);	# Camellia_SBOX[2], Camellia_SBOX[3]
69
# Append the assembly for one round to $code.
#
#   $i     round index.  Its parity selects which pair of @S words is the
#          round input ($s0,$s1) and which pair is updated ($s2,$s3); it
#          also determines the offset of the key words that are
#          prefetched for the following round.
#   $seed  optional byte offset (default 0) added to the prefetch address
#          relative to $key.  A negative seed makes the prefetch walk the
#          key material backwards (stride -8 instead of 8).
#
# The emitted code expects $t0/$t1 to already hold the round's key words
# and leaves the next round's key words in $t0/$t1.  $t2, $t3, $i0 and
# $i1 are clobbered.
sub Camellia_Feistel {
my ($i,$seed)=@_;
$seed=0 unless defined($seed);
my $scale=$seed<0?-8:8;
my $j=($i&1)*2;
# All four names must be inside the parenthesised "my" list; written as
# "my $s0=...,$s1=..." only the first would be lexical and the rest would
# silently become package globals.
my ($s0,$s1,$s2,$s3)=@S[$j%4,($j+1)%4,($j+2)%4,($j+3)%4];

$code.=<<___;
	xor	$s0,$t0				# t0^=key[0]
	xor	$s1,$t1				# t1^=key[1]
	movz	`&hi("$t0")`,$i0		# (t0>>8)&0xff
	movz	`&lo("$t1")`,$i1		# (t1>>0)&0xff
	mov	$SBOX3_3033($Tbl,$i0,8),$t3	# t3=SBOX3_3033[0]
	mov	$SBOX1_1110($Tbl,$i1,8),$t2	# t2=SBOX1_1110[1]
	movz	`&lo("$t0")`,$i0		# (t0>>0)&0xff
	shr	\$16,$t0
	movz	`&hi("$t1")`,$i1		# (t1>>8)&0xff
	xor	$SBOX4_4404($Tbl,$i0,8),$t3	# t3^=SBOX4_4404[0]
	shr	\$16,$t1
	xor	$SBOX4_4404($Tbl,$i1,8),$t2	# t2^=SBOX4_4404[1]
	movz	`&hi("$t0")`,$i0		# (t0>>24)&0xff
	movz	`&lo("$t1")`,$i1		# (t1>>16)&0xff
	xor	$SBOX1_1110($Tbl,$i0,8),$t3	# t3^=SBOX1_1110[0]
	xor	$SBOX3_3033($Tbl,$i1,8),$t2	# t2^=SBOX3_3033[1]
	movz	`&lo("$t0")`,$i0		# (t0>>16)&0xff
	movz	`&hi("$t1")`,$i1		# (t1>>24)&0xff
	xor	$SBOX2_0222($Tbl,$i0,8),$t3	# t3^=SBOX2_0222[0]
	xor	$SBOX2_0222($Tbl,$i1,8),$t2	# t2^=SBOX2_0222[1]
	mov	`$seed+($i+1)*$scale`($key),$t1	# prefetch key[i+1]
	mov	`$seed+($i+1)*$scale+4`($key),$t0
	xor	$t3,$t2				# t2^=t3
	ror	\$8,$t3				# t3=RightRotate(t3,8)
	xor	$t2,$s2
	xor	$t2,$s3
	xor	$t3,$s3
___
}
107
# void Camellia_EncryptBlock_Rounds(
#		int grandRounds,
#		const Byte plaintext[],
#		const KEY_TABLE_TYPE keyTable,
#		Byte ciphertext[])
#
# The assembly text below starts $code and provides two entry points that
# share one body:
#   Camellia_EncryptBlock        - V1.x API.  Its first argument is a key
#                                  bit length; it is replaced by 3 plus
#                                  the borrow of (128 - keyBitLength),
#                                  i.e. 3 for lengths up to 128 and 4
#                                  above, before jumping to .Lenc_rounds.
#   Camellia_EncryptBlock_Rounds - V2 API, prototype as above.
# The body saves %rbx, %rbp, %r13-%r15, sets $keyend to
# $key + grandRounds*64, loads the 16 input bytes into @S with each
# 32-bit word byte-swapped, calls _x86_64_Camellia_encrypt, then swaps
# the words back and stores them at $out.
#
# This heredoc ends in the middle of _x86_64_Camellia_encrypt: after the
# first 16 key bytes are xor-ed into @S, .Leloop loads the key words for
# the first round and the Perl loop that follows appends six rounds.
$code=<<___;
.text

# V1.x API
.globl	Camellia_EncryptBlock
.type	Camellia_EncryptBlock,\@abi-omnipotent
.align	16
Camellia_EncryptBlock:
	movl	\$128,%eax
	subl	$arg0d,%eax
	movl	\$3,$arg0d
	adcl	\$0,$arg0d	# keyBitLength==128?3:4
	jmp	.Lenc_rounds
.size	Camellia_EncryptBlock,.-Camellia_EncryptBlock
# V2
.globl	Camellia_EncryptBlock_Rounds
.type	Camellia_EncryptBlock_Rounds,\@function,4
.align	16
.Lenc_rounds:
Camellia_EncryptBlock_Rounds:
	push	%rbx
	push	%rbp
	push	%r13
	push	%r14
	push	%r15
.Lenc_prologue:

	#mov	%rsi,$inp		# put away arguments
	mov	%rcx,$out
	mov	%rdx,$key

	shl	\$6,%edi		# process grandRounds
	lea	.LCamellia_SBOX(%rip),$Tbl
	lea	($key,%rdi),$keyend

	mov	0(%rsi),@S[0]		# load plaintext
	mov	4(%rsi),@S[1]
	mov	8(%rsi),@S[2]
	bswap	@S[0]
	mov	12(%rsi),@S[3]
	bswap	@S[1]
	bswap	@S[2]
	bswap	@S[3]

	call	_x86_64_Camellia_encrypt

	bswap	@S[0]
	bswap	@S[1]
	bswap	@S[2]
	mov	@S[0],0($out)
	bswap	@S[3]
	mov	@S[1],4($out)
	mov	@S[2],8($out)
	mov	@S[3],12($out)

	mov	0(%rsp),%r15
	mov	8(%rsp),%r14
	mov	16(%rsp),%r13
	mov	24(%rsp),%rbp
	mov	32(%rsp),%rbx
	lea	40(%rsp),%rsp
.Lenc_epilogue:
	ret
.size	Camellia_EncryptBlock_Rounds,.-Camellia_EncryptBlock_Rounds

.type	_x86_64_Camellia_encrypt,\@abi-omnipotent
.align	16
_x86_64_Camellia_encrypt:
	xor	0($key),@S[1]
	xor	4($key),@S[0]		# ^=key[0-3]
	xor	8($key),@S[3]
	xor	12($key),@S[2]
.align	16
.Leloop:
	mov	16($key),$t1		# prefetch key[4-5]
	mov	20($key),$t0

___
	# Six rounds per pass through .Leloop; the seed of 16 skips the 16
	# key bytes at the start of the current 64-byte key group.
	for ($i=0;$i<6;$i++) { Camellia_Feistel($i,16); }
# Remainder of _x86_64_Camellia_encrypt, followed by the decrypt entry
# points and the head of _x86_64_Camellia_decrypt.
#
# Encrypt loop tail: $key advances by 64 bytes.  If it has reached
# $keyend the code branches to .Ledone, which xor-s the state with the
# key words left in $t0-$t3 and stores the halves swapped into @S.
# Otherwise the and/rol/xor and or/xor mixing step described by the
# inline comments is applied to @S and the loop repeats.
#
# Camellia_DecryptBlock / Camellia_DecryptBlock_Rounds mirror the encrypt
# entry points, except that $keyend is set to the start of the key table
# and $key to table + grandRounds*64, so the decrypt core walks the key
# material downwards.
$code.=<<___;
	lea	16*4($key),$key
	cmp	$keyend,$key
	mov	8($key),$t3		# prefetch key[2-3]
	mov	12($key),$t2
	je	.Ledone

	and	@S[0],$t0
	or	@S[3],$t3
	rol	\$1,$t0
	xor	$t3,@S[2]		# s2^=s3|key[3];
	xor	$t0,@S[1]		# s1^=LeftRotate(s0&key[0],1);
	and	@S[2],$t2
	or	@S[1],$t1
	rol	\$1,$t2
	xor	$t1,@S[0]		# s0^=s1|key[1];
	xor	$t2,@S[3]		# s3^=LeftRotate(s2&key[2],1);
	jmp	.Leloop

.align	16
.Ledone:
	xor	@S[2],$t0		# SwapHalf
	xor	@S[3],$t1
	xor	@S[0],$t2
	xor	@S[1],$t3

	mov	$t0,@S[0]
	mov	$t1,@S[1]
	mov	$t2,@S[2]
	mov	$t3,@S[3]

	.byte	0xf3,0xc3		# rep ret
.size	_x86_64_Camellia_encrypt,.-_x86_64_Camellia_encrypt

# V1.x API
.globl	Camellia_DecryptBlock
.type	Camellia_DecryptBlock,\@abi-omnipotent
.align	16
Camellia_DecryptBlock:
	movl	\$128,%eax
	subl	$arg0d,%eax
	movl	\$3,$arg0d
	adcl	\$0,$arg0d	# keyBitLength==128?3:4
	jmp	.Ldec_rounds
.size	Camellia_DecryptBlock,.-Camellia_DecryptBlock
# V2
.globl	Camellia_DecryptBlock_Rounds
.type	Camellia_DecryptBlock_Rounds,\@function,4
.align	16
.Ldec_rounds:
Camellia_DecryptBlock_Rounds:
	push	%rbx
	push	%rbp
	push	%r13
	push	%r14
	push	%r15
.Ldec_prologue:

	#mov	%rsi,$inp		# put away arguments
	mov	%rcx,$out
	mov	%rdx,$keyend

	shl	\$6,%edi		# process grandRounds
	lea	.LCamellia_SBOX(%rip),$Tbl
	lea	($keyend,%rdi),$key

	mov	0(%rsi),@S[0]		# load plaintext
	mov	4(%rsi),@S[1]
	mov	8(%rsi),@S[2]
	bswap	@S[0]
	mov	12(%rsi),@S[3]
	bswap	@S[1]
	bswap	@S[2]
	bswap	@S[3]

	call	_x86_64_Camellia_decrypt

	bswap	@S[0]
	bswap	@S[1]
	bswap	@S[2]
	mov	@S[0],0($out)
	bswap	@S[3]
	mov	@S[1],4($out)
	mov	@S[2],8($out)
	mov	@S[3],12($out)

	mov	0(%rsp),%r15
	mov	8(%rsp),%r14
	mov	16(%rsp),%r13
	mov	24(%rsp),%rbp
	mov	32(%rsp),%rbx
	lea	40(%rsp),%rsp
.Ldec_epilogue:
	ret
.size	Camellia_DecryptBlock_Rounds,.-Camellia_DecryptBlock_Rounds

.type	_x86_64_Camellia_decrypt,\@abi-omnipotent
.align	16
_x86_64_Camellia_decrypt:
	xor	0($key),@S[1]
	xor	4($key),@S[0]		# ^=key[0-3]
	xor	8($key),@S[3]
	xor	12($key),@S[2]
.align	16
.Ldloop:
	mov	-8($key),$t1		# prefetch key[4-5]
	mov	-4($key),$t0

___
	# Six rounds per pass through .Ldloop; the negative seed makes each
	# round prefetch its successor's key words at decreasing offsets
	# below $key.
	for ($i=0;$i<6;$i++) { Camellia_Feistel($i,-8); }
# Remainder of _x86_64_Camellia_decrypt.  $key moves back by 64 bytes;
# when it reaches $keyend (the start of the key table) the code branches
# to .Lddone, which xor-s the state with the key words left in $t0-$t3
# and stores the halves swapped into @S.  Otherwise the same and/rol/xor
# and or/xor mixing step as in the encrypt loop is applied and the loop
# repeats.
$code.=<<___;
	lea	-16*4($key),$key
	cmp	$keyend,$key
	mov	0($key),$t3		# prefetch key[2-3]
	mov	4($key),$t2
	je	.Lddone

	and	@S[0],$t0
	or	@S[3],$t3
	rol	\$1,$t0
	xor	$t3,@S[2]		# s2^=s3|key[3];
	xor	$t0,@S[1]		# s1^=LeftRotate(s0&key[0],1);
	and	@S[2],$t2
	or	@S[1],$t1
	rol	\$1,$t2
	xor	$t1,@S[0]		# s0^=s1|key[1];
	xor	$t2,@S[3]		# s3^=LeftRotate(s2&key[2],1);

	jmp	.Ldloop

.align	16
.Lddone:
	xor	@S[2],$t2
	xor	@S[3],$t3
	xor	@S[0],$t0
	xor	@S[1],$t1

	mov	$t2,@S[0]		# SwapHalf
	mov	$t3,@S[1]
	mov	$t0,@S[2]
	mov	$t1,@S[3]

	.byte	0xf3,0xc3		# rep ret
.size	_x86_64_Camellia_decrypt,.-_x86_64_Camellia_decrypt
___
337
# Append stores of key-schedule words to $code.
#
#   $rnd   slot index; the byte offset of the slot is $bias+$rnd*8
#   $key   base register of the destination table
#   @T     optional integer bias followed by the source registers:
#            four registers - 32-bit stores at offsets +0,+4,+8,+12,
#                             taken in the order T[1],T[0],T[3],T[2]
#            one or two     - stores at offsets +0 and (if present) +8
#
# The bias is recognised by looking like an integer literal, so an
# explicit bias of 0 is consumed as well (a truth test on int() would
# mistake it for a register name and emit a bogus store).
sub _saveround {
my ($rnd,$key,@T)=@_;
my $bias=0;
$bias=shift(@T) if (@T && $T[0] =~ /^[-+]?\d+$/);

    if ($#T==3) {
	$code.=<<___;
	mov	$T[1],`$bias+$rnd*8+0`($key)
	mov	$T[0],`$bias+$rnd*8+4`($key)
	mov	$T[3],`$bias+$rnd*8+8`($key)
	mov	$T[2],`$bias+$rnd*8+12`($key)
___
    } else {
	$code.="	mov	$T[0],`$bias+$rnd*8+0`($key)\n";
	$code.="	mov	$T[1],`$bias+$rnd*8+8`($key)\n"	if ($#T>=1);
    }
}
354
# Append loads of key-schedule words to $code; the counterpart of
# _saveround for one or two registers.
#
#   $rnd   slot index; the byte offset of the slot is $bias+$rnd*8
#   $key   base register of the source table
#   @T     optional integer bias followed by one or two destination
#          registers, loaded from offsets +0 and +8
#
# The bias is recognised by looking like an integer literal, so an
# explicit bias of 0 is consumed as well.
sub _loadround {
my ($rnd,$key,@T)=@_;
my $bias=0;
$bias=shift(@T) if (@T && $T[0] =~ /^[-+]?\d+$/);

$code.="	mov	`$bias+$rnd*8+0`($key),$T[0]\n";
$code.="	mov	`$bias+$rnd*8+8`($key),$T[1]\n"	if ($#T>=1);
}
362
# 128-bit left rotate of the register pair ($i0:$i1) built on shld, with
# %r11 as scratch.  Kept for reference only: shld is very slow on the
# Intel EM64T family, and even on AMD it limits the instruction decode
# rate [because it's VectorPath] and consequently performance...
# A zero rotate count emits nothing.
sub __rotl128 {
my ($upper,$lower,$count)=@_;

    if ($count) {
	$code.="	mov	$upper,%r11\n".
	       "	shld	\$$count,$lower,$upper\n".
	       "	shld	\$$count,%r11,$lower\n";
    }
}
377
# ... Implementing the 128-bit rotate without shld gives 80% better
# performance on EM64T, +15% on AMD64 and only ~7% degradation on
# Core2. This is therefore the preferred variant.
#
# Rotates the register pair ($i0:$i1) left by $rot bits using plain
# shifts and ors; %r9 and %r11 are clobbered as scratch.  A zero rotate
# count emits nothing.
sub _rotl128 {
my ($upper,$lower,$count)=@_;

    if ($count) {
	my @insn=(
	    "mov	$upper,%r11",
	    "shl	\$$count,$upper",
	    "mov	$lower,%r9",
	    "shr	\$`64-$count`,%r9",
	    "shr	\$`64-$count`,%r11",
	    "or	%r9,$upper",
	    "shl	\$$count,$lower",
	    "or	%r11,$lower",
	);
	$code.=join("",map { "\t$_\n" } @insn);
    }
}
397
# Camellia_Ekeygen: key schedule generator.
#
# Arguments as used by the code below: keyBitLength (1st), pointer to the
# raw key bytes (2nd), pointer to the key table being filled (3rd).  The
# value returned in %eax is 3 for a 128-bit key and 4 otherwise.
#
# keyBitLength arrives as a 32-bit quantity, so only %edi is copied; the
# 32-bit move zero-extends into $keyend, which makes the later 64-bit
# compares against 128/192 reliable even if the caller left junk in the
# upper half of %rdi.
#
# $step counts the Feistel rounds emitted so far, so that consecutive
# rounds alternate between the halves of @S and pick up successive
# .LCamellia_SIGMA constants.
{ my $step=0;

$code.=<<___;
.globl	Camellia_Ekeygen
.type	Camellia_Ekeygen,\@function,3
.align	16
Camellia_Ekeygen:
	push	%rbx
	push	%rbp
	push	%r13
	push	%r14
	push	%r15
.Lkey_prologue:

	mov	%edi,${keyend}d		# put away arguments, keyBitLength
	mov	%rdx,$out		# keyTable

	mov	0(%rsi),@S[0]		# load 0-127 bits
	mov	4(%rsi),@S[1]
	mov	8(%rsi),@S[2]
	mov	12(%rsi),@S[3]

	bswap	@S[0]
	bswap	@S[1]
	bswap	@S[2]
	bswap	@S[3]
___
	&_saveround	(0,$out,@S);	# KL<<<0
$code.=<<___;
	cmp	\$128,$keyend		# check keyBitLength
	je	.L1st128

	mov	16(%rsi),@S[0]		# load 128-191 bits
	mov	20(%rsi),@S[1]
	cmp	\$192,$keyend
	je	.L1st192
	mov	24(%rsi),@S[2]		# load 192-255 bits
	mov	28(%rsi),@S[3]
	jmp	.L1st256
.L1st192:
	mov	@S[0],@S[2]
	mov	@S[1],@S[3]
	not	@S[2]
	not	@S[3]
.L1st256:
	bswap	@S[0]
	bswap	@S[1]
	bswap	@S[2]
	bswap	@S[3]
___
	&_saveround	(4,$out,@S);	# temp storage for KR!
$code.=<<___;
	xor	0($out),@S[1]		# KR^KL
	xor	4($out),@S[0]
	xor	8($out),@S[3]
	xor	12($out),@S[2]

.L1st128:
	lea	.LCamellia_SIGMA(%rip),$key
	lea	.LCamellia_SBOX(%rip),$Tbl

	mov	0($key),$t1
	mov	4($key),$t0
___
	&Camellia_Feistel($step++);
	&Camellia_Feistel($step++);
$code.=<<___;
	xor	0($out),@S[1]		# ^KL
	xor	4($out),@S[0]
	xor	8($out),@S[3]
	xor	12($out),@S[2]
___
	&Camellia_Feistel($step++);
	&Camellia_Feistel($step++);
$code.=<<___;
	cmp	\$128,$keyend
	jne	.L2nd256

	lea	128($out),$out		# size optimization
	shl	\$32,%r8		# @S[0]||
	shl	\$32,%r10		# @S[2]||
	or	%r9,%r8			# ||@S[1]
	or	%r11,%r10		# ||@S[3]
___
	# 128-bit keys: the table is filled from KL (%rax:%rbx) and
	# KA (%r8:%r10).  $out was advanced by 128 above, hence the -128
	# bias on every access.
	&_loadround	(0,$out,-128,"%rax","%rbx");	# KL
	&_saveround	(2,$out,-128,"%r8","%r10");	# KA<<<0
	&_rotl128	("%rax","%rbx",15);
	&_saveround	(4,$out,-128,"%rax","%rbx");	# KL<<<15
	&_rotl128	("%r8","%r10",15);
	&_saveround	(6,$out,-128,"%r8","%r10");	# KA<<<15
	&_rotl128	("%r8","%r10",15);		# 15+15=30
	&_saveround	(8,$out,-128,"%r8","%r10");	# KA<<<30
	&_rotl128	("%rax","%rbx",30);		# 15+30=45
	&_saveround	(10,$out,-128,"%rax","%rbx");	# KL<<<45
	&_rotl128	("%r8","%r10",15);		# 30+15=45
	&_saveround	(12,$out,-128,"%r8");		# KA<<<45
	&_rotl128	("%rax","%rbx",15);		# 45+15=60
	&_saveround	(13,$out,-128,"%rbx");		# KL<<<60
	&_rotl128	("%r8","%r10",15);		# 45+15=60
	&_saveround	(14,$out,-128,"%r8","%r10");	# KA<<<60
	&_rotl128	("%rax","%rbx",17);		# 60+17=77
	&_saveround	(16,$out,-128,"%rax","%rbx");	# KL<<<77
	&_rotl128	("%rax","%rbx",17);		# 77+17=94
	&_saveround	(18,$out,-128,"%rax","%rbx");	# KL<<<94
	&_rotl128	("%r8","%r10",34);		# 60+34=94
	&_saveround	(20,$out,-128,"%r8","%r10");	# KA<<<94
	&_rotl128	("%rax","%rbx",17);		# 94+17=111
	&_saveround	(22,$out,-128,"%rax","%rbx");	# KL<<<111
	&_rotl128	("%r8","%r10",17);		# 94+17=111
	&_saveround	(24,$out,-128,"%r8","%r10");	# KA<<<111
$code.=<<___;
	mov	\$3,%eax
	jmp	.Ldone
.align	16
.L2nd256:
___
	&_saveround	(6,$out,@S);	# temp storage for KA!
$code.=<<___;
	xor	`4*8+0`($out),@S[1]	# KA^KR
	xor	`4*8+4`($out),@S[0]
	xor	`5*8+0`($out),@S[3]
	xor	`5*8+4`($out),@S[2]
___
	&Camellia_Feistel($step++);
	&Camellia_Feistel($step++);

	# 192/256-bit keys: KL, KR and KA are reloaded from their temporary
	# slots in the table; KB is the state left in @S by the two rounds
	# above and is packed into %r8:%r10 below.
	&_loadround	(0,$out,"%rax","%rbx");	# KL
	&_loadround	(4,$out,"%rcx","%rdx");	# KR
	&_loadround	(6,$out,"%r14","%r15");	# KA
$code.=<<___;
	lea	128($out),$out		# size optimization
	shl	\$32,%r8		# @S[0]||
	shl	\$32,%r10		# @S[2]||
	or	%r9,%r8			# ||@S[1]
	or	%r11,%r10		# ||@S[3]
___
	&_saveround	(2,$out,-128,"%r8","%r10");	# KB<<<0
	&_rotl128	("%rcx","%rdx",15);
	&_saveround	(4,$out,-128,"%rcx","%rdx");	# KR<<<15
	&_rotl128	("%r14","%r15",15);
	&_saveround	(6,$out,-128,"%r14","%r15");	# KA<<<15
	&_rotl128	("%rcx","%rdx",15);		# 15+15=30
	&_saveround	(8,$out,-128,"%rcx","%rdx");	# KR<<<30
	&_rotl128	("%r8","%r10",30);
	&_saveround	(10,$out,-128,"%r8","%r10");	# KB<<<30
	&_rotl128	("%rax","%rbx",45);
	&_saveround	(12,$out,-128,"%rax","%rbx");	# KL<<<45
	&_rotl128	("%r14","%r15",30);		# 15+30=45
	&_saveround	(14,$out,-128,"%r14","%r15");	# KA<<<45
	&_rotl128	("%rax","%rbx",15);		# 45+15=60
	&_saveround	(16,$out,-128,"%rax","%rbx");	# KL<<<60
	&_rotl128	("%rcx","%rdx",30);		# 30+30=60
	&_saveround	(18,$out,-128,"%rcx","%rdx");	# KR<<<60
	&_rotl128	("%r8","%r10",30);		# 30+30=60
	&_saveround	(20,$out,-128,"%r8","%r10");	# KB<<<60
	&_rotl128	("%rax","%rbx",17);		# 60+17=77
	&_saveround	(22,$out,-128,"%rax","%rbx");	# KL<<<77
	&_rotl128	("%r14","%r15",32);		# 45+32=77
	&_saveround	(24,$out,-128,"%r14","%r15");	# KA<<<77
	&_rotl128	("%rcx","%rdx",34);		# 60+34=94
	&_saveround	(26,$out,-128,"%rcx","%rdx");	# KR<<<94
	&_rotl128	("%r14","%r15",17);		# 77+17=94
	&_saveround	(28,$out,-128,"%r14","%r15");	# KA<<<94
	&_rotl128	("%rax","%rbx",34);		# 77+34=111
	&_saveround	(30,$out,-128,"%rax","%rbx");	# KL<<<111
	&_rotl128	("%r8","%r10",51);		# 60+51=111
	&_saveround	(32,$out,-128,"%r8","%r10");	# KB<<<111
$code.=<<___;
	mov	\$4,%eax
.Ldone:
	mov	0(%rsp),%r15
	mov	8(%rsp),%r14
	mov	16(%rsp),%r13
	mov	24(%rsp),%rbp
	mov	32(%rsp),%rbx
	lea	40(%rsp),%rsp
.Lkey_epilogue:
	ret
.size	Camellia_Ekeygen,.-Camellia_Ekeygen
___
}
579
# Base 8-bit substitution table from which the four 32-bit lookup tables
# are derived.
@SBOX=(
112,130, 44,236,179, 39,192,229,228,133, 87, 53,234, 12,174, 65,
 35,239,107,147, 69, 25,165, 33,237, 14, 79, 78, 29,101,146,189,
134,184,175,143,124,235, 31,206, 62, 48,220, 95, 94,197, 11, 26,
166,225, 57,202,213, 71, 93, 61,217,  1, 90,214, 81, 86,108, 77,
139, 13,154,102,251,204,176, 45,116, 18, 43, 32,240,177,132,153,
223, 76,203,194, 52,126,118,  5,109,183,169, 49,209, 23,  4,215,
 20, 88, 58, 97,222, 27, 17, 28, 50, 15,156, 22, 83, 24,242, 34,
254, 68,207,178,195,181,122,145, 36,  8,232,168, 96,252,105, 80,
170,208,160,125,161,137, 98,151, 84, 91, 30,149,224,255,100,210,
 16,196,  0, 72,163,247,117,219,138,  3,230,218,  9, 63,221,148,
135, 92,131,  2,205, 74,144, 51,115,103,246,243,157,127,191,226,
 82,155,216, 38,200, 55,198, 59,129,150,111, 75, 19,190, 99, 46,
233,121,167,140,159,110,188,142, 41,245,249,182, 47,253,180, 89,
120,152,  6,106,231, 70,113,186,212, 37,171, 66,136,162,141,250,
114,  7,185, 85,248,238,172, 10, 54, 73, 42,104, 60, 56,241,164,
 64, 40,211,123,187,201, 67,193, 21,227,173,244,119,199,128,158);

# Each helper below returns one 32-bit table entry for index $n as a
# "0x%08x" string.  The digits in the name give the layout of the four
# result bytes (most significant first); 0 means a zero byte.

# s,s,s,0 where s = SBOX[n].
sub S1110 {
    my ($n)=@_;
    my $s=$SBOX[$n];
    return sprintf("0x%08x",($s<<24)|($s<<16)|($s<<8));
}

# s,s,0,s where s = SBOX[n rotated left by one bit within a byte].
sub S4404 {
    my ($n)=@_;
    my $s=$SBOX[(($n<<1)|($n>>7))&0xff];
    return sprintf("0x%08x",($s<<24)|($s<<16)|$s);
}

# 0,s,s,s where s = SBOX[n] rotated left by one bit within a byte.
sub S0222 {
    my ($n)=@_;
    my $s=$SBOX[$n];
    $s=(($s<<1)|($s>>7))&0xff;
    return sprintf("0x%08x",($s<<16)|($s<<8)|$s);
}

# s,0,s,s where s = SBOX[n] rotated right by one bit within a byte.
sub S3033 {
    my ($n)=@_;
    my $s=$SBOX[$n];
    $s=(($s>>1)|($s<<7))&0xff;
    return sprintf("0x%08x",($s<<24)|($s<<8)|$s);
}
602
# Constants consumed by Camellia_Ekeygen through .LCamellia_SIGMA,
# followed by the label under which the lookup tables are emitted.
$code.=<<___;
.align	64
.LCamellia_SIGMA:
.long	0x3bcc908b, 0xa09e667f, 0x4caa73b2, 0xb67ae858
.long	0xe94f82be, 0xc6ef372f, 0xf1d36f1c, 0x54ff53a5
.long	0xde682d1d, 0x10e527fa, 0xb3e6c1fd, 0xb05688c2
.long	0,          0,          0,          0
.LCamellia_SBOX:
___
# Append one ".long" directive listing the given words to $code.
sub data_word {
    my @words=@_;
    $code.=".long\t".join(',',@words)."\n";
}
# The tables are interleaved pairwise: each emitted line carries the
# entries of two tables for the same index (first [0] with [1], then
# [2] with [3]).
$i=0;
while ($i<256) { data_word(S1110($i),S4404($i)); $i++; }
$i=0;
while ($i<256) { data_word(S0222($i),S3033($i)); $i++; }
616
# void Camellia_cbc_encrypt (const unsigned char *inp, unsigned char *out,
#			size_t length, const CAMELLIA_KEY *key,
#			unsigned char *ivp,const int enc);
#
# The function returns immediately when length is 0.  Otherwise it saves
# the callee-saved registers, carves out a 64-byte aligned frame whose
# slots are named by the variables below, reads every cache line of the
# S-box table once, and then runs either the encrypt loop or, when enc
# is 0, the decrypt loop.  A trailing partial block is handled by the
# *_tail paths, which copy the residue with "rep movsb".
#
# Note on the tail paths: the copy is bracketed by pushfq/popfq, which
# moves %rsp down by 8 while it is in flight, so "8+$ivec" there refers
# to the same memory as "$ivec" everywhere else.
{
$_key="0(%rsp)";	# key pointer to "rewind" to after each block
$_end="8(%rsp)";	# inp+len&~15
$_res="16(%rsp)";	# len&15
$ivec="24(%rsp)";	# 16-byte scratch block (offsets 24..39)
$_ivp="40(%rsp)";	# caller's ivp argument
$_rsp="48(%rsp)";	# %rsp as it was right after the register pushes

$code.=<<___;
.globl	Camellia_cbc_encrypt
.type	Camellia_cbc_encrypt,\@function,6
.align	16
Camellia_cbc_encrypt:
	cmp	\$0,%rdx
	je	.Lcbc_abort
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
.Lcbc_prologue:

	mov	%rsp,%rbp
	sub	\$64,%rsp
	and	\$-64,%rsp

	# place stack frame just "above mod 1024" the key schedule,
	# this ensures that cache associativity suffices
	lea	-64-63(%rcx),%r10
	sub	%rsp,%r10
	neg	%r10
	and	\$0x3C0,%r10
	sub	%r10,%rsp
	#add	\$8,%rsp		# 8 is reserved for callee's ra

	mov	%rdi,$inp		# inp argument
	mov	%rsi,$out		# out argument
	mov	%r8,%rbx		# ivp argument
	mov	%rcx,$key		# key argument
	mov	272(%rcx),${keyend}d	# grandRounds

	mov	%r8,$_ivp
	mov	%rbp,$_rsp

.Lcbc_body:
	lea	.LCamellia_SBOX(%rip),$Tbl

	mov	\$32,%ecx
.align	4
.Lcbc_prefetch_sbox:
	mov	0($Tbl),%rax
	mov	32($Tbl),%rsi
	mov	64($Tbl),%rdi
	mov	96($Tbl),%r11
	lea	128($Tbl),$Tbl
	loop	.Lcbc_prefetch_sbox
	sub	\$4096,$Tbl
	shl	\$6,$keyend
	mov	%rdx,%rcx		# len argument
	lea	($key,$keyend),$keyend

	cmp	\$0,%r9d		# enc argument
	je	.LCBC_DECRYPT

	and	\$-16,%rdx
	and	\$15,%rcx		# length residue
	lea	($inp,%rdx),%rdx
	mov	$key,$_key
	mov	%rdx,$_end
	mov	%rcx,$_res

	cmp	$inp,%rdx
	mov	0(%rbx),@S[0]		# load IV
	mov	4(%rbx),@S[1]
	mov	8(%rbx),@S[2]
	mov	12(%rbx),@S[3]
	je	.Lcbc_enc_tail
	jmp	.Lcbc_eloop

.align	16
.Lcbc_eloop:
	xor	0($inp),@S[0]
	xor	4($inp),@S[1]
	xor	8($inp),@S[2]
	bswap	@S[0]
	xor	12($inp),@S[3]
	bswap	@S[1]
	bswap	@S[2]
	bswap	@S[3]

	call	_x86_64_Camellia_encrypt

	mov	$_key,$key		# "rewind" the key
	bswap	@S[0]
	mov	$_end,%rdx
	bswap	@S[1]
	mov	$_res,%rcx
	bswap	@S[2]
	mov	@S[0],0($out)
	bswap	@S[3]
	mov	@S[1],4($out)
	mov	@S[2],8($out)
	lea	16($inp),$inp
	mov	@S[3],12($out)
	cmp	%rdx,$inp
	lea	16($out),$out
	jne	.Lcbc_eloop

	cmp	\$0,%rcx
	jne	.Lcbc_enc_tail

	mov	$_ivp,$out
	mov	@S[0],0($out)		# write out IV residue
	mov	@S[1],4($out)
	mov	@S[2],8($out)
	mov	@S[3],12($out)
	jmp	.Lcbc_done

.align	16
.Lcbc_enc_tail:
	xor	%rax,%rax
	mov	%rax,0+$ivec
	mov	%rax,8+$ivec
	mov	%rax,$_res

.Lcbc_enc_pushf:
	pushfq
	cld
	mov	$inp,%rsi
	lea	8+$ivec,%rdi
	.long	0x9066A4F3		# rep movsb
	popfq
.Lcbc_enc_popf:

	lea	$ivec,$inp
	lea	16+$ivec,%rax
	mov	%rax,$_end
	jmp	.Lcbc_eloop		# one more time

.align	16
.LCBC_DECRYPT:
	xchg	$key,$keyend
	add	\$15,%rdx
	and	\$15,%rcx		# length residue
	and	\$-16,%rdx
	mov	$key,$_key
	lea	($inp,%rdx),%rdx
	mov	%rdx,$_end
	mov	%rcx,$_res

	mov	(%rbx),%rax		# load IV
	mov	8(%rbx),%rbx
	jmp	.Lcbc_dloop
.align	16
.Lcbc_dloop:
	mov	0($inp),@S[0]
	mov	4($inp),@S[1]
	mov	8($inp),@S[2]
	bswap	@S[0]
	mov	12($inp),@S[3]
	bswap	@S[1]
	mov	%rax,0+$ivec		# save IV to temporary storage
	bswap	@S[2]
	mov	%rbx,8+$ivec
	bswap	@S[3]

	call	_x86_64_Camellia_decrypt

	mov	$_key,$key		# "rewind" the key
	mov	$_end,%rdx
	mov	$_res,%rcx

	bswap	@S[0]
	mov	($inp),%rax		# load IV for next iteration
	bswap	@S[1]
	mov	8($inp),%rbx
	bswap	@S[2]
	xor	0+$ivec,@S[0]
	bswap	@S[3]
	xor	4+$ivec,@S[1]
	xor	8+$ivec,@S[2]
	lea	16($inp),$inp
	xor	12+$ivec,@S[3]
	cmp	%rdx,$inp
	je	.Lcbc_ddone

	mov	@S[0],0($out)
	mov	@S[1],4($out)
	mov	@S[2],8($out)
	mov	@S[3],12($out)

	lea	16($out),$out
	jmp	.Lcbc_dloop

.align	16
.Lcbc_ddone:
	mov	$_ivp,%rdx
	cmp	\$0,%rcx
	jne	.Lcbc_dec_tail

	mov	@S[0],0($out)
	mov	@S[1],4($out)
	mov	@S[2],8($out)
	mov	@S[3],12($out)

	mov	%rax,(%rdx)		# write out IV residue
	mov	%rbx,8(%rdx)
	jmp	.Lcbc_done
.align	16
.Lcbc_dec_tail:
	mov	@S[0],0+$ivec
	mov	@S[1],4+$ivec
	mov	@S[2],8+$ivec
	mov	@S[3],12+$ivec

.Lcbc_dec_pushf:
	pushfq
	cld
	lea	8+$ivec,%rsi
	lea	($out),%rdi
	.long	0x9066A4F3		# rep movsb
	popfq
.Lcbc_dec_popf:

	mov	%rax,(%rdx)		# write out IV residue
	mov	%rbx,8(%rdx)
	jmp	.Lcbc_done

.align	16
.Lcbc_done:
	mov	$_rsp,%rcx
	mov	0(%rcx),%r15
	mov	8(%rcx),%r14
	mov	16(%rcx),%r13
	mov	24(%rcx),%r12
	mov	32(%rcx),%rbp
	mov	40(%rcx),%rbx
	lea	48(%rcx),%rsp
.Lcbc_abort:
	ret
.size	Camellia_cbc_encrypt,.-Camellia_cbc_encrypt

.asciz	"Camellia for x86_64 by <appro\@openssl.org>"
___
}
866
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
#
# Emitted for Win64 targets only.  Two handlers are generated; both end
# in the shared .Lcommon_seh_exit tail, which copies the adjusted CONTEXT
# to disp->ContextRecord, calls RtlVirtualUnwind and returns 1
# (ExceptionContinueSearch).
#
#   common_se_handler - serves the three functions that push five
#       registers (%rbx,%rbp,%r13,%r14,%r15).  HandlerData[] holds the
#       RVAs of the function's prologue and epilogue labels; when
#       context->Rip lies between them the pushed registers are read
#       back from just above context->Rsp and written into the CONTEXT.
#   cbc_se_handler    - serves Camellia_cbc_encrypt.  Between
#       .Lcbc_prologue and .Lcbc_body it uses context->Rax as the frame
#       reference; inside the body it fetches the saved stack pointer
#       from offset 48 of the frame (the \$_rsp slot), first compensating
#       by 8 when Rip is inside one of the pushfq/popfq windows.
#
# The .pdata/.xdata sections at the end register the handlers for the
# four public functions.
if ($win64) {
# The handler's four arguments, in the registers named below.
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	common_se_handler,\@abi-omnipotent
.align	16
common_se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	lea	-64(%rsp),%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue

	lea	40(%rax),%rax
	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r13
	mov	-32(%rax),%r14
	mov	-40(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lin_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	jmp	.Lcommon_seh_exit
.size	common_se_handler,.-common_se_handler

.type	cbc_se_handler,\@abi-omnipotent
.align	16
cbc_se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	lea	-64(%rsp),%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lcbc_prologue(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lcbc_prologue
	jb	.Lin_cbc_prologue

	lea	.Lcbc_body(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lcbc_body
	jb	.Lin_cbc_frame_setup

	mov	152($context),%rax	# pull context->Rsp

	lea	.Lcbc_abort(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>=.Lcbc_abort
	jae	.Lin_cbc_prologue

	# handle pushf/popf in Camellia_cbc_encrypt
	lea	.Lcbc_enc_pushf(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<=.Lcbc_enc_pushf
	jbe	.Lin_cbc_no_flag
	lea	8(%rax),%rax
	lea	.Lcbc_enc_popf(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lcbc_enc_popf
	jb	.Lin_cbc_no_flag
	lea	-8(%rax),%rax
	lea	.Lcbc_dec_pushf(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<=.Lcbc_dec_pushf
	jbe	.Lin_cbc_no_flag
	lea	8(%rax),%rax
	lea	.Lcbc_dec_popf(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lcbc_dec_popf
	jb	.Lin_cbc_no_flag
	lea	-8(%rax),%rax

.Lin_cbc_no_flag:
	mov	48(%rax),%rax		# $_rsp
	lea	48(%rax),%rax

.Lin_cbc_frame_setup:
	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lin_cbc_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

.align	4
.Lcommon_seh_exit:

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$`1232/8`,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	lea	64(%rsp),%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	cbc_se_handler,.-cbc_se_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_Camellia_EncryptBlock_Rounds
	.rva	.LSEH_end_Camellia_EncryptBlock_Rounds
	.rva	.LSEH_info_Camellia_EncryptBlock_Rounds

	.rva	.LSEH_begin_Camellia_DecryptBlock_Rounds
	.rva	.LSEH_end_Camellia_DecryptBlock_Rounds
	.rva	.LSEH_info_Camellia_DecryptBlock_Rounds

	.rva	.LSEH_begin_Camellia_Ekeygen
	.rva	.LSEH_end_Camellia_Ekeygen
	.rva	.LSEH_info_Camellia_Ekeygen

	.rva	.LSEH_begin_Camellia_cbc_encrypt
	.rva	.LSEH_end_Camellia_cbc_encrypt
	.rva	.LSEH_info_Camellia_cbc_encrypt

.section	.xdata
.align	8
.LSEH_info_Camellia_EncryptBlock_Rounds:
	.byte	9,0,0,0
	.rva	common_se_handler
	.rva	.Lenc_prologue,.Lenc_epilogue	# HandlerData[]
.LSEH_info_Camellia_DecryptBlock_Rounds:
	.byte	9,0,0,0
	.rva	common_se_handler
	.rva	.Ldec_prologue,.Ldec_epilogue	# HandlerData[]
.LSEH_info_Camellia_Ekeygen:
	.byte	9,0,0,0
	.rva	common_se_handler
	.rva	.Lkey_prologue,.Lkey_epilogue	# HandlerData[]
.LSEH_info_Camellia_cbc_encrypt:
	.byte	9,0,0,0
	.rva	cbc_se_handler
___
}
1078
# Replace every `...` span in the accumulated text with the result of
# evaluating it as Perl (offset arithmetic, &hi()/&lo() register names),
# then hand the text to the translator pipe.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
# Closing the pipe waits for the translator; a false result means the
# write failed or the translator exited with an error, which must not go
# unnoticed by the build.
close STDOUT or die "error closing STDOUT: $!";
1082