1238384Sjkim#!/usr/bin/env perl
2238384Sjkim
3238384Sjkim# ====================================================================
4238384Sjkim# Copyright (c) 2008 Andy Polyakov <appro@openssl.org>
5238384Sjkim#
6238384Sjkim# This module may be used under the terms of either the GNU General
7238384Sjkim# Public License version 2 or later, the GNU Lesser General Public
8238384Sjkim# License version 2.1 or later, the Mozilla Public License version
9238384Sjkim# 1.1 or the BSD License. The exact terms of either license are
10238384Sjkim# distributed along with this module. For further details see
11238384Sjkim# http://www.openssl.org/~appro/camellia/.
12238384Sjkim# ====================================================================
13238384Sjkim
14238384Sjkim# Performance in cycles per processed byte (less is better) in
15238384Sjkim# 'openssl speed ...' benchmark:
16238384Sjkim#
17238384Sjkim#			AMD K8	Core2	PIII	P4
18238384Sjkim# -evp camellia-128-ecb	21.5	22.8	27.0	28.9
19238384Sjkim# + over gcc 3.4.6	+90/11% +70/10%	+53/4%	+160/64%
20238384Sjkim# + over icc 8.0	+48/19% +21/15%	+21/17%	+55/37%
21238384Sjkim#
22238384Sjkim# camellia-128-cbc	17.3	21.1	23.9	25.9
23238384Sjkim#
24238384Sjkim# 128-bit key setup	196	280	256	240	cycles/key
25238384Sjkim# + over gcc 3.4.6	+30/0%	+17/11%	+11/0%	+63/40%
26238384Sjkim# + over icc 8.0	+18/3%	+10/0%	+10/3%	+21/10%
27238384Sjkim#
28238384Sjkim# Pairs of numbers in "+" rows represent performance improvement over
29238384Sjkim# compiler generated position-independent code, PIC, and non-PIC
30238384Sjkim# respectively. PIC results are of greater relevance, as this module
31238384Sjkim# is position-independent, i.e. suitable for a shared library or PIE.
32238384Sjkim# Position independence "costs" one register, which is why compilers
33238384Sjkim# are so close with non-PIC results, they have an extra register to
34238384Sjkim# spare. CBC results are better than ECB ones thanks to "zero-copy"
35238384Sjkim# private _x86_* interface, and are ~30-40% better than with compiler
36238384Sjkim# generated cmll_cbc.o, and reach ~80-90% of x86_64 performance on
37238384Sjkim# same CPU (where applicable).
38238384Sjkim
# Derive the directory this script lives in from $0 and make both that
# directory and its ../../perlasm sibling searchable, so that x86asm.pl
# can be loaded from either place.
39238384Sjkim$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
40238384Sjkimpush(@INC,"${dir}","${dir}../../perlasm");
41238384Sjkimrequire "x86asm.pl";
42238384Sjkim
# Non-zero enables the entry points wrapped in "if ($OPENSSL)" below.
43238384Sjkim$OPENSSL=1;
44238384Sjkim
# The first command-line argument is handed to asm_init unchanged; the
# third asm_init argument is true only when the last command-line
# argument is the literal string "386".
45238384Sjkim&asm_init($ARGV[0],"cmll-586.pl",$ARGV[$#ARGV] eq "386");
46238384Sjkim
# Register allocation shared by every routine generated below:
#   @T   - four working registers holding the 128-bit cipher state
#   $idx - scratch register: table index and prefetched key word
#   $key - pointer into the key schedule
#   $Tbl - base address of the Camellia_SBOX table
47238384Sjkim@T=("eax","ebx","ecx","edx");
48238384Sjkim$idx="esi";
49238384Sjkim$key="edi";
50238384Sjkim$Tbl="ebp";
51238384Sjkim
52238384Sjkim# stack frame layout in _x86_Camellia_* routines, frame is allocated
53238384Sjkim# by caller
54238384Sjkim$__ra=&DWP(0,"esp");	# return address
55238384Sjkim$__s0=&DWP(4,"esp");	# s0 backing store
56238384Sjkim$__s1=&DWP(8,"esp");	# s1 backing store
57238384Sjkim$__s2=&DWP(12,"esp");	# s2 backing store
58238384Sjkim$__s3=&DWP(16,"esp");	# s3 backing store
59238384Sjkim$__end=&DWP(20,"esp");	# pointer to end/start of key schedule
60238384Sjkim
61238384Sjkim# stack frame layout in Camellia_[en|de]crypt routines, which differs from
62238384Sjkim# above by 4 and overlaps by pointer to end/start of key schedule
63238384Sjkim$_end=&DWP(16,"esp");
64238384Sjkim$_esp=&DWP(20,"esp");
65238384Sjkim
66238384Sjkim# const unsigned int Camellia_SBOX[4][256];
67238384Sjkim# Well, sort of... Camellia_SBOX[0][] is interleaved with [1][],
68238384Sjkim# and [2][] - with [3][]. This is done to optimize code size.
# The values below are byte offsets from $Tbl. Lookups use a scale of 8
# on the index register, so each index step covers one interleaved
# pair of 4-byte entries, and the second pair starts at 256*8=2048.
69238384Sjkim$SBOX1_1110=0;		# Camellia_SBOX[0]
70238384Sjkim$SBOX4_4404=4;		# Camellia_SBOX[1]
71238384Sjkim$SBOX2_0222=2048;	# Camellia_SBOX[2]
72238384Sjkim$SBOX3_3033=2052;	# Camellia_SBOX[3]
73238384Sjkim&static_label("Camellia_SIGMA");
74238384Sjkim&static_label("Camellia_SBOX");
75238384Sjkim
# Emit the instructions for one Camellia Feistel round.
#
# Arguments:
#   $i     - round index. Its parity selects which two registers of @T
#            carry the input half of the state (@T[0,1] for even,
#            @T[2,3] for odd rounds); it also scales the key-schedule
#            displacement.
#   $seed  - byte displacement of the round-0 key relative to $key
#            (default 0). A negative seed makes successive rounds step
#            through the key schedule downwards instead of upwards.
#   $frame - byte displacement from %esp of the s[0-3] backing store
#            (default 0).
#
# Register contract of the emitted code: on entry $idx holds the first
# word of this round's key and the two input registers hold the active
# half of the state. On exit the two output registers hold the updated
# other half (also written back to the stack frame), and $idx holds the
# first word of the next round's key.
sub Camellia_Feistel {
my ($i,$seed,$frame)=@_;
$seed=0		if (!defined($seed));
$frame=0	if (!defined($frame));
my $scale=$seed<0?-8:8;
my $j=($i&1)*2;
# all four names are lexical; $t0/$t1 are inputs, $t2/$t3 outputs
my ($t0,$t1,$t2,$t3)=map { $T[($j+$_)%4] } (0..3);

	&xor	($t0,$idx);				# t0^=key[0]
	&xor	($t1,&DWP($seed+$i*$scale+4,$key));	# t1^=key[1]
	&movz	($idx,&HB($t0));			# (t0>>8)&0xff
	&mov	($t3,&DWP($SBOX3_3033,$Tbl,$idx,8));	# t3=SBOX3_3033[0]
	&movz	($idx,&LB($t0));			# (t0>>0)&0xff
	&xor	($t3,&DWP($SBOX4_4404,$Tbl,$idx,8));	# t3^=SBOX4_4404[0]
	&shr	($t0,16);
	&movz	($idx,&LB($t1));			# (t1>>0)&0xff
	&mov	($t2,&DWP($SBOX1_1110,$Tbl,$idx,8));	# t2=SBOX1_1110[1]
	&movz	($idx,&HB($t0));			# (t0>>24)&0xff
	&xor	($t3,&DWP($SBOX1_1110,$Tbl,$idx,8));	# t3^=SBOX1_1110[0]
	&movz	($idx,&HB($t1));			# (t1>>8)&0xff
	&xor	($t2,&DWP($SBOX4_4404,$Tbl,$idx,8));	# t2^=SBOX4_4404[1]
	&shr	($t1,16);
	&movz	($t0,&LB($t0));				# (t0>>16)&0xff
	&xor	($t3,&DWP($SBOX2_0222,$Tbl,$t0,8));	# t3^=SBOX2_0222[0]
	&movz	($idx,&HB($t1));			# (t1>>24)&0xff
	&mov	($t0,&DWP($frame+4*(($j+3)%4),"esp"));	# prefetch "s3"
	&xor	($t2,$t3);				# t2^=t3
	&rotr	($t3,8);				# t3=RightRotate(t3,8)
	&xor	($t2,&DWP($SBOX2_0222,$Tbl,$idx,8));	# t2^=SBOX2_0222[1]
	&movz	($idx,&LB($t1));			# (t1>>16)&0xff
	&mov	($t1,&DWP($frame+4*(($j+2)%4),"esp"));	# prefetch "s2"
	&xor	($t3,$t0);				# t3^=s3
	&xor	($t2,&DWP($SBOX3_3033,$Tbl,$idx,8));	# t2^=SBOX3_3033[1]
	&mov	($idx,&DWP($seed+($i+1)*$scale,$key));	# prefetch key[i+1]
	&xor	($t3,$t2);				# t3^=t2
	&mov	(&DWP($frame+4*(($j+3)%4),"esp"),$t3);	# s3=t3
	&xor	($t2,$t1);				# t2^=s2
	&mov	(&DWP($frame+4*(($j+2)%4),"esp"),$t2);	# s2=t2
}
115238384Sjkim
116238384Sjkim# void Camellia_EncryptBlock_Rounds(
117238384Sjkim#		int grandRounds,
118238384Sjkim#		const Byte plaintext[],
119238384Sjkim#		const KEY_TABLE_TYPE keyTable,
120238384Sjkim#		Byte ciphertext[])
#
# Outline of the generated routine:
#   1. carve out an aligned private stack frame, remembering the
#      original %esp in ebx and later in the frame itself;
#   2. compute keyEnd = keyTable + grandRounds*64 and store it where
#      _x86_Camellia_encrypt expects its loop terminator;
#   3. obtain the address of Camellia_SBOX position-independently;
#   4. load the 16-byte block, byte-swapping each 32-bit word;
#   5. call _x86_Camellia_encrypt, restore %esp, byte-swap and store
#      the result.
121238384Sjkim&function_begin("Camellia_EncryptBlock_Rounds");
122238384Sjkim	&mov	("eax",&wparam(0));	# load grandRounds
123238384Sjkim	&mov	($idx,&wparam(1));	# load plaintext pointer
124238384Sjkim	&mov	($key,&wparam(2));	# load key schedule pointer
125238384Sjkim
	# ebx keeps the incoming %esp until it is stored in $_esp below
126238384Sjkim	&mov	("ebx","esp");
127238384Sjkim	&sub	("esp",7*4);		# place for s[0-3],keyEnd,esp and ra
128238384Sjkim	&and	("esp",-64);
129238384Sjkim
130238384Sjkim	# place stack frame just "above mod 1024" the key schedule
131238384Sjkim	# this ensures that cache associativity of 2 suffices
132238384Sjkim	&lea	("ecx",&DWP(-64-63,$key));
133238384Sjkim	&sub	("ecx","esp");
134238384Sjkim	&neg	("ecx");
135238384Sjkim	&and	("ecx",0x3C0);	# modulo 1024, but aligned to cache-line
136238384Sjkim	&sub	("esp","ecx");
137238384Sjkim	&add	("esp",4);	# 4 is reserved for callee's return address
138238384Sjkim
	# eax = keyTable + grandRounds*64
139238384Sjkim	&shl	("eax",6);
140238384Sjkim	&lea	("eax",&DWP(0,$key,"eax"));
141238384Sjkim	&mov	($_esp,"ebx");	# save %esp
142238384Sjkim	&mov	($_end,"eax");	# save keyEnd
143238384Sjkim
	# call/pop pair yields the current address, from which the
	# address of Camellia_SBOX is formed without absolute relocations
144238384Sjkim	&call	(&label("pic_point"));
145238384Sjkim	&set_label("pic_point");
146238384Sjkim	&blindpop($Tbl);
147238384Sjkim	&lea	($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));
148238384Sjkim
149238384Sjkim	&mov	(@T[0],&DWP(0,$idx));	# load plaintext
150238384Sjkim	&mov	(@T[1],&DWP(4,$idx));
151238384Sjkim	&mov	(@T[2],&DWP(8,$idx));
152238384Sjkim	&bswap	(@T[0]);
153238384Sjkim	&mov	(@T[3],&DWP(12,$idx));
154238384Sjkim	&bswap	(@T[1]);
155238384Sjkim	&bswap	(@T[2]);
156238384Sjkim	&bswap	(@T[3]);
157238384Sjkim
158238384Sjkim	&call	("_x86_Camellia_encrypt");
159238384Sjkim
	# restore the caller's %esp before the output pointer argument
	# is fetched
160238384Sjkim	&mov	("esp",$_esp);
161238384Sjkim	&bswap	(@T[0]);
162238384Sjkim	&mov	($idx,&wparam(3));	# load ciphertext pointer
163238384Sjkim	&bswap	(@T[1]);
164238384Sjkim	&bswap	(@T[2]);
165238384Sjkim	&bswap	(@T[3]);
166238384Sjkim	&mov	(&DWP(0,$idx),@T[0]);	# write ciphertext
167238384Sjkim	&mov	(&DWP(4,$idx),@T[1]);
168238384Sjkim	&mov	(&DWP(8,$idx),@T[2]);
169238384Sjkim	&mov	(&DWP(12,$idx),@T[3]);
170238384Sjkim&function_end("Camellia_EncryptBlock_Rounds");
171238384Sjkim# V1.x API
# Camellia_EncryptBlock receives keyBitLength where the _Rounds variant
# expects grandRounds. The generated stub rewrites that argument slot
# in place and jumps to Camellia_EncryptBlock_Rounds.
172238384Sjkim&function_begin_B("Camellia_EncryptBlock");
173238384Sjkim	&mov	("eax",128);
	# the subtraction is only performed for its carry (borrow) flag,
	# which is set when keyBitLength is larger than 128
174238384Sjkim	&sub	("eax",&wparam(0));	# load keyBitLength
175238384Sjkim	&mov	("eax",3);
176238384Sjkim	&adc	("eax",0);		# keyBitLength==128?3:4
177238384Sjkim	&mov	(&wparam(0),"eax");
178238384Sjkim	&jmp	(&label("Camellia_EncryptBlock_Rounds"));
179238384Sjkim&function_end_B("Camellia_EncryptBlock");
180238384Sjkim
181238384Sjkimif ($OPENSSL) {
182238384Sjkim# void Camellia_encrypt(
183238384Sjkim#		const unsigned char *in,
184238384Sjkim#		unsigned char *out,
185238384Sjkim#		const CAMELLIA_KEY *key)
#
# Same sequence as Camellia_EncryptBlock_Rounds, except for the argument
# order and for grandRounds, which is read from byte offset 272 of the
# key structure instead of being passed as an argument.
186238384Sjkim&function_begin("Camellia_encrypt");
187238384Sjkim	&mov	($idx,&wparam(0));	# load plaintext pointer
188238384Sjkim	&mov	($key,&wparam(2));	# load key schedule pointer
189238384Sjkim
	# ebx keeps the incoming %esp until it is stored in $_esp below
190238384Sjkim	&mov	("ebx","esp");
191238384Sjkim	&sub	("esp",7*4);		# place for s[0-3],keyEnd,esp and ra
192238384Sjkim	&and	("esp",-64);
193238384Sjkim	&mov	("eax",&DWP(272,$key));	# load grandRounds counter
194238384Sjkim
195238384Sjkim	# place stack frame just "above mod 1024" the key schedule
196238384Sjkim	# this ensures that cache associativity of 2 suffices
197238384Sjkim	&lea	("ecx",&DWP(-64-63,$key));
198238384Sjkim	&sub	("ecx","esp");
199238384Sjkim	&neg	("ecx");
200238384Sjkim	&and	("ecx",0x3C0);	# modulo 1024, but aligned to cache-line
201238384Sjkim	&sub	("esp","ecx");
202238384Sjkim	&add	("esp",4);	# 4 is reserved for callee's return address
203238384Sjkim
	# eax = key schedule + grandRounds*64
204238384Sjkim	&shl	("eax",6);
205238384Sjkim	&lea	("eax",&DWP(0,$key,"eax"));
206238384Sjkim	&mov	($_esp,"ebx");	# save %esp
207238384Sjkim	&mov	($_end,"eax");	# save keyEnd
208238384Sjkim
	# position-independent computation of the Camellia_SBOX address
209238384Sjkim	&call	(&label("pic_point"));
210238384Sjkim	&set_label("pic_point");
211238384Sjkim	&blindpop($Tbl);
212238384Sjkim	&lea	($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));
213238384Sjkim
214238384Sjkim	&mov	(@T[0],&DWP(0,$idx));	# load plaintext
215238384Sjkim	&mov	(@T[1],&DWP(4,$idx));
216238384Sjkim	&mov	(@T[2],&DWP(8,$idx));
217238384Sjkim	&bswap	(@T[0]);
218238384Sjkim	&mov	(@T[3],&DWP(12,$idx));
219238384Sjkim	&bswap	(@T[1]);
220238384Sjkim	&bswap	(@T[2]);
221238384Sjkim	&bswap	(@T[3]);
222238384Sjkim
223238384Sjkim	&call	("_x86_Camellia_encrypt");
224238384Sjkim
	# restore the caller's %esp before the output pointer argument
	# is fetched
225238384Sjkim	&mov	("esp",$_esp);
226238384Sjkim	&bswap	(@T[0]);
227238384Sjkim	&mov	($idx,&wparam(1));	# load ciphertext pointer
228238384Sjkim	&bswap	(@T[1]);
229238384Sjkim	&bswap	(@T[2]);
230238384Sjkim	&bswap	(@T[3]);
231238384Sjkim	&mov	(&DWP(0,$idx),@T[0]);	# write ciphertext
232238384Sjkim	&mov	(&DWP(4,$idx),@T[1]);
233238384Sjkim	&mov	(&DWP(8,$idx),@T[2]);
234238384Sjkim	&mov	(&DWP(12,$idx),@T[3]);
235238384Sjkim&function_end("Camellia_encrypt");
236238384Sjkim}
237238384Sjkim
# Private encryption core. Register interface, as set up by the public
# wrappers: @T holds the byte-swapped input block, $key points at the
# start of the key schedule, $Tbl at Camellia_SBOX, and the caller's
# frame provides the s[0-3] slots and the end-of-schedule pointer
# ($__end). The result is returned in @T.
#
# Each loop iteration emits six Feistel rounds and advances $key by 64
# bytes. Unless the end of the schedule has been reached, the
# key-dependent mixing step shown in the trailing comments (FL/FL^-1 in
# Camellia terminology) is applied before the next iteration.
238238384Sjkim&function_begin_B("_x86_Camellia_encrypt");
239238384Sjkim	&xor	(@T[0],&DWP(0,$key));	# ^=key[0-3]
240238384Sjkim	&xor	(@T[1],&DWP(4,$key));
241238384Sjkim	&xor	(@T[2],&DWP(8,$key));
242238384Sjkim	&xor	(@T[3],&DWP(12,$key));
243238384Sjkim	&mov	($idx,&DWP(16,$key));	# prefetch key[4]
244238384Sjkim
245238384Sjkim	&mov	($__s0,@T[0]);		# save s[0-3]
246238384Sjkim	&mov	($__s1,@T[1]);
247238384Sjkim	&mov	($__s2,@T[2]);
248238384Sjkim	&mov	($__s3,@T[3]);
249238384Sjkim
250238384Sjkim&set_label("loop",16);
	# seed 16: round keys start 16 bytes into the current 64-byte
	# group; frame 4: s[0-3] sit above the return address
251238384Sjkim	for ($i=0;$i<6;$i++) { Camellia_Feistel($i,16,4); }
252238384Sjkim
253238384Sjkim	&add	($key,16*4);
254238384Sjkim	&cmp	($key,$__end);
255238384Sjkim	&je	(&label("done"));
256238384Sjkim
257238384Sjkim	# @T[0-1] are preloaded, $idx is preloaded with key[0]
	# the two halves are interleaved; the extra leading space marks
	# the instructions that belong to the s2/s3 half
258238384Sjkim	&and	($idx,@T[0]);
259238384Sjkim	 &mov	 (@T[3],$__s3);
260238384Sjkim	&rotl	($idx,1);
261238384Sjkim	 &mov	 (@T[2],@T[3]);
262238384Sjkim	&xor	(@T[1],$idx);
263238384Sjkim	 &or	 (@T[2],&DWP(12,$key));
264238384Sjkim	&mov	($__s1,@T[1]);		# s1^=LeftRotate(s0&key[0],1);
265238384Sjkim	 &xor	 (@T[2],$__s2);
266238384Sjkim
267238384Sjkim	&mov	($idx,&DWP(4,$key));
268238384Sjkim	 &mov	 ($__s2,@T[2]);		# s2^=s3|key[3];
269238384Sjkim	&or	($idx,@T[1]);
270238384Sjkim	 &and	 (@T[2],&DWP(8,$key));
271238384Sjkim	&xor	(@T[0],$idx);
272238384Sjkim	 &rotl	 (@T[2],1);
273238384Sjkim	&mov	($__s0,@T[0]);		# s0^=s1|key[1];
274238384Sjkim	 &xor	 (@T[3],@T[2]);
275238384Sjkim	&mov	($idx,&DWP(16,$key));		# prefetch key[4]
276238384Sjkim	 &mov	 ($__s3,@T[3]);		# s3^=LeftRotate(s2&key[2],1);
277238384Sjkim	&jmp	(&label("loop"));
278238384Sjkim
279238384Sjkim&set_label("done",8);
280238384Sjkim	&mov	(@T[2],@T[0]);		# SwapHalf
281238384Sjkim	&mov	(@T[3],@T[1]);
282238384Sjkim	&mov	(@T[0],$__s2);
283238384Sjkim	&mov	(@T[1],$__s3);
284238384Sjkim	&xor	(@T[0],$idx);		# $idx is preloaded with key[0]
285238384Sjkim	&xor	(@T[1],&DWP(4,$key));
286238384Sjkim	&xor	(@T[2],&DWP(8,$key));
287238384Sjkim	&xor	(@T[3],&DWP(12,$key));
288238384Sjkim	&ret	();
289238384Sjkim&function_end_B("_x86_Camellia_encrypt");
290238384Sjkim
291238384Sjkim# void Camellia_DecryptBlock_Rounds(
292238384Sjkim#		int grandRounds,
293238384Sjkim#		const Byte ciphertext[],
294238384Sjkim#		const KEY_TABLE_TYPE keyTable,
295238384Sjkim#		Byte plaintext[])
#
# Mirror image of Camellia_EncryptBlock_Rounds: the frame slot that
# holds keyEnd for encryption receives the START of the key schedule
# here, and $key is advanced to keyTable + grandRounds*64 so that
# _x86_Camellia_decrypt can walk the schedule backwards.
296238384Sjkim&function_begin("Camellia_DecryptBlock_Rounds");
297238384Sjkim	&mov	("eax",&wparam(0));	# load grandRounds
298238384Sjkim	&mov	($idx,&wparam(1));	# load ciphertext pointer
299238384Sjkim	&mov	($key,&wparam(2));	# load key schedule pointer
300238384Sjkim
	# ebx keeps the incoming %esp until it is stored in the frame
301238384Sjkim	&mov	("ebx","esp");
302238384Sjkim	&sub	("esp",7*4);		# place for s[0-3],keyEnd,esp and ra
303238384Sjkim	&and	("esp",-64);
304238384Sjkim
305238384Sjkim	# place stack frame just "above mod 1024" the key schedule
306238384Sjkim	# this ensures that cache associativity of 2 suffices
307238384Sjkim	&lea	("ecx",&DWP(-64-63,$key));
308238384Sjkim	&sub	("ecx","esp");
309238384Sjkim	&neg	("ecx");
310238384Sjkim	&and	("ecx",0x3C0);	# modulo 1024, but aligned to cache-line
311238384Sjkim	&sub	("esp","ecx");
312238384Sjkim	&add	("esp",4);	# 4 is reserved for callee's return address
313238384Sjkim
	# 4*4 and 5*4 are the same frame slots as $_end and $_esp
314238384Sjkim	&shl	("eax",6);
315238384Sjkim	&mov	(&DWP(4*4,"esp"),$key);	# save keyStart
316238384Sjkim	&lea	($key,&DWP(0,$key,"eax"));
317238384Sjkim	&mov	(&DWP(5*4,"esp"),"ebx");# save %esp
318238384Sjkim
	# position-independent computation of the Camellia_SBOX address
319238384Sjkim	&call	(&label("pic_point"));
320238384Sjkim	&set_label("pic_point");
321238384Sjkim	&blindpop($Tbl);
322238384Sjkim	&lea	($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));
323238384Sjkim
324238384Sjkim	&mov	(@T[0],&DWP(0,$idx));	# load ciphertext
325238384Sjkim	&mov	(@T[1],&DWP(4,$idx));
326238384Sjkim	&mov	(@T[2],&DWP(8,$idx));
327238384Sjkim	&bswap	(@T[0]);
328238384Sjkim	&mov	(@T[3],&DWP(12,$idx));
329238384Sjkim	&bswap	(@T[1]);
330238384Sjkim	&bswap	(@T[2]);
331238384Sjkim	&bswap	(@T[3]);
332238384Sjkim
333238384Sjkim	&call	("_x86_Camellia_decrypt");
334238384Sjkim
	# restore the caller's %esp before the output pointer argument
	# is fetched
335238384Sjkim	&mov	("esp",&DWP(5*4,"esp"));
336238384Sjkim	&bswap	(@T[0]);
337238384Sjkim	&mov	($idx,&wparam(3));	# load plaintext pointer
338238384Sjkim	&bswap	(@T[1]);
339238384Sjkim	&bswap	(@T[2]);
340238384Sjkim	&bswap	(@T[3]);
341238384Sjkim	&mov	(&DWP(0,$idx),@T[0]);	# write plaintext
342238384Sjkim	&mov	(&DWP(4,$idx),@T[1]);
343238384Sjkim	&mov	(&DWP(8,$idx),@T[2]);
344238384Sjkim	&mov	(&DWP(12,$idx),@T[3]);
345238384Sjkim&function_end("Camellia_DecryptBlock_Rounds");
346238384Sjkim# V1.x API
# Camellia_DecryptBlock receives keyBitLength where the _Rounds variant
# expects grandRounds. The generated stub rewrites that argument slot
# in place and jumps to Camellia_DecryptBlock_Rounds.
347238384Sjkim&function_begin_B("Camellia_DecryptBlock");
348238384Sjkim	&mov	("eax",128);
	# the subtraction is only performed for its carry (borrow) flag,
	# which is set when keyBitLength is larger than 128
349238384Sjkim	&sub	("eax",&wparam(0));	# load keyBitLength
350238384Sjkim	&mov	("eax",3);
351238384Sjkim	&adc	("eax",0);		# keyBitLength==128?3:4
352238384Sjkim	&mov	(&wparam(0),"eax");
353238384Sjkim	&jmp	(&label("Camellia_DecryptBlock_Rounds"));
354238384Sjkim&function_end_B("Camellia_DecryptBlock");
355238384Sjkim
356238384Sjkimif ($OPENSSL) {
357238384Sjkim# void Camellia_decrypt(
358238384Sjkim#		const unsigned char *in,
359238384Sjkim#		unsigned char *out,
360238384Sjkim#		const CAMELLIA_KEY *key)
#
# Same sequence as Camellia_DecryptBlock_Rounds, except for the argument
# order and for grandRounds, which is read from byte offset 272 of the
# key structure instead of being passed as an argument.
361238384Sjkim&function_begin("Camellia_decrypt");
362238384Sjkim	&mov	($idx,&wparam(0));	# load ciphertext pointer
363238384Sjkim	&mov	($key,&wparam(2));	# load key schedule pointer
364238384Sjkim
	# ebx keeps the incoming %esp until it is stored in the frame
365238384Sjkim	&mov	("ebx","esp");
366238384Sjkim	&sub	("esp",7*4);		# place for s[0-3],keyEnd,esp and ra
367238384Sjkim	&and	("esp",-64);
368238384Sjkim	&mov	("eax",&DWP(272,$key));	# load grandRounds counter
369238384Sjkim
370238384Sjkim	# place stack frame just "above mod 1024" the key schedule
371238384Sjkim	# this ensures that cache associativity of 2 suffices
372238384Sjkim	&lea	("ecx",&DWP(-64-63,$key));
373238384Sjkim	&sub	("ecx","esp");
374238384Sjkim	&neg	("ecx");
375238384Sjkim	&and	("ecx",0x3C0);	# modulo 1024, but aligned to cache-line
376238384Sjkim	&sub	("esp","ecx");
377238384Sjkim	&add	("esp",4);	# 4 is reserved for callee's return address
378238384Sjkim
	# 4*4 and 5*4 are the same frame slots as $_end and $_esp
379238384Sjkim	&shl	("eax",6);
380238384Sjkim	&mov	(&DWP(4*4,"esp"),$key);	# save keyStart
381238384Sjkim	&lea	($key,&DWP(0,$key,"eax"));
382238384Sjkim	&mov	(&DWP(5*4,"esp"),"ebx");# save %esp
383238384Sjkim
	# position-independent computation of the Camellia_SBOX address
384238384Sjkim	&call	(&label("pic_point"));
385238384Sjkim	&set_label("pic_point");
386238384Sjkim	&blindpop($Tbl);
387238384Sjkim	&lea	($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));
388238384Sjkim
389238384Sjkim	&mov	(@T[0],&DWP(0,$idx));	# load ciphertext
390238384Sjkim	&mov	(@T[1],&DWP(4,$idx));
391238384Sjkim	&mov	(@T[2],&DWP(8,$idx));
392238384Sjkim	&bswap	(@T[0]);
393238384Sjkim	&mov	(@T[3],&DWP(12,$idx));
394238384Sjkim	&bswap	(@T[1]);
395238384Sjkim	&bswap	(@T[2]);
396238384Sjkim	&bswap	(@T[3]);
397238384Sjkim
398238384Sjkim	&call	("_x86_Camellia_decrypt");
399238384Sjkim
	# restore the caller's %esp before the output pointer argument
	# is fetched
400238384Sjkim	&mov	("esp",&DWP(5*4,"esp"));
401238384Sjkim	&bswap	(@T[0]);
402238384Sjkim	&mov	($idx,&wparam(1));	# load plaintext pointer
403238384Sjkim	&bswap	(@T[1]);
404238384Sjkim	&bswap	(@T[2]);
405238384Sjkim	&bswap	(@T[3]);
406238384Sjkim	&mov	(&DWP(0,$idx),@T[0]);	# write plaintext
407238384Sjkim	&mov	(&DWP(4,$idx),@T[1]);
408238384Sjkim	&mov	(&DWP(8,$idx),@T[2]);
409238384Sjkim	&mov	(&DWP(12,$idx),@T[3]);
410238384Sjkim&function_end("Camellia_decrypt");
411238384Sjkim}
412238384Sjkim
# Private decryption core. Register interface, as set up by the public
# wrappers: @T holds the byte-swapped input block, $key points at the
# END of the key schedule, $Tbl at Camellia_SBOX, and the caller's
# frame provides the s[0-3] slots and the start-of-schedule pointer
# ($__end). The result is returned in @T.
#
# The structure mirrors _x86_Camellia_encrypt, but the key schedule is
# consumed backwards: Feistel rounds use a negative seed, $key is
# decremented by 64 per iteration, and the words of each 16-byte group
# are used in the opposite order.
413238384Sjkim&function_begin_B("_x86_Camellia_decrypt");
414238384Sjkim	&xor	(@T[0],&DWP(0,$key));	# ^=key[0-3]
415238384Sjkim	&xor	(@T[1],&DWP(4,$key));
416238384Sjkim	&xor	(@T[2],&DWP(8,$key));
417238384Sjkim	&xor	(@T[3],&DWP(12,$key));
418238384Sjkim	&mov	($idx,&DWP(-8,$key));	# prefetch key[-2]
419238384Sjkim
420238384Sjkim	&mov	($__s0,@T[0]);		# save s[0-3]
421238384Sjkim	&mov	($__s1,@T[1]);
422238384Sjkim	&mov	($__s2,@T[2]);
423238384Sjkim	&mov	($__s3,@T[3]);
424238384Sjkim
425238384Sjkim&set_label("loop",16);
	# seed -8: round keys are taken from below $key, stepping down;
	# frame 4: s[0-3] sit above the return address
426238384Sjkim	for ($i=0;$i<6;$i++) { Camellia_Feistel($i,-8,4); }
427238384Sjkim
428238384Sjkim	&sub	($key,16*4);
429238384Sjkim	&cmp	($key,$__end);
430238384Sjkim	&je	(&label("done"));
431238384Sjkim
432238384Sjkim	# @T[0-1] are preloaded, $idx is preloaded with key[2]
	# the two halves are interleaved; the extra leading space marks
	# the instructions that belong to the s2/s3 half
433238384Sjkim	&and	($idx,@T[0]);
434238384Sjkim	 &mov	 (@T[3],$__s3);
435238384Sjkim	&rotl	($idx,1);
436238384Sjkim	 &mov	 (@T[2],@T[3]);
437238384Sjkim	&xor	(@T[1],$idx);
438238384Sjkim	 &or	 (@T[2],&DWP(4,$key));
439238384Sjkim	&mov	($__s1,@T[1]);		# s1^=LeftRotate(s0&key[2],1);
440238384Sjkim	 &xor	 (@T[2],$__s2);
441238384Sjkim
442238384Sjkim	&mov	($idx,&DWP(12,$key));
443238384Sjkim	 &mov	 ($__s2,@T[2]);		# s2^=s3|key[1];
444238384Sjkim	&or	($idx,@T[1]);
445238384Sjkim	 &and	 (@T[2],&DWP(0,$key));
446238384Sjkim	&xor	(@T[0],$idx);
447238384Sjkim	 &rotl	 (@T[2],1);
448238384Sjkim	&mov	($__s0,@T[0]);		# s0^=s1|key[3];
449238384Sjkim	 &xor	 (@T[3],@T[2]);
450238384Sjkim	&mov	($idx,&DWP(-8,$key));	# prefetch key[-2]
451238384Sjkim	 &mov	 ($__s3,@T[3]);		# s3^=LeftRotate(s2&key[0],1);
452238384Sjkim	&jmp	(&label("loop"));
453238384Sjkim
454238384Sjkim&set_label("done",8);
455238384Sjkim	&mov	(@T[2],@T[0]);		# SwapHalf
456238384Sjkim	&mov	(@T[3],@T[1]);
457238384Sjkim	&mov	(@T[0],$__s2);
458238384Sjkim	&mov	(@T[1],$__s3);
459238384Sjkim	&xor	(@T[2],$idx);		# $idx is preloaded with key[2]
460238384Sjkim	&xor	(@T[3],&DWP(12,$key));
461238384Sjkim	&xor	(@T[0],&DWP(0,$key));
462238384Sjkim	&xor	(@T[1],&DWP(4,$key));
463238384Sjkim	&ret	();
464238384Sjkim&function_end_B("_x86_Camellia_decrypt");
465238384Sjkim
466238384Sjkim# shld is very slow on Intel P4 family. Even on AMD it limits
467238384Sjkim# instruction decode rate [because it's VectorPath] and consequently
468238384Sjkim# performance. PIII, PM and Core[2] seem to be the only ones which
469238384Sjkim# execute this code ~7% faster...
# shld-based 128-bit left rotate (alternative to _rotl128 below).
#
# Arguments: the four registers holding the 128-bit value, most
# significant word first; the rotate count in bits (0 emits no rotate
# instructions); the key-schedule slot number (in 8-byte units); and
# the list of registers whose rotated contents are to be stored,
# consecutively, at -128+8*slot relative to $key. $idx is clobbered
# whenever the rotate count is non-zero.
sub __rotl128 {
  my ($w0,$w1,$w2,$w3,$bits,$slot,@store)=@_;

    $slot *= 2;		# 8-byte slot number -> 4-byte word number
    if ($bits) {
	&mov	($idx,$w0);	# keep the original top word for wrap-around
	foreach my $pair ([$w0,$w1],[$w1,$w2],[$w2,$w3],[$w3,$idx]) {
	    my ($dst,$src)=@$pair;
	    &shld	($dst,$src,$bits);
	}
    }
    # store words in register order, but only those that come next in
    # the caller's store list
    foreach my $word ($w0,$w1,$w2,$w3) {
	next if ($word ne $store[0]);
	&mov	(&DWP(-128+4*$slot++,$key),shift(@store));
    }
}
486238384Sjkim
487238384Sjkim# ... Implementing 128-bit rotate without shld gives >3x performance
488238384Sjkim# improvement on P4, only ~7% degradation on other Intel CPUs and
489238384Sjkim# not worse performance on AMD. This is therefore preferred.
# 128-bit left rotate built from shl/shr/or, optionally storing the
# result into the key schedule.
#
# Arguments: the four registers holding the 128-bit value, most
# significant word first; the rotate count in bits (0 means store
# only); the key-schedule slot number (in 8-byte units); and the list
# of registers whose rotated contents are to be stored, consecutively,
# at -128+8*slot relative to $key. $idx and $Tbl are clobbered whenever
# the rotate count is non-zero.
sub _rotl128 {
  my ($w0,$w1,$w2,$w3,$bits,$slot,@store)=@_;

  # emit a store of $word iff it is the next register in the store list
  my $flush = sub {
	my ($word)=@_;
	&mov	(&DWP(-128+4*$slot++,$key),shift(@store))	if ($word eq $store[0]);
  };

    $slot *= 2;		# 8-byte slot number -> 4-byte word number
    if (!$bits) {
	foreach my $word ($w0,$w1,$w2,$w3) { $flush->($word); }
    } else {
	my $carry=32-$bits;	# bits shifted in from the lower neighbour

	&mov	($Tbl,$w0);	# top word wraps around into $w3
	&shl	($w0,$bits);
	&mov	($idx,$w1);
	&shr	($idx,$carry);
	&shl	($w1,$bits);
	&or	($w0,$idx);
	&mov	($idx,$w2);
	&shl	($w2,$bits);
	$flush->($w0);
	&shr	($idx,$carry);
	&or	($w1,$idx);
	&shr	($Tbl,$carry);
	&mov	($idx,$w3);
	&shr	($idx,$carry);
	$flush->($w1);
	&shl	($w3,$bits);
	&or	($w2,$idx);
	&or	($w3,$Tbl);
	$flush->($w2);
	$flush->($w3);
    }
}
522238384Sjkim
# Emit stores of up to four registers into key-schedule slot $round
# (8 bytes per slot) relative to base register $base. If the first
# element after $base evaluates to a non-zero integer it is taken as an
# additional byte displacement rather than as a register name.
sub _saveround {
my ($round,$base,@regs)=@_;
my $disp=int($regs[0])?shift(@regs):0;
my $last=$#regs>0?$#regs:0;	# the first word is always stored
$last=3 if ($last>3);		# at most four words per call
$disp+=$round*8;

	foreach my $n (0..$last) {
	    &mov	(&DWP($disp+4*$n,$base),$regs[$n]);
	}
}
532238384Sjkim
# Emit loads of up to four registers from key-schedule slot $round
# (8 bytes per slot) relative to base register $base. If the first
# element after $base evaluates to a non-zero integer it is taken as an
# additional byte displacement rather than as a register name.
sub _loadround {
my ($round,$base,@regs)=@_;
my $disp=int($regs[0])?shift(@regs):0;
my $last=$#regs>0?$#regs:0;	# the first word is always loaded
$last=3 if ($last>3);		# at most four words per call
$disp+=$round*8;

	foreach my $n (0..$last) {
	    &mov	($regs[$n],&DWP($disp+4*$n,$base));
	}
}
542238384Sjkim
543238384Sjkim# void Camellia_Ekeygen(
544238384Sjkim#		const int keyBitLength,
545238384Sjkim#		const Byte *rawKey,
546238384Sjkim#		KEY_TABLE_TYPE keyTable)
#
# Key schedule generator. Besides filling keyTable, the generated
# routine leaves the number of grand rounds (3 for 128-bit keys, 4
# otherwise) in eax and the address keyTable+272 in edx.
#
# Generation-time notes:
#   - $step counts the Feistel rounds emitted so far and indexes the
#     Camellia_SIGMA constants (8 bytes per step).
#   - "push(@T,shift(@T))" rotates the Perl-side register list, which
#     renames the registers instead of emitting code for a 32-bit
#     rotation; the "restore order" loops undo the renaming.
547238384Sjkim&function_begin("Camellia_Ekeygen");
548238384Sjkim{ my $step=0;
549238384Sjkim
550238384Sjkim	&stack_push(4);				# place for s[0-3]
551238384Sjkim
552238384Sjkim	&mov	($Tbl,&wparam(0));		# load arguments
553238384Sjkim	&mov	($idx,&wparam(1));
554238384Sjkim	&mov	($key,&wparam(2));
555238384Sjkim
556238384Sjkim	&mov	(@T[0],&DWP(0,$idx));		# load 0-127 bits
557238384Sjkim	&mov	(@T[1],&DWP(4,$idx));
558238384Sjkim	&mov	(@T[2],&DWP(8,$idx));
559238384Sjkim	&mov	(@T[3],&DWP(12,$idx));
560238384Sjkim
561238384Sjkim	&bswap	(@T[0]);
562238384Sjkim	&bswap	(@T[1]);
563238384Sjkim	&bswap	(@T[2]);
564238384Sjkim	&bswap	(@T[3]);
565238384Sjkim
566238384Sjkim	&_saveround	(0,$key,@T);		# KL<<<0
567238384Sjkim
	# $Tbl temporarily holds keyBitLength here
568238384Sjkim	&cmp	($Tbl,128);
569238384Sjkim	&je	(&label("1st128"));
570238384Sjkim
571238384Sjkim	&mov	(@T[0],&DWP(16,$idx));		# load 128-191 bits
572238384Sjkim	&mov	(@T[1],&DWP(20,$idx));
573238384Sjkim	&cmp	($Tbl,192);
574238384Sjkim	&je	(&label("1st192"));
575238384Sjkim	&mov	(@T[2],&DWP(24,$idx));		# load 192-255 bits
576238384Sjkim	&mov	(@T[3],&DWP(28,$idx));
577238384Sjkim	&jmp	(&label("1st256"));
578238384Sjkim&set_label("1st192",4);
	# 192-bit keys: the missing 64 bits are the complement of
	# bits 128-191
579238384Sjkim	&mov	(@T[2],@T[0]);
580238384Sjkim	&mov	(@T[3],@T[1]);
581238384Sjkim	&not	(@T[2]);
582238384Sjkim	&not	(@T[3]);
583238384Sjkim&set_label("1st256",4);
584238384Sjkim	&bswap	(@T[0]);
585238384Sjkim	&bswap	(@T[1]);
586238384Sjkim	&bswap	(@T[2]);
587238384Sjkim	&bswap	(@T[3]);
588238384Sjkim
589238384Sjkim	&_saveround	(4,$key,@T);		# temporary storage for KR!
590238384Sjkim
591238384Sjkim	&xor	(@T[0],&DWP(0*8+0,$key));	# KR^KL
592238384Sjkim	&xor	(@T[1],&DWP(0*8+4,$key));
593238384Sjkim	&xor	(@T[2],&DWP(1*8+0,$key));
594238384Sjkim	&xor	(@T[3],&DWP(1*8+4,$key));
595238384Sjkim
596238384Sjkim&set_label("1st128",4);
	# from here on $Tbl points at Camellia_SBOX and $key at
	# Camellia_SIGMA; keyTable is re-read from the argument slot
	# when needed
597238384Sjkim	&call	(&label("pic_point"));
598238384Sjkim	&set_label("pic_point");
599238384Sjkim	&blindpop($Tbl);
600238384Sjkim	&lea	($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));
601238384Sjkim	&lea	($key,&DWP(&label("Camellia_SIGMA")."-".&label("Camellia_SBOX"),$Tbl));
602238384Sjkim
603238384Sjkim	&mov	($idx,&DWP($step*8,$key));	# prefetch SIGMA[0]
604238384Sjkim	&mov	(&swtmp(0),@T[0]);		# save s[0-3]
605238384Sjkim	&mov	(&swtmp(1),@T[1]);
606238384Sjkim	&mov	(&swtmp(2),@T[2]);
607238384Sjkim	&mov	(&swtmp(3),@T[3]);
608238384Sjkim	&Camellia_Feistel($step++);
609238384Sjkim	&Camellia_Feistel($step++);
	# after an odd-numbered round @T[0-1] are live in registers,
	# the other half is reloaded from its backing store
610238384Sjkim	&mov	(@T[2],&swtmp(2));
611238384Sjkim	&mov	(@T[3],&swtmp(3));
612238384Sjkim
613238384Sjkim	&mov	($idx,&wparam(2));
614238384Sjkim	&xor	(@T[0],&DWP(0*8+0,$idx));	# ^KL
615238384Sjkim	&xor	(@T[1],&DWP(0*8+4,$idx));
616238384Sjkim	&xor	(@T[2],&DWP(1*8+0,$idx));
617238384Sjkim	&xor	(@T[3],&DWP(1*8+4,$idx));
618238384Sjkim
619238384Sjkim	&mov	($idx,&DWP($step*8,$key));	# prefetch SIGMA[4]
620238384Sjkim	&mov	(&swtmp(0),@T[0]);		# save s[0-3]
621238384Sjkim	&mov	(&swtmp(1),@T[1]);
622238384Sjkim	&mov	(&swtmp(2),@T[2]);
623238384Sjkim	&mov	(&swtmp(3),@T[3]);
624238384Sjkim	&Camellia_Feistel($step++);
625238384Sjkim	&Camellia_Feistel($step++);
626238384Sjkim	&mov	(@T[2],&swtmp(2));
627238384Sjkim	&mov	(@T[3],&swtmp(3));
628238384Sjkim
629238384Sjkim	&mov	($idx,&wparam(0));
630238384Sjkim	&cmp	($idx,128);
631238384Sjkim	&jne	(&label("2nd256"));
632238384Sjkim
	# $key = keyTable+128, which is why the helpers below are given
	# a -128 bias or apply one themselves
633238384Sjkim	&mov	($key,&wparam(2));
634238384Sjkim	&lea	($key,&DWP(128,$key));		# size optimization
635238384Sjkim
636238384Sjkim	####### process KA
637238384Sjkim	&_saveround	(2,$key,-128,@T);	# KA<<<0
638238384Sjkim	&_rotl128	(@T,15,6,@T);		# KA<<<15
639238384Sjkim	&_rotl128	(@T,15,8,@T);		# KA<<<(15+15=30)
640238384Sjkim	&_rotl128	(@T,15,12,@T[0],@T[1]);	# KA<<<(30+15=45)
641238384Sjkim	&_rotl128	(@T,15,14,@T);		# KA<<<(45+15=60)
642238384Sjkim	push		(@T,shift(@T));		# rotl128(@T,32);
643238384Sjkim	&_rotl128	(@T,2,20,@T);		# KA<<<(60+32+2=94)
644238384Sjkim	&_rotl128	(@T,17,24,@T);		# KA<<<(94+17=111)
645238384Sjkim
646238384Sjkim	####### process KL
647238384Sjkim	&_loadround	(0,$key,-128,@T);	# load KL
648238384Sjkim	&_rotl128	(@T,15,4,@T);		# KL<<<15
649238384Sjkim	&_rotl128	(@T,30,10,@T);		# KL<<<(15+30=45)
650238384Sjkim	&_rotl128	(@T,15,13,@T[2],@T[3]);	# KL<<<(45+15=60)
651238384Sjkim	&_rotl128	(@T,17,16,@T);		# KL<<<(60+17=77)
652238384Sjkim	&_rotl128	(@T,17,18,@T);		# KL<<<(77+17=94)
653238384Sjkim	&_rotl128	(@T,17,22,@T);		# KL<<<(94+17=111)
654238384Sjkim
	# generation-time only: no instructions are emitted by this loop
655238384Sjkim	while (@T[0] ne "eax")			# restore order
656238384Sjkim	{   unshift	(@T,pop(@T));   }
657238384Sjkim
658238384Sjkim	&mov	("eax",3);			# 3 grandRounds
659238384Sjkim	&jmp	(&label("done"));
660238384Sjkim
661238384Sjkim&set_label("2nd256",16);
662238384Sjkim	&mov	($idx,&wparam(2));
663238384Sjkim	&_saveround	(6,$idx,@T);		# temporary storage for KA!
664238384Sjkim
665238384Sjkim	&xor	(@T[0],&DWP(4*8+0,$idx));	# KA^KR
666238384Sjkim	&xor	(@T[1],&DWP(4*8+4,$idx));
667238384Sjkim	&xor	(@T[2],&DWP(5*8+0,$idx));
668238384Sjkim	&xor	(@T[3],&DWP(5*8+4,$idx));
669238384Sjkim
670238384Sjkim	&mov	($idx,&DWP($step*8,$key));	# prefetch SIGMA[8]
671238384Sjkim	&mov	(&swtmp(0),@T[0]);		# save s[0-3]
672238384Sjkim	&mov	(&swtmp(1),@T[1]);
673238384Sjkim	&mov	(&swtmp(2),@T[2]);
674238384Sjkim	&mov	(&swtmp(3),@T[3]);
675238384Sjkim	&Camellia_Feistel($step++);
676238384Sjkim	&Camellia_Feistel($step++);
677238384Sjkim	&mov	(@T[2],&swtmp(2));
678238384Sjkim	&mov	(@T[3],&swtmp(3));
679238384Sjkim
	# $key = keyTable+128, as in the 128-bit path
680238384Sjkim	&mov	($key,&wparam(2));
681238384Sjkim	&lea	($key,&DWP(128,$key));		# size optimization
682238384Sjkim
683238384Sjkim	####### process KB
684238384Sjkim	&_saveround	(2,$key,-128,@T);	# KB<<<0
685238384Sjkim	&_rotl128	(@T,30,10,@T);		# KB<<<30
686238384Sjkim	&_rotl128	(@T,30,20,@T);		# KB<<<(30+30=60)
687238384Sjkim	push		(@T,shift(@T));		# rotl128(@T,32);
688238384Sjkim	&_rotl128	(@T,19,32,@T);		# KB<<<(60+32+19=111)
689238384Sjkim
690238384Sjkim	####### process KR
691238384Sjkim	&_loadround	(4,$key,-128,@T);	# load KR
692238384Sjkim	&_rotl128	(@T,15,4,@T);		# KR<<<15
693238384Sjkim	&_rotl128	(@T,15,8,@T);		# KR<<<(15+15=30)
694238384Sjkim	&_rotl128	(@T,30,18,@T);		# KR<<<(30+30=60)
695238384Sjkim	push		(@T,shift(@T));		# rotl128(@T,32);
696238384Sjkim	&_rotl128	(@T,2,26,@T);		# KR<<<(60+32+2=94)
697238384Sjkim
698238384Sjkim	####### process KA
699238384Sjkim	&_loadround	(6,$key,-128,@T);	# load KA
700238384Sjkim	&_rotl128	(@T,15,6,@T);		# KA<<<15
701238384Sjkim	&_rotl128	(@T,30,14,@T);		# KA<<<(15+30=45)
702238384Sjkim	push		(@T,shift(@T));		# rotl128(@T,32);
703238384Sjkim	&_rotl128	(@T,0,24,@T);		# KA<<<(45+32+0=77)
704238384Sjkim	&_rotl128	(@T,17,28,@T);		# KA<<<(77+17=94)
705238384Sjkim
706238384Sjkim	####### process KL
707238384Sjkim	&_loadround	(0,$key,-128,@T);	# load KL
708238384Sjkim	push		(@T,shift(@T));		# rotl128(@T,32);
709238384Sjkim	&_rotl128	(@T,13,12,@T);		# KL<<<(32+13=45)
710238384Sjkim	&_rotl128	(@T,15,16,@T);		# KL<<<(45+15=60)
711238384Sjkim	&_rotl128	(@T,17,22,@T);		# KL<<<(60+17=77)
712238384Sjkim	push		(@T,shift(@T));		# rotl128(@T,32);
713238384Sjkim	&_rotl128	(@T,2,30,@T);		# KL<<<(77+32+2=111)
714238384Sjkim
	# generation-time only: no instructions are emitted by this loop
715238384Sjkim	while (@T[0] ne "eax")			# restore order
716238384Sjkim	{   unshift	(@T,pop(@T));   }
717238384Sjkim
718238384Sjkim	&mov	("eax",4);			# 4 grandRounds
719238384Sjkim&set_label("done");
	# $key is keyTable+128 on both paths, so edx = keyTable+272
720238384Sjkim	&lea	("edx",&DWP(272-128,$key));	# end of key schedule
721238384Sjkim	&stack_pop(4);
722238384Sjkim}
723238384Sjkim&function_end("Camellia_Ekeygen");
724238384Sjkim
725238384Sjkimif ($OPENSSL) {
726238384Sjkim# int private_Camellia_set_key (
727238384Sjkim#		const unsigned char *userKey,
728238384Sjkim#		int bits,
729238384Sjkim#		CAMELLIA_KEY *key)
#
# Return value of the generated routine (in eax):
#   -1  userKey or key is NULL
#   -2  bits is not 128, 192 or 256
#    0  success; Camellia_Ekeygen has filled the key schedule and the
#       grand-round count has been stored at the address it left in edx
730238384Sjkim&function_begin_B("private_Camellia_set_key");
731238384Sjkim	&push	("ebx");
732238384Sjkim	&mov	("ecx",&wparam(0));	# pull arguments
733238384Sjkim	&mov	("ebx",&wparam(1));
734238384Sjkim	&mov	("edx",&wparam(2));
735238384Sjkim
	# eax is preloaded with the error code of the checks that follow
736238384Sjkim	&mov	("eax",-1);
737238384Sjkim	&test	("ecx","ecx");
738238384Sjkim	&jz	(&label("done"));	# userKey==NULL?
739238384Sjkim	&test	("edx","edx");
740238384Sjkim	&jz	(&label("done"));	# key==NULL?
741238384Sjkim
742238384Sjkim	&mov	("eax",-2);
743238384Sjkim	&cmp	("ebx",256);
744238384Sjkim	&je	(&label("arg_ok"));	# bits==256?
745238384Sjkim	&cmp	("ebx",192);
746238384Sjkim	&je	(&label("arg_ok"));	# bits==192?
747238384Sjkim	&cmp	("ebx",128);
748238384Sjkim	&jne	(&label("done"));	# bits!=128?
749238384Sjkim&set_label("arg_ok",4);
750238384Sjkim
	# pushed right to left: Camellia_Ekeygen(bits, userKey, key)
751238384Sjkim	&push	("edx");		# push arguments
752238384Sjkim	&push	("ecx");
753238384Sjkim	&push	("ebx");
754238384Sjkim	&call	("Camellia_Ekeygen");
	# presumably discards the three argument words pushed above --
	# confirm against x86asm.pl's stack_pop
755238384Sjkim	&stack_pop(3);
756238384Sjkim
757238384Sjkim	# eax holds grandRounds and edx points at where to put it
758238384Sjkim	&mov	(&DWP(0,"edx"),"eax");
759238384Sjkim	&xor	("eax","eax");
760238384Sjkim&set_label("done",4);
761238384Sjkim	&pop	("ebx");
762238384Sjkim	&ret	();
763238384Sjkim&function_end_B("private_Camellia_set_key");
764238384Sjkim}
765238384Sjkim
# Camellia SBOX1 as 256 byte values, indexed by the input byte.  It is the
# only s-box stored here; the helpers below derive the other three from it
# with one-bit rotations.
@SBOX=(
112,130, 44,236,179, 39,192,229,228,133, 87, 53,234, 12,174, 65,
 35,239,107,147, 69, 25,165, 33,237, 14, 79, 78, 29,101,146,189,
134,184,175,143,124,235, 31,206, 62, 48,220, 95, 94,197, 11, 26,
166,225, 57,202,213, 71, 93, 61,217,  1, 90,214, 81, 86,108, 77,
139, 13,154,102,251,204,176, 45,116, 18, 43, 32,240,177,132,153,
223, 76,203,194, 52,126,118,  5,109,183,169, 49,209, 23,  4,215,
 20, 88, 58, 97,222, 27, 17, 28, 50, 15,156, 22, 83, 24,242, 34,
254, 68,207,178,195,181,122,145, 36,  8,232,168, 96,252,105, 80,
170,208,160,125,161,137, 98,151, 84, 91, 30,149,224,255,100,210,
 16,196,  0, 72,163,247,117,219,138,  3,230,218,  9, 63,221,148,
135, 92,131,  2,205, 74,144, 51,115,103,246,243,157,127,191,226,
 82,155,216, 38,200, 55,198, 59,129,150,111, 75, 19,190, 99, 46,
233,121,167,140,159,110,188,142, 41,245,249,182, 47,253,180, 89,
120,152,  6,106,231, 70,113,186,212, 37,171, 66,136,162,141,250,
114,  7,185, 85,248,238,172, 10, 54, 73, 42,104, 60, 56,241,164,
 64, 40,211,123,187,201, 67,193, 21,227,173,244,119,199,128,158);

# Each helper takes an input byte (0..255) and returns one 32-bit table
# word.  Reading the four digits of the name from the most significant
# byte down tells which s-box fills that byte lane (0 = lane left zero):
#
#	S1110	SBOX1(x)			in lanes 3, 2 and 1
#	S4404	SBOX4(x) = SBOX1(x <<< 1)	in lanes 3, 2 and 0
#	S0222	SBOX2(x) = SBOX1(x) <<< 1	in lanes 2, 1 and 0
#	S3033	SBOX3(x) = SBOX1(x) >>> 1	in lanes 3, 1 and 0
#
# <<< and >>> are rotations within 8 bits, hence the &0xff after each one.
# Table elements are read as the scalar $SBOX[...] (not the one-element
# slice @SBOX[...]), which keeps the code clean under warnings.
sub S1110 { my $i=shift; $i=$SBOX[$i]; return ($i<<24)|($i<<16)|($i<<8); }
sub S4404 { my $i=shift; $i=($i<<1|$i>>7)&0xff; $i=$SBOX[$i]; return ($i<<24)|($i<<16)|$i; }
sub S0222 { my $i=shift; $i=$SBOX[$i]; $i=($i<<1|$i>>7)&0xff; return ($i<<16)|($i<<8)|$i; }
sub S3033 { my $i=shift; $i=$SBOX[$i]; $i=($i>>1|$i<<7)&0xff; return ($i<<24)|($i<<8)|$i; }
788238384Sjkim
# Constant data emitted into the generated assembly.
#
# Camellia_SIGMA: twelve 32-bit words followed by four zero words, all
# emitted by one data_word call, under a label created with 64 as its
# second argument.
my @sigma_words = (
    0xa09e667f, 0x3bcc908b,
    0xb67ae858, 0x4caa73b2,
    0xc6ef372f, 0xe94f82be,
    0x54ff53a5, 0xf1d36f1c,
    0x10e527fa, 0xde682d1d,
    0xb05688c2, 0xb3e6c1fd,
);
&set_label("Camellia_SIGMA",64);
&data_word(@sigma_words, (0) x 4);

# Camellia_SBOX: two tables of 256 entries each.  Every entry is a pair of
# 32-bit words, so the tables are interleaved: S1110/S4404 in the first,
# S0222/S3033 in the second (2 * 256 * 8 = 4096 bytes in total).
&set_label("Camellia_SBOX",64);
foreach my $pair ([\&S1110,\&S4404], [\&S0222,\&S3033]) {
    my ($even_word,$odd_word) = @$pair;
    # $i is deliberately not a "my" variable: it is the same global the
    # plain C-style loops use and is left at 256 afterwards.
    for ($i=0; $i<256; $i++) {
	&data_word($even_word->($i), $odd_word->($i));
    }
}
799238384Sjkim
800238384Sjkim# void Camellia_cbc_encrypt (const unsigned char *inp, unsigned char *out,
801238384Sjkim#			size_t length, const CAMELLIA_KEY *key,
802238384Sjkim#			unsigned char *ivp,const int enc);
#
# CBC driver around the block routines _x86_Camellia_encrypt and
# _x86_Camellia_decrypt (both defined outside this section).  Behaviour as
# implemented below:
#   - length==0 returns immediately, nothing is touched;
#   - enc!=0 encrypts, enc==0 decrypts;
#   - when encrypting, a trailing partial block is zero-padded to 16 bytes
#     and a full ciphertext block is written;
#   - when decrypting, a trailing partial block yields only the remaining
#     bytes of output;
#   - in every non-empty case the 16 bytes at ivp are rewritten with the
#     most recent ciphertext block, so a later call can continue the chain.
#
# Register names are symbolic.  The code itself pins down $s0==eax,
# $s2==ecx, $idx==esi and $key==edi through the `$x eq "reg" ? $x : ""`
# operands: such an operand is the register name only if the symbolic
# register really is the one the following string instruction needs, and
# an empty string otherwise.
803238384Sjkim{
804238384Sjkim# stack frame layout
# (left column: offsets as used in this function; right column: presumably
# the same slots as seen inside the called block routine, i.e. after `call`
# has pushed the return address -- TODO confirm)
805238384Sjkim#             -4(%esp)		# return address	 0(%esp)
806238384Sjkim#              0(%esp)		# s0			 4(%esp)
807238384Sjkim#              4(%esp)		# s1			 8(%esp)
808238384Sjkim#              8(%esp)		# s2			12(%esp)
809238384Sjkim#             12(%esp)		# s3			16(%esp)
810238384Sjkim#             16(%esp)		# end of key schedule	20(%esp)
811238384Sjkim#             20(%esp)		# %esp backup
# $_end and $_esp, used below, are not declared in this block; they come
# from earlier in the file (presumably the 16(%esp) and 20(%esp) slots of
# the table above -- verify against their definition).
812238384Sjkimmy $_inp=&DWP(24,"esp");	#copy of wparam(0)
813238384Sjkimmy $_out=&DWP(28,"esp");	#copy of wparam(1)
814238384Sjkimmy $_len=&DWP(32,"esp");	#copy of wparam(2)
815238384Sjkimmy $_key=&DWP(36,"esp");	#copy of wparam(3)
816238384Sjkimmy $_ivp=&DWP(40,"esp");	#copy of wparam(4)
817238384Sjkimmy $ivec=&DWP(44,"esp");	#ivec[16]
818238384Sjkimmy $_tmp=&DWP(44,"esp");	#volatile variable [yes, aliases with ivec]
819238384Sjkimmy ($s0,$s1,$s2,$s3) = @T;
820238384Sjkim
821238384Sjkim&function_begin("Camellia_cbc_encrypt");
822238384Sjkim	&mov	($s2 eq "ecx"? $s2 : "",&wparam(2));	# load len
823238384Sjkim	&cmp	($s2,0);
824238384Sjkim	&je	(&label("enc_out"));	# len==0: leave before flags are pushed
825238384Sjkim
826238384Sjkim	&pushf	();			# flags are restored by popf on exit
827238384Sjkim	&cld	();			# string instructions below count upwards
828238384Sjkim
829238384Sjkim	&mov	($s0,&wparam(0));	# load inp
830238384Sjkim	&mov	($s1,&wparam(1));	# load out
831238384Sjkim	#&mov	($s2,&wparam(2));	# load len
832238384Sjkim	&mov	($s3,&wparam(3));	# load key
833238384Sjkim	&mov	($Tbl,&wparam(4));	# load ivp
834238384Sjkim
835238384Sjkim	# allocate aligned stack frame...
836238384Sjkim	&lea	($idx,&DWP(-64,"esp"));
837238384Sjkim	&and	($idx,-64);
838238384Sjkim
839238384Sjkim	# place stack frame just "above mod 1024" the key schedule
840238384Sjkim	# this ensures that cache associativity of 2 suffices
841238384Sjkim	&lea	($key,&DWP(-64-63,$s3));
842238384Sjkim	&sub	($key,$idx);
843238384Sjkim	&neg	($key);
844238384Sjkim	&and	($key,0x3C0);	# modulo 1024, but aligned to cache-line
845238384Sjkim	&sub	($idx,$key);
846238384Sjkim
847238384Sjkim	&mov	($key,&wparam(5));	# load enc
848238384Sjkim
	# switch to the new frame; after the exchange $idx holds the old %esp
849238384Sjkim	&exch	("esp",$idx);
850238384Sjkim	&add	("esp",4);		# reserve for return address!
851238384Sjkim	&mov	($_esp,$idx);		# save %esp
852238384Sjkim
	# arguments are no longer reachable through wparam(), keep copies
853238384Sjkim	&mov	($_inp,$s0);		# save copy of inp
854238384Sjkim	&mov	($_out,$s1);		# save copy of out
855238384Sjkim	&mov	($_len,$s2);		# save copy of len
856238384Sjkim	&mov	($_key,$s3);		# save copy of key
857238384Sjkim	&mov	($_ivp,$Tbl);		# save copy of ivp
858238384Sjkim
	# $Tbl = address of Camellia_SBOX, computed relative to pic_point
859238384Sjkim	&call   (&label("pic_point"));	# make it PIC!
860238384Sjkim	&set_label("pic_point");
861238384Sjkim	&blindpop($Tbl);
862238384Sjkim	&lea    ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));
863238384Sjkim
	# touch the table: 32 iterations of 128 bytes (4096 bytes), one load
	# every 32 bytes; the loaded values themselves are not used
864238384Sjkim	&mov	($idx,32);
865238384Sjkim	&set_label("prefetch_sbox",4);
866238384Sjkim		&mov	($s0,&DWP(0,$Tbl));
867238384Sjkim		&mov	($s1,&DWP(32,$Tbl));
868238384Sjkim		&mov	($s2,&DWP(64,$Tbl));
869238384Sjkim		&mov	($s3,&DWP(96,$Tbl));
870238384Sjkim		&lea	($Tbl,&DWP(128,$Tbl));
871238384Sjkim		&dec	($idx);
872238384Sjkim	&jnz	(&label("prefetch_sbox"));
873238384Sjkim	&mov	($s0,$_key);
874238384Sjkim	&sub	($Tbl,4096);		# rewind $Tbl to the start of the table
875238384Sjkim	&mov	($idx,$_inp);
876238384Sjkim	&mov	($s3,&DWP(272,$s0));		# load grandRounds
877238384Sjkim
878238384Sjkim	&cmp	($key,0);		# $key still holds enc
879238384Sjkim	&je	(&label("DECRYPT"));
880238384Sjkim
#----------------------------- ENCRYPT -----------------------------#
881238384Sjkim	&mov	($s2,$_len);
882238384Sjkim	&mov	($key,$_ivp);
	# $_end = key + grandRounds*64
883238384Sjkim	&shl	($s3,6);
884238384Sjkim	&lea	($s3,&DWP(0,$s0,$s3));
885238384Sjkim	&mov	($_end,$s3);
886238384Sjkim
887238384Sjkim	&test	($s2,0xFFFFFFF0);
888238384Sjkim	&jz	(&label("enc_tail"));		# short input...
889238384Sjkim
890238384Sjkim	&mov	($s0,&DWP(0,$key));		# load iv
891238384Sjkim	&mov	($s1,&DWP(4,$key));
892238384Sjkim
	# Loop invariant: $s0/$s1 hold the first two dwords of the chaining
	# value and $key points at the 16-byte block whose last two dwords
	# complete it (ivp at first, afterwards the block just written).
893238384Sjkim	&set_label("enc_loop",4);
894238384Sjkim		&mov	($s2,&DWP(8,$key));
895238384Sjkim		&mov	($s3,&DWP(12,$key));
896238384Sjkim
897238384Sjkim		&xor	($s0,&DWP(0,$idx));	# xor input data
898238384Sjkim		&xor	($s1,&DWP(4,$idx));
899238384Sjkim		&xor	($s2,&DWP(8,$idx));
900238384Sjkim		&bswap	($s0);
901238384Sjkim		&xor	($s3,&DWP(12,$idx));
902238384Sjkim		&bswap	($s1);
903238384Sjkim		&mov	($key,$_key);		# load key
904238384Sjkim		&bswap	($s2);
905238384Sjkim		&bswap	($s3);
906238384Sjkim
907238384Sjkim		&call	("_x86_Camellia_encrypt");
908238384Sjkim
909238384Sjkim		&mov	($idx,$_inp);		# load inp
910238384Sjkim		&mov	($key,$_out);		# load out
911238384Sjkim
912238384Sjkim		&bswap	($s0);
913238384Sjkim		&bswap	($s1);
914238384Sjkim		&bswap	($s2);
915238384Sjkim		&mov	(&DWP(0,$key),$s0);	# save output data
916238384Sjkim		&bswap	($s3);
917238384Sjkim		&mov	(&DWP(4,$key),$s1);
918238384Sjkim		&mov	(&DWP(8,$key),$s2);
919238384Sjkim		&mov	(&DWP(12,$key),$s3);
920238384Sjkim
921238384Sjkim		&mov	($s2,$_len);		# load len
922238384Sjkim
923238384Sjkim		&lea	($idx,&DWP(16,$idx));
924238384Sjkim		&mov	($_inp,$idx);		# save inp
925238384Sjkim
		# $key itself is left pointing at the block just written
926238384Sjkim		&lea	($s3,&DWP(16,$key));
927238384Sjkim		&mov	($_out,$s3);		# save out
928238384Sjkim
929238384Sjkim		&sub	($s2,16);
930238384Sjkim		&test	($s2,0xFFFFFFF0);	# at least one more full block?
931238384Sjkim		&mov	($_len,$s2);		# save len
932238384Sjkim	&jnz	(&label("enc_loop"));	# decided by the test above
933238384Sjkim	&test	($s2,15);
934238384Sjkim	&jnz	(&label("enc_tail"));	# 1..15 bytes left over
935238384Sjkim	&mov	($idx,$_ivp);		# load ivp
936238384Sjkim	&mov	($s2,&DWP(8,$key));	# restore last dwords
937238384Sjkim	&mov	($s3,&DWP(12,$key));
938238384Sjkim	&mov	(&DWP(0,$idx),$s0);	# save ivec
939238384Sjkim	&mov	(&DWP(4,$idx),$s1);
940238384Sjkim	&mov	(&DWP(8,$idx),$s2);
941238384Sjkim	&mov	(&DWP(12,$idx),$s3);
942238384Sjkim
943238384Sjkim	&mov	("esp",$_esp);
944238384Sjkim	&popf	();
945238384Sjkim    &set_label("enc_out");
	# function_end_A is called here although the function continues below.
	# NOTE(review): the pushf after it is never reached at run time;
	# presumably it only keeps perlasm's generation-time stack accounting
	# in step with the popf before function_end -- confirm in x86asm.pl.
946238384Sjkim	&function_end_A();
947238384Sjkim	&pushf	();			# kludge, never executed
948238384Sjkim
	# Encrypt a final block of fewer than 16 bytes.  On entry $s2 is the
	# number of bytes left, $idx the input pointer and $key the pointer
	# to the current chaining block ("ivp").  The bytes are copied to out
	# (unless already there), zero-padded to 16 and run through enc_loop
	# once more with out as the input.
949238384Sjkim    &set_label("enc_tail",4);
950238384Sjkim	&mov	($s0,$key eq "edi" ? $key : "");
951238384Sjkim	&mov	($key,$_out);			# load out
	# esp-relative slots are shifted until the matching pop below;
	# none of them is accessed in between
952238384Sjkim	&push	($s0);				# push ivp
953238384Sjkim	&mov	($s1,16);
954238384Sjkim	&sub	($s1,$s2);			# number of padding bytes
955238384Sjkim	&cmp	($key,$idx);			# compare with inp
956238384Sjkim	&je	(&label("enc_in_place"));
957238384Sjkim	&align	(4);
	# 0xA4F3F689 as a little-endian dword is the byte sequence
	# 89 F6 F3 A4: "mov %esi,%esi" (two filler bytes), then "rep movsb"
958238384Sjkim	&data_word(0xA4F3F689);	# rep movsb	# copy input
959238384Sjkim	&jmp	(&label("enc_skip_in_place"));
960238384Sjkim    &set_label("enc_in_place");
961238384Sjkim	&lea	($key,&DWP(0,$key,$s2));	# skip the bytes already in place
962238384Sjkim    &set_label("enc_skip_in_place");
963238384Sjkim	&mov	($s2,$s1);
964238384Sjkim	&xor	($s0,$s0);
965238384Sjkim	&align	(4);
	# 89 F6 F3 AA: "mov %esi,%esi" filler, then "rep stosb"
966238384Sjkim	&data_word(0xAAF3F689);	# rep stosb	# zero tail
967238384Sjkim	&pop	($key);				# pop ivp
968238384Sjkim
969238384Sjkim	&mov	($idx,$_out);			# output as input
970238384Sjkim	&mov	($s0,&DWP(0,$key));
971238384Sjkim	&mov	($s1,&DWP(4,$key));
972238384Sjkim	&mov	($_len,16);			# len=16
973238384Sjkim	&jmp	(&label("enc_loop"));		# one more spin...
974238384Sjkim
975238384Sjkim#----------------------------- DECRYPT -----------------------------#
# Entered with $s0 = key, $s3 = grandRounds, $idx = inp.  Compared with the
# encrypt path the two pointers are stored the other way round: $_key (the
# pointer handed to the block routine) becomes key+grandRounds*64 and $_end
# the start of the schedule.
976238384Sjkim&set_label("DECRYPT",16);
977238384Sjkim	&shl	($s3,6);
978238384Sjkim	&lea	($s3,&DWP(0,$s0,$s3));
979238384Sjkim	&mov	($_end,$s0);
980238384Sjkim	&mov	($_key,$s3);
981238384Sjkim
982238384Sjkim	&cmp	($idx,$_out);
983238384Sjkim	&je	(&label("dec_in_place"));	# in-place processing...
984238384Sjkim
	# Out-of-place: $_tmp holds a pointer to the current chaining block,
	# the user's iv first and then the previous ciphertext block in inp.
985238384Sjkim	&mov	($key,$_ivp);			# load ivp
986238384Sjkim	&mov	($_tmp,$key);
987238384Sjkim
988238384Sjkim	&set_label("dec_loop",4);
989238384Sjkim		&mov	($s0,&DWP(0,$idx));	# read input
990238384Sjkim		&mov	($s1,&DWP(4,$idx));
991238384Sjkim		&mov	($s2,&DWP(8,$idx));
992238384Sjkim		&bswap	($s0);
993238384Sjkim		&mov	($s3,&DWP(12,$idx));
994238384Sjkim		&bswap	($s1);
995238384Sjkim		&mov	($key,$_key);		# load key
996238384Sjkim		&bswap	($s2);
997238384Sjkim		&bswap	($s3);
998238384Sjkim
999238384Sjkim		&call	("_x86_Camellia_decrypt");
1000238384Sjkim
1001238384Sjkim		&mov	($key,$_tmp);		# load ivp
1002238384Sjkim		&mov	($idx,$_len);		# load len
1003238384Sjkim
1004238384Sjkim		&bswap	($s0);
1005238384Sjkim		&bswap	($s1);
1006238384Sjkim		&bswap	($s2);
1007238384Sjkim		&xor	($s0,&DWP(0,$key));	# xor iv
1008238384Sjkim		&bswap	($s3);
1009238384Sjkim		&xor	($s1,&DWP(4,$key));
1010238384Sjkim		&xor	($s2,&DWP(8,$key));
1011238384Sjkim		&xor	($s3,&DWP(12,$key));
1012238384Sjkim
		# the flags set by this sub drive both the jc right below and
		# the jnz at the bottom of the loop; only mov/lea come between
1013238384Sjkim		&sub	($idx,16);
1014238384Sjkim		&jc	(&label("dec_partial"));	# fewer than 16 bytes were left
1015238384Sjkim		&mov	($_len,$idx);		# save len
1016238384Sjkim		&mov	($idx,$_inp);		# load inp
1017238384Sjkim		&mov	($key,$_out);		# load out
1018238384Sjkim
1019238384Sjkim		&mov	(&DWP(0,$key),$s0);	# write output
1020238384Sjkim		&mov	(&DWP(4,$key),$s1);
1021238384Sjkim		&mov	(&DWP(8,$key),$s2);
1022238384Sjkim		&mov	(&DWP(12,$key),$s3);
1023238384Sjkim
		# the block just consumed is the chaining value for the next one
1024238384Sjkim		&mov	($_tmp,$idx);		# save ivp
1025238384Sjkim		&lea	($idx,&DWP(16,$idx));
1026238384Sjkim		&mov	($_inp,$idx);		# save inp
1027238384Sjkim
1028238384Sjkim		&lea	($key,&DWP(16,$key));
1029238384Sjkim		&mov	($_out,$key);		# save out
1030238384Sjkim
1031238384Sjkim	&jnz	(&label("dec_loop"));	# remaining len != 0
1032238384Sjkim	&mov	($key,$_tmp);		# load temp ivp
	# copy the 16 bytes $key points at into the caller's iv buffer
1033238384Sjkim    &set_label("dec_end");
1034238384Sjkim	&mov	($idx,$_ivp);		# load user ivp
1035238384Sjkim	&mov	($s0,&DWP(0,$key));	# load iv
1036238384Sjkim	&mov	($s1,&DWP(4,$key));
1037238384Sjkim	&mov	($s2,&DWP(8,$key));
1038238384Sjkim	&mov	($s3,&DWP(12,$key));
1039238384Sjkim	&mov	(&DWP(0,$idx),$s0);	# copy back to user
1040238384Sjkim	&mov	(&DWP(4,$idx),$s1);
1041238384Sjkim	&mov	(&DWP(8,$idx),$s2);
1042238384Sjkim	&mov	(&DWP(12,$idx),$s3);
1043238384Sjkim	&jmp	(&label("dec_out"));
1044238384Sjkim
	# Last block is short.  $idx holds len-16 (negative) and $s0..$s3 the
	# decrypted block.  The block is parked in the stack buffer, which
	# overwrites $_tmp (already consumed above), and only the remaining
	# len bytes are copied to out.
1045238384Sjkim    &set_label("dec_partial",4);
1046238384Sjkim	&lea	($key,$ivec);
1047238384Sjkim	&mov	(&DWP(0,$key),$s0);	# dump output to stack
1048238384Sjkim	&mov	(&DWP(4,$key),$s1);
1049238384Sjkim	&mov	(&DWP(8,$key),$s2);
1050238384Sjkim	&mov	(&DWP(12,$key),$s3);
1051238384Sjkim	&lea	($s2 eq "ecx" ? $s2 : "",&DWP(16,$idx));	# count = bytes left
1052238384Sjkim	&mov	($idx eq "esi" ? $idx : "",$key);		# source = stack buffer
1053238384Sjkim	&mov	($key eq "edi" ? $key : "",$_out);	# load out
1054238384Sjkim	&data_word(0xA4F3F689);	# rep movsb		# copy output
1055238384Sjkim	&mov	($key,$_inp);				# use inp as temp ivp
1056238384Sjkim	&jmp	(&label("dec_end"));
1057238384Sjkim
	# In-place decryption (inp == out).  Writing the output destroys the
	# ciphertext needed as the next chaining value, so each input block
	# is first copied to the stack buffer $ivec; the caller's iv buffer
	# itself serves as the running chaining value.
1058238384Sjkim    &set_label("dec_in_place",4);
1059238384Sjkim	&set_label("dec_in_place_loop");
1060238384Sjkim		&lea	($key,$ivec);
1061238384Sjkim		&mov	($s0,&DWP(0,$idx));	# read input
1062238384Sjkim		&mov	($s1,&DWP(4,$idx));
1063238384Sjkim		&mov	($s2,&DWP(8,$idx));
1064238384Sjkim		&mov	($s3,&DWP(12,$idx));
1065238384Sjkim
1066238384Sjkim		&mov	(&DWP(0,$key),$s0);	# copy to temp
1067238384Sjkim		&mov	(&DWP(4,$key),$s1);
1068238384Sjkim		&mov	(&DWP(8,$key),$s2);
1069238384Sjkim		&bswap	($s0);
1070238384Sjkim		&mov	(&DWP(12,$key),$s3);
1071238384Sjkim		&bswap	($s1);
1072238384Sjkim		&mov	($key,$_key);		# load key
1073238384Sjkim		&bswap	($s2);
1074238384Sjkim		&bswap	($s3);
1075238384Sjkim
1076238384Sjkim		&call	("_x86_Camellia_decrypt");
1077238384Sjkim
1078238384Sjkim		&mov	($key,$_ivp);		# load ivp
1079238384Sjkim		&mov	($idx,$_out);		# load out
1080238384Sjkim
1081238384Sjkim		&bswap	($s0);
1082238384Sjkim		&bswap	($s1);
1083238384Sjkim		&bswap	($s2);
1084238384Sjkim		&xor	($s0,&DWP(0,$key));	# xor iv
1085238384Sjkim		&bswap	($s3);
1086238384Sjkim		&xor	($s1,&DWP(4,$key));
1087238384Sjkim		&xor	($s2,&DWP(8,$key));
1088238384Sjkim		&xor	($s3,&DWP(12,$key));
1089238384Sjkim
		# a full 16 bytes are written even if fewer remain; the excess
		# is undone at dec_in_place_partial
1090238384Sjkim		&mov	(&DWP(0,$idx),$s0);	# write output
1091238384Sjkim		&mov	(&DWP(4,$idx),$s1);
1092238384Sjkim		&mov	(&DWP(8,$idx),$s2);
1093238384Sjkim		&mov	(&DWP(12,$idx),$s3);
1094238384Sjkim
1095238384Sjkim		&lea	($idx,&DWP(16,$idx));
1096238384Sjkim		&mov	($_out,$idx);		# save out
1097238384Sjkim
1098238384Sjkim		&lea	($idx,$ivec);
1099238384Sjkim		&mov	($s0,&DWP(0,$idx));	# read temp
1100238384Sjkim		&mov	($s1,&DWP(4,$idx));
1101238384Sjkim		&mov	($s2,&DWP(8,$idx));
1102238384Sjkim		&mov	($s3,&DWP(12,$idx));
1103238384Sjkim
1104238384Sjkim		&mov	(&DWP(0,$key),$s0);	# copy iv
1105238384Sjkim		&mov	(&DWP(4,$key),$s1);
1106238384Sjkim		&mov	(&DWP(8,$key),$s2);
1107238384Sjkim		&mov	(&DWP(12,$key),$s3);
1108238384Sjkim
1109238384Sjkim		&mov	($idx,$_inp);		# load inp
1110238384Sjkim
1111238384Sjkim		&lea	($idx,&DWP(16,$idx));
1112238384Sjkim		&mov	($_inp,$idx);		# save inp
1113238384Sjkim
1114238384Sjkim		&mov	($s2,$_len);		# load len
1115238384Sjkim		&sub	($s2,16);		# sets the flags for jc and jnz below
1116238384Sjkim		&jc	(&label("dec_in_place_partial"));
1117238384Sjkim		&mov	($_len,$s2);		# save len
1118238384Sjkim	&jnz	(&label("dec_in_place_loop"));
1119238384Sjkim	&jmp	(&label("dec_out"));
1120238384Sjkim
	# $s2 = len-16 (negative) and $_out was already advanced by 16.
	# Copy the last 16-len saved ciphertext bytes from the stack buffer
	# back over the bytes written past the end of the requested length.
1121238384Sjkim    &set_label("dec_in_place_partial",4);
1122238384Sjkim	# one can argue if this is actually required...
1123238384Sjkim	&mov	($key eq "edi" ? $key : "",$_out);
1124238384Sjkim	&lea	($idx eq "esi" ? $idx : "",$ivec);
1125238384Sjkim	&lea	($key,&DWP(0,$key,$s2));	# first byte past the valid output
1126238384Sjkim	&lea	($idx,&DWP(16,$idx,$s2));	# matching offset in the saved block
1127238384Sjkim	&neg	($s2 eq "ecx" ? $s2 : "");	# count = 16-len
1128238384Sjkim	&data_word(0xA4F3F689);	# rep movsb	# restore tail
1129238384Sjkim
	# common exit of all decrypt paths: back to the caller's stack, then
	# restore the flags pushed at the top
1130238384Sjkim    &set_label("dec_out",4);
1131238384Sjkim    &mov	("esp",$_esp);
1132238384Sjkim    &popf	();
1133238384Sjkim&function_end("Camellia_cbc_encrypt");
1134238384Sjkim}
1135238384Sjkim
# Emit the module's identification string.
1136238384Sjkim&asciz("Camellia for x86 by <appro\@openssl.org>");
1137238384Sjkim
# Last call in the script.  asm_finish is provided by the perlasm framework
# outside this file section; presumably it finalizes and outputs the
# generated assembly -- confirm in x86asm.pl.
1138238384Sjkim&asm_finish();
1139